-rw-r--r--  extras/mini-os/include/posix/net/if.h | 85
-rw-r--r--  extras/mini-os/lib/sys.c | 7
-rw-r--r--  tools/Makefile | 1
-rw-r--r--  tools/examples/Makefile | 58
-rw-r--r--  tools/firmware/hvmloader/config.h | 1
-rw-r--r--  tools/firmware/hvmloader/hvmloader.c | 192
-rw-r--r--  tools/firmware/rombios/rombios.c | 23
-rw-r--r--  tools/hotplug/Linux/Makefile | 97
-rw-r--r--  tools/hotplug/Linux/blktap (renamed from tools/examples/blktap) | 0
-rw-r--r--  tools/hotplug/Linux/block (renamed from tools/examples/block) | 0
-rw-r--r--  tools/hotplug/Linux/block-common.sh (renamed from tools/examples/block-common.sh) | 0
-rw-r--r-- [-rwxr-xr-x]  tools/hotplug/Linux/block-enbd (renamed from tools/examples/block-enbd) | 0
-rw-r--r--  tools/hotplug/Linux/block-nbd (renamed from tools/examples/block-nbd) | 0
-rw-r--r--  tools/hotplug/Linux/external-device-migrate (renamed from tools/examples/external-device-migrate) | 0
-rw-r--r--  tools/hotplug/Linux/init.d/sysconfig.xendomains (renamed from tools/examples/init.d/sysconfig.xendomains) | 0
-rwxr-xr-x  tools/hotplug/Linux/init.d/xend (renamed from tools/examples/init.d/xend) | 0
-rw-r--r--  tools/hotplug/Linux/init.d/xendomains (renamed from tools/examples/init.d/xendomains) | 0
-rw-r--r--  tools/hotplug/Linux/locking.sh (renamed from tools/examples/locking.sh) | 0
-rw-r--r--  tools/hotplug/Linux/logging.sh (renamed from tools/examples/logging.sh) | 0
-rw-r--r-- [-rwxr-xr-x]  tools/hotplug/Linux/network-bridge (renamed from tools/examples/network-bridge) | 0
-rw-r--r--  tools/hotplug/Linux/network-nat (renamed from tools/examples/network-nat) | 0
-rw-r--r-- [-rwxr-xr-x]  tools/hotplug/Linux/network-route (renamed from tools/examples/network-route) | 0
-rw-r--r-- [-rwxr-xr-x]  tools/hotplug/Linux/vif-bridge (renamed from tools/examples/vif-bridge) | 0
-rw-r--r--  tools/hotplug/Linux/vif-common.sh (renamed from tools/examples/vif-common.sh) | 0
-rw-r--r--  tools/hotplug/Linux/vif-nat (renamed from tools/examples/vif-nat) | 0
-rw-r--r-- [-rwxr-xr-x]  tools/hotplug/Linux/vif-route (renamed from tools/examples/vif-route) | 0
-rw-r--r--  tools/hotplug/Linux/vscsi (renamed from tools/examples/vscsi) | 0
-rw-r--r--  tools/hotplug/Linux/vtpm (renamed from tools/examples/vtpm) | 0
-rw-r--r--  tools/hotplug/Linux/vtpm-common.sh (renamed from tools/examples/vtpm-common.sh) | 0
-rw-r--r--  tools/hotplug/Linux/vtpm-delete (renamed from tools/examples/vtpm-delete) | 0
-rw-r--r--  tools/hotplug/Linux/vtpm-hotplug-common.sh (renamed from tools/examples/vtpm-hotplug-common.sh) | 0
-rw-r--r--  tools/hotplug/Linux/vtpm-impl (renamed from tools/examples/vtpm-impl) | 0
-rw-r--r--  tools/hotplug/Linux/vtpm-migration.sh (renamed from tools/examples/vtpm-migration.sh) | 0
-rw-r--r-- [-rwxr-xr-x]  tools/hotplug/Linux/xen-backend.agent (renamed from tools/examples/xen-backend.agent) | 0
-rw-r--r--  tools/hotplug/Linux/xen-backend.rules (renamed from tools/examples/xen-backend.rules) | 0
-rw-r--r--  tools/hotplug/Linux/xen-hotplug-cleanup (renamed from tools/examples/xen-hotplug-cleanup) | 0
-rw-r--r--  tools/hotplug/Linux/xen-hotplug-common.sh (renamed from tools/examples/xen-hotplug-common.sh) | 0
-rw-r--r--  tools/hotplug/Linux/xen-network-common.sh (renamed from tools/examples/xen-network-common.sh) | 0
-rw-r--r--  tools/hotplug/Linux/xen-script-common.sh (renamed from tools/examples/xen-script-common.sh) | 0
-rw-r--r--  tools/hotplug/Makefile | 9
-rw-r--r--  tools/hotplug/NetBSD/Makefile | 41
-rw-r--r--  tools/hotplug/NetBSD/block-nbsd | 88
-rw-r--r--  tools/hotplug/NetBSD/qemu-ifup-nbsd | 3
-rw-r--r--  tools/hotplug/NetBSD/vif-bridge-nbsd | 35
-rw-r--r--  tools/hotplug/NetBSD/vif-ip-nbsd | 33
-rw-r--r--  tools/hotplug/common/Makefile | 37
-rw-r--r--  tools/libxc/xc_cpufeature.h | 1
-rw-r--r--  tools/libxc/xc_cpuid_x86.c | 3
-rw-r--r--  tools/misc/xenpm.c | 2
-rw-r--r--  tools/python/xen/util/rwlock.py | 137
-rw-r--r--  tools/python/xen/xend/XendAPI.py | 2
-rw-r--r--  tools/python/xen/xend/XendConfig.py | 2
-rw-r--r--  tools/python/xen/xend/XendDomain.py | 29
-rw-r--r--  tools/python/xen/xend/XendDomainInfo.py | 220
-rw-r--r--  tools/python/xen/xend/osdep.py | 29
-rw-r--r--  tools/python/xen/xend/server/pciif.py | 47
-rw-r--r--  tools/python/xen/xm/create.py | 5
-rw-r--r--  unmodified_drivers/linux-2.6/balloon/Kbuild | 5
-rwxr-xr-x  unmodified_drivers/linux-2.6/mkbuildtree | 1
-rw-r--r--  unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c | 4
-rw-r--r--  unmodified_drivers/linux-2.6/platform-pci/platform-compat.c | 4
-rw-r--r--  xen/Rules.mk | 3
-rw-r--r--  xen/arch/x86/Makefile | 1
-rw-r--r--  xen/arch/x86/acpi/boot.c | 4
-rw-r--r--  xen/arch/x86/acpi/power.c | 35
-rw-r--r--  xen/arch/x86/copy_page.S | 66
-rw-r--r--  xen/arch/x86/cpu/common.c | 9
-rw-r--r--  xen/arch/x86/cpu/mcheck/p4.c | 2
-rw-r--r--  xen/arch/x86/domain.c | 39
-rw-r--r--  xen/arch/x86/domain_build.c | 39
-rw-r--r--  xen/arch/x86/domctl.c | 13
-rw-r--r--  xen/arch/x86/hpet.c | 17
-rw-r--r--  xen/arch/x86/hvm/hvm.c | 43
-rw-r--r--  xen/arch/x86/hvm/mtrr.c | 16
-rw-r--r--  xen/arch/x86/hvm/svm/svm.c | 24
-rw-r--r--  xen/arch/x86/hvm/vioapic.c | 4
-rw-r--r--  xen/arch/x86/hvm/vlapic.c | 36
-rw-r--r--  xen/arch/x86/hvm/vmsi.c | 2
-rw-r--r--  xen/arch/x86/hvm/vmx/intr.c | 4
-rw-r--r--  xen/arch/x86/hvm/vmx/realmode.c | 23
-rw-r--r--  xen/arch/x86/hvm/vmx/vmcs.c | 5
-rw-r--r--  xen/arch/x86/hvm/vmx/vmx.c | 77
-rw-r--r--  xen/arch/x86/hvm/vmx/vpmu_core2.c | 2
-rw-r--r--  xen/arch/x86/i8259.c | 2
-rw-r--r--  xen/arch/x86/io_apic.c | 24
-rw-r--r--  xen/arch/x86/irq.c | 53
-rw-r--r--  xen/arch/x86/mm.c | 98
-rw-r--r--  xen/arch/x86/mm/Makefile | 6
-rw-r--r--  xen/arch/x86/mm/guest_walk.c | 260
-rw-r--r--  xen/arch/x86/mm/hap/guest_walk.c | 173
-rw-r--r--  xen/arch/x86/mm/hap/private.h | 34
-rw-r--r--  xen/arch/x86/mm/p2m.c | 2
-rw-r--r--  xen/arch/x86/mm/page-guest32.h | 100
-rw-r--r--  xen/arch/x86/mm/shadow/multi.c | 399
-rw-r--r--  xen/arch/x86/mm/shadow/types.h | 203
-rw-r--r--  xen/arch/x86/msi.c | 39
-rw-r--r--  xen/arch/x86/setup.c | 8
-rw-r--r--  xen/arch/x86/smpboot.c | 17
-rw-r--r--  xen/arch/x86/time.c | 134
-rw-r--r--  xen/arch/x86/traps.c | 11
-rw-r--r--  xen/arch/x86/x86_32/mm.c | 24
-rw-r--r--  xen/arch/x86/x86_64/mm.c | 19
-rw-r--r--  xen/common/kernel.c | 3
-rw-r--r--  xen/drivers/passthrough/io.c | 4
-rw-r--r--  xen/drivers/passthrough/pci.c | 6
-rw-r--r--  xen/drivers/passthrough/vtd/dmar.c | 4
-rw-r--r--  xen/drivers/passthrough/vtd/ia64/vtd.c | 5
-rw-r--r--  xen/drivers/passthrough/vtd/intremap.c | 33
-rw-r--r--  xen/drivers/passthrough/vtd/iommu.c | 30
-rw-r--r--  xen/drivers/passthrough/vtd/qinval.c | 2
-rw-r--r--  xen/drivers/passthrough/vtd/vtd.h | 2
-rw-r--r--  xen/drivers/passthrough/vtd/x86/vtd.c | 11
-rw-r--r--  xen/include/asm-ia64/hvm/irq.h | 2
-rw-r--r--  xen/include/asm-ia64/linux/asm/irq.h | 1
-rw-r--r--  xen/include/asm-x86/acpi.h | 3
-rw-r--r--  xen/include/asm-x86/config.h | 6
-rw-r--r--  xen/include/asm-x86/cpufeature.h | 1
-rw-r--r--  xen/include/asm-x86/domain.h | 3
-rw-r--r--  xen/include/asm-x86/guest_pt.h | 291
-rw-r--r--  xen/include/asm-x86/hpet.h | 4
-rw-r--r--  xen/include/asm-x86/hvm/irq.h | 1
-rw-r--r--  xen/include/asm-x86/hvm/vlapic.h | 3
-rw-r--r--  xen/include/asm-x86/hvm/vmx/vmx.h | 6
-rw-r--r--  xen/include/asm-x86/irq.h | 2
-rw-r--r--  xen/include/asm-x86/mach-default/irq_vectors.h | 4
-rw-r--r--  xen/include/asm-x86/mm.h | 1
-rw-r--r--  xen/include/asm-x86/msi.h | 8
-rw-r--r--  xen/include/asm-x86/page.h | 6
-rw-r--r--  xen/include/asm-x86/perfc_defn.h | 2
-rw-r--r--  xen/include/asm-x86/pirq.h | 11
-rw-r--r--  xen/include/asm-x86/x86_32/page.h | 2
-rw-r--r--  xen/include/asm-x86/x86_64/page.h | 2
-rw-r--r--  xen/include/public/features.h | 6
-rw-r--r--  xen/include/public/grant_table.h | 9
-rw-r--r--  xen/include/public/io/pciif.h | 35
-rw-r--r--  xen/include/public/kexec.h | 21
-rw-r--r--  xen/include/xen/hvm/irq.h | 2
-rw-r--r--  xen/include/xen/hypercall.h | 6
-rw-r--r--  xen/include/xen/iommu.h | 4
-rw-r--r--  xen/include/xen/irq.h | 11
-rw-r--r--  xen/include/xen/kexec.h | 21
141 files changed, 2368 insertions, 1477 deletions
diff --git a/extras/mini-os/include/posix/net/if.h b/extras/mini-os/include/posix/net/if.h
new file mode 100644
index 0000000000..5be77d4f49
--- /dev/null
+++ b/extras/mini-os/include/posix/net/if.h
@@ -0,0 +1,85 @@
+/*
+ * This code is mostly taken from NetBSD net/if.h
+ * Changes: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
+ *
+ ******************************************************************************
+ *
+ * Copyright (c) 1999, 2000, 2001 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by William Studenmund and Jason R. Thorpe.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#ifndef _NET_IF_H_
+#define _NET_IF_H_
+
+/*
+ * Length of interface external name, including terminating '\0'.
+ * Note: this is the same size as a generic device's external name.
+ */
+#define IF_NAMESIZE 16
+
+struct if_nameindex {
+ unsigned int if_index; /* 1, 2, ... */
+ char *if_name; /* null terminated name: "le0", ... */
+};
+
+unsigned int if_nametoindex(const char *);
+char * if_indextoname(unsigned int, char *);
+struct if_nameindex * if_nameindex(void);
+void if_freenameindex(struct if_nameindex *);
+
+#endif /* !_NET_IF_H_ */
+
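A usage sketch, not part of the patch: the name/index API declared above is the standard one, so a caller looks like the C fragment below (the interface name "le0" is hypothetical, taken from the comment above; the sys.c hunk below stubs all four functions out on Mini-OS, so there these calls simply report failure).

    #include <stdio.h>
    #include <net/if.h>

    int main(void)
    {
        char name[IF_NAMESIZE];
        unsigned int idx = if_nametoindex("le0");

        if ( idx == 0 || if_indextoname(idx, name) == NULL )
            printf("interface lookup failed\n");  /* always the case on Mini-OS */
        else
            printf("index %u -> name %s\n", idx, name);
        return 0;
    }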
diff --git a/extras/mini-os/lib/sys.c b/extras/mini-os/lib/sys.c
index 34e4fb6666..a07692b883 100644
--- a/extras/mini-os/lib/sys.c
+++ b/extras/mini-os/lib/sys.c
@@ -34,6 +34,7 @@
#include <sys/unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>
+#include <net/if.h>
#include <time.h>
#include <errno.h>
#include <fcntl.h>
@@ -1324,6 +1325,12 @@ unsupported_function(int, tcsetattr, -1);
unsupported_function(int, tcgetattr, 0);
unsupported_function(int, poll, -1);
+/* net/if.h */
+unsupported_function_log(unsigned int, if_nametoindex, -1);
+unsupported_function_log(char *, if_indextoname, (char *) NULL);
+unsupported_function_log(struct if_nameindex *, if_nameindex, (struct if_nameindex *) NULL);
+unsupported_function_crash(if_freenameindex);
+
/* Linuxish abi for the Caml runtime, don't support */
unsupported_function_log(struct dirent *, readdir64, NULL);
unsupported_function_log(int, getrusage, -1);
diff --git a/tools/Makefile b/tools/Makefile
index 09964323e9..0d872b6fa4 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -9,6 +9,7 @@ SUBDIRS-y += flask
SUBDIRS-y += xenstore
SUBDIRS-y += misc
SUBDIRS-y += examples
+SUBDIRS-y += hotplug
SUBDIRS-y += xentrace
SUBDIRS-$(CONFIG_XCUTILS) += xcutils
SUBDIRS-$(CONFIG_X86) += firmware
diff --git a/tools/examples/Makefile b/tools/examples/Makefile
index 39310394f5..6c8349b154 100644
--- a/tools/examples/Makefile
+++ b/tools/examples/Makefile
@@ -24,41 +24,6 @@ XEN_CONFIGS += xmexample.vti
XEN_CONFIGS += xend-pci-quirks.sxp
XEN_CONFIGS += xend-pci-permissive.sxp
-# Xen script dir and scripts to go there.
-XEN_SCRIPT_DIR = /etc/xen/scripts
-XEN_SCRIPTS = network-bridge vif-bridge
-XEN_SCRIPTS += network-route vif-route
-XEN_SCRIPTS += network-nat vif-nat
-XEN_SCRIPTS += block
-XEN_SCRIPTS += block-enbd block-nbd
-XEN_SCRIPTS += blktap
-XEN_SCRIPTS += vtpm vtpm-delete
-XEN_SCRIPTS += xen-hotplug-cleanup
-XEN_SCRIPTS += external-device-migrate
-XEN_SCRIPTS += vscsi
-XEN_SCRIPT_DATA = xen-script-common.sh locking.sh logging.sh
-XEN_SCRIPT_DATA += xen-hotplug-common.sh xen-network-common.sh vif-common.sh
-XEN_SCRIPT_DATA += block-common.sh vtpm-common.sh vtpm-hotplug-common.sh
-XEN_SCRIPT_DATA += vtpm-migration.sh vtpm-impl
-
-XEN_HOTPLUG_DIR = /etc/hotplug
-XEN_HOTPLUG_SCRIPTS = xen-backend.agent
-
-UDEV_RULES_DIR = /etc/udev
-UDEV_RULES = xen-backend.rules
-
-DI = $(if $(DISTDIR),$(shell readlink -f $(DISTDIR)),)
-DE = $(if $(DESTDIR),$(shell readlink -f $(DESTDIR)),)
-ifeq ($(findstring $(DI),$(DE)),$(DI))
-HOTPLUGS=install-hotplug install-udev
-else
-ifeq ($(shell [ -x /usr/bin/udevinfo ] && [ `/usr/bin/udevinfo -V | sed -e 's/^[^0-9]* \([0-9]\{1,\}\)[^0-9]\{0,\}/\1/'` -ge 059 ] && echo 1),1)
-HOTPLUGS=install-udev
-else
-HOTPLUGS=install-hotplug
-endif
-endif
-
.PHONY: all
all:
@@ -66,7 +31,7 @@ all:
build:
.PHONY: install
-install: all install-readmes install-initd install-configs install-scripts $(HOTPLUGS)
+install: all install-readmes install-configs $(HOTPLUGS)
.PHONY: install-readmes
install-readmes:
@@ -77,14 +42,6 @@ install-readmes:
$(INSTALL_DATA) $$i $(DESTDIR)$(XEN_CONFIG_DIR); \
done
-.PHONY: install-initd
-install-initd:
- [ -d $(DESTDIR)/etc/init.d ] || $(INSTALL_DIR) $(DESTDIR)/etc/init.d
- [ -d $(DESTDIR)/etc/sysconfig ] || $(INSTALL_DIR) $(DESTDIR)/etc/sysconfig
- $(INSTALL_PROG) $(XEND_INITD) $(DESTDIR)/etc/init.d
- $(INSTALL_PROG) $(XENDOMAINS_INITD) $(DESTDIR)/etc/init.d
- $(INSTALL_PROG) $(XENDOMAINS_SYSCONFIG) $(DESTDIR)/etc/sysconfig/xendomains
-
.PHONY: install-configs
install-configs: $(XEN_CONFIGS)
[ -d $(DESTDIR)$(XEN_CONFIG_DIR) ] || \
@@ -96,19 +53,6 @@ install-configs: $(XEN_CONFIGS)
$(INSTALL_DATA) $$i $(DESTDIR)$(XEN_CONFIG_DIR); \
done
-.PHONY: install-scripts
-install-scripts:
- [ -d $(DESTDIR)$(XEN_SCRIPT_DIR) ] || \
- $(INSTALL_DIR) $(DESTDIR)$(XEN_SCRIPT_DIR)
- set -e; for i in $(XEN_SCRIPTS); \
- do \
- $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
- done
- set -e; for i in $(XEN_SCRIPT_DATA); \
- do \
- $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
- done
-
.PHONY: install-hotplug
install-hotplug:
[ -d $(DESTDIR)$(XEN_HOTPLUG_DIR) ] || \
diff --git a/tools/firmware/hvmloader/config.h b/tools/firmware/hvmloader/config.h
index 32011cd5a1..ea0c435ae4 100644
--- a/tools/firmware/hvmloader/config.h
+++ b/tools/firmware/hvmloader/config.h
@@ -23,7 +23,6 @@
/* Memory map. */
#define HYPERCALL_PHYSICAL_ADDRESS 0x00080000
#define VGABIOS_PHYSICAL_ADDRESS 0x000C0000
-#define ETHERBOOT_PHYSICAL_ADDRESS 0x000D0000
#define SMBIOS_PHYSICAL_ADDRESS 0x000E9000
#define SMBIOS_MAXIMUM_SIZE 0x00001000
#define ACPI_PHYSICAL_ADDRESS 0x000EA000
diff --git a/tools/firmware/hvmloader/hvmloader.c b/tools/firmware/hvmloader/hvmloader.c
index 9dff7cc08d..ec066cbf68 100644
--- a/tools/firmware/hvmloader/hvmloader.c
+++ b/tools/firmware/hvmloader/hvmloader.c
@@ -322,60 +322,56 @@ static void pci_setup(void)
}
/*
- * Scan the PCI bus for the first NIC supported by etherboot, and copy
- * the corresponding rom data to *copy_rom_dest. Returns the length of the
- * selected rom, or 0 if no NIC found.
+ * Scan the list of Option ROMs at @roms for one which supports
+ * PCI (@vendor_id, @device_id) found at slot @devfn. If one is found,
+ * copy it to @dest and return its size rounded up to a multiple of 2kB. This
+ * function will not copy ROMs beyond address 0xE0000.
*/
-static int scan_etherboot_nic(void *copy_rom_dest)
+#define round_option_rom(x) (((x) + 2047) & ~2047)
+static int scan_option_rom(
+ uint8_t devfn, uint16_t vendor_id, uint16_t device_id,
+ void *roms, uint32_t dest)
{
struct option_rom_header *rom;
struct option_rom_pnp_header *pnph;
struct option_rom_pci_header *pcih;
- uint32_t devfn;
- uint16_t class, vendor_id, device_id;
uint8_t csum;
int i;
- for ( devfn = 0; devfn < 128; devfn++ )
- {
- class = pci_readw(devfn, PCI_CLASS_DEVICE);
- vendor_id = pci_readw(devfn, PCI_VENDOR_ID);
- device_id = pci_readw(devfn, PCI_DEVICE_ID);
+ static uint32_t orom_ids[64];
+ static int nr_roms;
- if ( (vendor_id == 0xffff) && (device_id == 0xffff) )
- continue;
+ /* Avoid duplicate ROMs. */
+ for ( i = 0; i < nr_roms; i++ )
+ if ( orom_ids[i] == (vendor_id | ((uint32_t)device_id << 16)) )
+ return 0;
- /* We're only interested in NICs. */
- if ( class != 0x0200 )
- continue;
+ rom = roms;
+ for ( ; ; )
+ {
+ /* Invalid signature means we're out of option ROMs. */
+ if ( strncmp((char *)rom->signature, "\x55\xaa", 2) ||
+ (rom->rom_size == 0) )
+ break;
- rom = (struct option_rom_header *)etherboot;
- for ( ; ; )
- {
- /* Invalid signature means we're out of option ROMs. */
- if ( strncmp((char *)rom->signature, "\x55\xaa", 2) ||
- (rom->rom_size == 0) )
- break;
-
- /* Invalid checksum means we're out of option ROMs. */
- csum = 0;
- for ( i = 0; i < (rom->rom_size * 512); i++ )
- csum += ((uint8_t *)rom)[i];
- if ( csum != 0 )
- break;
-
- /* Check the PCI PnP header (if any) for a match. */
- pcih = (struct option_rom_pci_header *)
- ((char *)rom + rom->pci_header_offset);
- if ( (rom->pci_header_offset != 0) &&
- !strncmp((char *)pcih->signature, "PCIR", 4) &&
- (pcih->vendor_id == vendor_id) &&
- (pcih->device_id == device_id) )
- goto found;
-
- rom = (struct option_rom_header *)
- ((char *)rom + rom->rom_size * 512);
- }
+ /* Invalid checksum means we're out of option ROMs. */
+ csum = 0;
+ for ( i = 0; i < (rom->rom_size * 512); i++ )
+ csum += ((uint8_t *)rom)[i];
+ if ( csum != 0 )
+ break;
+
+ /* Check the PCI PnP header (if any) for a match. */
+ pcih = (struct option_rom_pci_header *)
+ ((char *)rom + rom->pci_header_offset);
+ if ( (rom->pci_header_offset != 0) &&
+ !strncmp((char *)pcih->signature, "PCIR", 4) &&
+ (pcih->vendor_id == vendor_id) &&
+ (pcih->device_id == device_id) )
+ goto found;
+
+ rom = (struct option_rom_header *)
+ ((char *)rom + rom->rom_size * 512);
}
return 0;
@@ -392,15 +388,96 @@ static int scan_etherboot_nic(void *copy_rom_dest)
((char *)rom + pnph->next_header_offset))
: ((struct option_rom_pnp_header *)NULL));
- printf("Loading PXE ROM ...\n");
+ printf("Loading PCI Option ROM ...\n");
if ( (pnph != NULL) && (pnph->manufacturer_name_offset != 0) )
printf(" - Manufacturer: %s\n",
(char *)rom + pnph->manufacturer_name_offset);
if ( (pnph != NULL) && (pnph->product_name_offset != 0) )
printf(" - Product name: %s\n",
(char *)rom + pnph->product_name_offset);
- memcpy(copy_rom_dest, rom, rom->rom_size * 512);
- return rom->rom_size * 512;
+
+ if ( (dest + rom->rom_size * 512 + 1) > 0xe0000u )
+ {
+ printf("Option ROM size %x exceeds available space\n",
+ rom->rom_size * 512);
+ return 0;
+ }
+
+ orom_ids[nr_roms++] = vendor_id | ((uint32_t)device_id << 16);
+ memcpy((void *)dest, rom, rom->rom_size * 512);
+ *(uint8_t *)(dest + rom->rom_size * 512) = devfn;
+ return round_option_rom(rom->rom_size * 512 + 1);
+}
+
+/*
+ * Scan the PCI bus for the first NIC supported by etherboot, and copy
+ * the corresponding ROM data to the address copy_rom_dest. Returns the
+ * length of the selected ROM, or 0 if no NIC is found.
+ */
+static int scan_etherboot_nic(uint32_t copy_rom_dest)
+{
+ uint8_t devfn;
+ uint16_t class, vendor_id, device_id;
+
+ for ( devfn = 0; devfn < 128; devfn++ )
+ {
+ class = pci_readw(devfn, PCI_CLASS_DEVICE);
+ vendor_id = pci_readw(devfn, PCI_VENDOR_ID);
+ device_id = pci_readw(devfn, PCI_DEVICE_ID);
+
+ /* We're only interested in NICs. */
+ if ( (vendor_id != 0xffff) &&
+ (device_id != 0xffff) &&
+ (class == 0x0200) )
+ return scan_option_rom(
+ devfn, vendor_id, device_id, etherboot, copy_rom_dest);
+ }
+
+ return 0;
+}
+
+/*
+ * Scan the PCI bus for the devices that have an option ROM, and copy
+ * the corresponding rom data to rom_phys_addr.
+ */
+static int pci_load_option_roms(uint32_t rom_base_addr)
+{
+ uint32_t option_rom_addr, rom_phys_addr = rom_base_addr;
+ uint16_t vendor_id, device_id;
+ uint8_t devfn, class;
+
+ for ( devfn = 0; devfn < 128; devfn++ )
+ {
+ class = pci_readb(devfn, PCI_CLASS_DEVICE + 1);
+ vendor_id = pci_readw(devfn, PCI_VENDOR_ID);
+ device_id = pci_readw(devfn, PCI_DEVICE_ID);
+
+ if ( (vendor_id == 0xffff) && (device_id == 0xffff) )
+ continue;
+
+ /*
+ * Currently we only scan option ROMs from mass storage devices and
+ * serial bus controllers (Fibre Channel included).
+ */
+ if ( (class != 0x1) && (class != 0xc) )
+ continue;
+
+ option_rom_addr = pci_readl(devfn, PCI_ROM_ADDRESS);
+ if ( !option_rom_addr )
+ continue;
+
+ /* Ensure the Expansion ROM BAR is enabled before copying. */
+ pci_writel(devfn, PCI_ROM_ADDRESS, option_rom_addr | 0x1);
+
+ rom_phys_addr += scan_option_rom(
+ devfn, vendor_id, device_id,
+ (void *)(option_rom_addr & ~2047), rom_phys_addr);
+
+ /* Restore the original value of the Expansion ROM BAR. */
+ pci_writel(devfn, PCI_ROM_ADDRESS, option_rom_addr);
+ }
+
+ return rom_phys_addr - rom_base_addr;
}
/* Replace possibly erroneous memory-size CMOS fields with correct values. */
@@ -461,8 +538,9 @@ static uint16_t init_xen_platform_io_base(void)
int main(void)
{
- int vgabios_sz = 0, etherboot_sz = 0, rombios_sz, smbios_sz;
- uint32_t vga_ram = 0;
+ int option_rom_sz = 0, vgabios_sz = 0, etherboot_sz = 0;
+ int rombios_sz, smbios_sz;
+ uint32_t etherboot_phys_addr, option_rom_phys_addr, vga_ram = 0;
uint16_t xen_pfiob;
printf("HVM Loader\n");
@@ -497,13 +575,13 @@ int main(void)
printf("Loading Cirrus VGABIOS ...\n");
memcpy((void *)VGABIOS_PHYSICAL_ADDRESS,
vgabios_cirrusvga, sizeof(vgabios_cirrusvga));
- vgabios_sz = sizeof(vgabios_cirrusvga);
+ vgabios_sz = round_option_rom(sizeof(vgabios_cirrusvga));
break;
case VGA_std:
printf("Loading Standard VGABIOS ...\n");
memcpy((void *)VGABIOS_PHYSICAL_ADDRESS,
vgabios_stdvga, sizeof(vgabios_stdvga));
- vgabios_sz = sizeof(vgabios_stdvga);
+ vgabios_sz = round_option_rom(sizeof(vgabios_stdvga));
break;
default:
printf("No emulated VGA adaptor ...\n");
@@ -516,7 +594,11 @@ int main(void)
printf("VGA RAM at %08x\n", vga_ram);
}
- etherboot_sz = scan_etherboot_nic((void*)ETHERBOOT_PHYSICAL_ADDRESS);
+ etherboot_phys_addr = VGABIOS_PHYSICAL_ADDRESS + vgabios_sz;
+ etherboot_sz = scan_etherboot_nic(etherboot_phys_addr);
+
+ option_rom_phys_addr = etherboot_phys_addr + etherboot_sz;
+ option_rom_sz = pci_load_option_roms(option_rom_phys_addr);
if ( get_acpi_enabled() )
{
@@ -533,8 +615,12 @@ int main(void)
VGABIOS_PHYSICAL_ADDRESS + vgabios_sz - 1);
if ( etherboot_sz )
printf(" %05x-%05x: Etherboot ROM\n",
- ETHERBOOT_PHYSICAL_ADDRESS,
- ETHERBOOT_PHYSICAL_ADDRESS + etherboot_sz - 1);
+ etherboot_phys_addr,
+ etherboot_phys_addr + etherboot_sz - 1);
+ if ( option_rom_sz )
+ printf(" %05x-%05x: PCI Option ROMs\n",
+ option_rom_phys_addr,
+ option_rom_phys_addr + option_rom_sz - 1);
if ( smbios_sz )
printf(" %05x-%05x: SMBIOS tables\n",
SMBIOS_PHYSICAL_ADDRESS,
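A worked sketch of the layout arithmetic used above (illustration only, with made-up sizes): round_option_rom() rounds each image, plus the trailing devfn byte that scan_option_rom() appends, up to the next 2kB boundary, and main() packs the VGABIOS, the Etherboot ROM and any PCI option ROMs back to back from VGABIOS_PHYSICAL_ADDRESS, refusing anything that would cross 0xE0000. The rombios.c hunk below reads that trailing devfn byte back before invoking each ROM.

    #define round_option_rom(x) (((x) + 2047) & ~2047)

    /* e.g. a 67-sector Etherboot image: 67 * 512 = 34304 bytes, plus the
     * one devfn byte, rounds up to 17 * 2048 = 34816 bytes (0x8800). */
    unsigned int etherboot_sz = round_option_rom(67 * 512 + 1);

    /* Images are then placed consecutively: */
    uint32_t etherboot_phys_addr  = VGABIOS_PHYSICAL_ADDRESS + vgabios_sz;
    uint32_t option_rom_phys_addr = etherboot_phys_addr + etherboot_sz;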
diff --git a/tools/firmware/rombios/rombios.c b/tools/firmware/rombios/rombios.c
index 0edd371765..05ee9875c6 100644
--- a/tools/firmware/rombios/rombios.c
+++ b/tools/firmware/rombios/rombios.c
@@ -9677,20 +9677,35 @@ block_count_rounded:
pop ds
pop ax
#endif
- xor bx, bx ;; Restore DS back to 0000:
- mov ds, bx
push ax ;; Save AX
push di ;; Save DI
;; Push addr of ROM entry point
push cx ;; Push seg
push #0x0003 ;; Push offset
+ ;; Get the BDF into ax before invoking the option ROM
+ mov bl, [2]
+ mov al, bl
+ shr al, #7
+ cmp al, #1
+ jne fetch_bdf
+ mov ax, ds ;; Increment DS since the ROM size is larger than a segment
+ add ax, #0x1000
+ mov ds, ax
+fetch_bdf:
+ shl bx, #9
+ xor ax, ax
+ mov al, [bx]
+
;; Point ES:DI at "$PnP", which tells the ROM that we are a PnP BIOS.
;; That should stop it grabbing INT 19h; we will use its BEV instead.
- mov ax, #0xf000
- mov es, ax
+ mov bx, #0xf000
+ mov es, bx
lea di, pnp_string
+ xor bx, bx ;; Restore DS back to 0000:
+ mov ds, bx
+
mov bp, sp ;; Call ROM init routine using seg:off on stack
db 0xff ;; call_far ss:[bp+0]
db 0x5e
diff --git a/tools/hotplug/Linux/Makefile b/tools/hotplug/Linux/Makefile
new file mode 100644
index 0000000000..19ba78ffdb
--- /dev/null
+++ b/tools/hotplug/Linux/Makefile
@@ -0,0 +1,97 @@
+XEN_ROOT = ../../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+# Init scripts.
+XEND_INITD = init.d/xend
+XENDOMAINS_INITD = init.d/xendomains
+XENDOMAINS_SYSCONFIG = init.d/sysconfig.xendomains
+
+# Xen configuration dir and configs to go there.
+XEN_CONFIG_DIR = /etc/xen
+
+# Xen script dir and scripts to go there.
+XEN_SCRIPT_DIR = /etc/xen/scripts
+XEN_SCRIPTS = network-bridge vif-bridge
+XEN_SCRIPTS += network-route vif-route
+XEN_SCRIPTS += network-nat vif-nat
+XEN_SCRIPTS += block
+XEN_SCRIPTS += block-enbd block-nbd
+XEN_SCRIPTS += blktap
+XEN_SCRIPTS += vtpm vtpm-delete
+XEN_SCRIPTS += xen-hotplug-cleanup
+XEN_SCRIPTS += external-device-migrate
+XEN_SCRIPTS += vscsi
+XEN_SCRIPT_DATA = xen-script-common.sh locking.sh logging.sh
+XEN_SCRIPT_DATA += xen-hotplug-common.sh xen-network-common.sh vif-common.sh
+XEN_SCRIPT_DATA += block-common.sh vtpm-common.sh vtpm-hotplug-common.sh
+XEN_SCRIPT_DATA += vtpm-migration.sh vtpm-impl
+
+XEN_HOTPLUG_DIR = /etc/hotplug
+XEN_HOTPLUG_SCRIPTS = xen-backend.agent
+
+UDEV_RULES_DIR = /etc/udev
+UDEV_RULES = xen-backend.rules
+
+DI = $(if $(DISTDIR),$(shell readlink -f $(DISTDIR)),)
+DE = $(if $(DESTDIR),$(shell readlink -f $(DESTDIR)),)
+ifeq ($(findstring $(DI),$(DE)),$(DI))
+HOTPLUGS=install-hotplug install-udev
+else
+ifeq ($(shell [ -x /usr/bin/udevinfo ] && [ `/usr/bin/udevinfo -V | sed -e 's/^[^0-9]* \([0-9]\{1,\}\)[^0-9]\{0,\}/\1/'` -ge 059 ] && echo 1),1)
+HOTPLUGS=install-udev
+else
+HOTPLUGS=install-hotplug
+endif
+endif
+
+.PHONY: all
+all:
+
+.PHONY: build
+build:
+
+.PHONY: install
+install: all install-initd install-scripts $(HOTPLUGS)
+
+.PHONY: install-initd
+install-initd:
+ [ -d $(DESTDIR)/etc/init.d ] || $(INSTALL_DIR) $(DESTDIR)/etc/init.d
+ [ -d $(DESTDIR)/etc/sysconfig ] || $(INSTALL_DIR) $(DESTDIR)/etc/sysconfig
+ $(INSTALL_PROG) $(XEND_INITD) $(DESTDIR)/etc/init.d
+ $(INSTALL_PROG) $(XENDOMAINS_INITD) $(DESTDIR)/etc/init.d
+ $(INSTALL_PROG) $(XENDOMAINS_SYSCONFIG) $(DESTDIR)/etc/sysconfig/xendomains
+
+.PHONY: install-scripts
+install-scripts:
+ [ -d $(DESTDIR)$(XEN_SCRIPT_DIR) ] || \
+ $(INSTALL_DIR) $(DESTDIR)$(XEN_SCRIPT_DIR)
+ set -e; for i in $(XEN_SCRIPTS); \
+ do \
+ $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
+ done
+ set -e; for i in $(XEN_SCRIPT_DATA); \
+ do \
+ $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
+ done
+
+.PHONY: install-hotplug
+install-hotplug:
+ [ -d $(DESTDIR)$(XEN_HOTPLUG_DIR) ] || \
+ $(INSTALL_DIR) $(DESTDIR)$(XEN_HOTPLUG_DIR)
+ set -e; for i in $(XEN_HOTPLUG_SCRIPTS); \
+ do \
+ $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_HOTPLUG_DIR); \
+ done
+
+.PHONY: install-udev
+install-udev:
+ [ -d $(DESTDIR)$(UDEV_RULES_DIR) ] || \
+ $(INSTALL_DIR) $(DESTDIR)$(UDEV_RULES_DIR)/rules.d
+ set -e; for i in $(UDEV_RULES); \
+ do \
+ $(INSTALL_DATA) $$i $(DESTDIR)$(UDEV_RULES_DIR); \
+ ln -sf ../$$i $(DESTDIR)$(UDEV_RULES_DIR)/rules.d; \
+ done
+
+.PHONY: clean
+clean:
diff --git a/tools/examples/blktap b/tools/hotplug/Linux/blktap
index 01a0f6c6da..01a0f6c6da 100644
--- a/tools/examples/blktap
+++ b/tools/hotplug/Linux/blktap
diff --git a/tools/examples/block b/tools/hotplug/Linux/block
index 8c61744c83..8c61744c83 100644
--- a/tools/examples/block
+++ b/tools/hotplug/Linux/block
diff --git a/tools/examples/block-common.sh b/tools/hotplug/Linux/block-common.sh
index a0ebc9b12a..a0ebc9b12a 100644
--- a/tools/examples/block-common.sh
+++ b/tools/hotplug/Linux/block-common.sh
diff --git a/tools/examples/block-enbd b/tools/hotplug/Linux/block-enbd
index 67faa84268..67faa84268 100755..100644
--- a/tools/examples/block-enbd
+++ b/tools/hotplug/Linux/block-enbd
diff --git a/tools/examples/block-nbd b/tools/hotplug/Linux/block-nbd
index b29b31564a..b29b31564a 100644
--- a/tools/examples/block-nbd
+++ b/tools/hotplug/Linux/block-nbd
diff --git a/tools/examples/external-device-migrate b/tools/hotplug/Linux/external-device-migrate
index a4113483a8..a4113483a8 100644
--- a/tools/examples/external-device-migrate
+++ b/tools/hotplug/Linux/external-device-migrate
diff --git a/tools/examples/init.d/sysconfig.xendomains b/tools/hotplug/Linux/init.d/sysconfig.xendomains
index e93b1a40b9..e93b1a40b9 100644
--- a/tools/examples/init.d/sysconfig.xendomains
+++ b/tools/hotplug/Linux/init.d/sysconfig.xendomains
diff --git a/tools/examples/init.d/xend b/tools/hotplug/Linux/init.d/xend
index 4bfc799465..4bfc799465 100755
--- a/tools/examples/init.d/xend
+++ b/tools/hotplug/Linux/init.d/xend
diff --git a/tools/examples/init.d/xendomains b/tools/hotplug/Linux/init.d/xendomains
index 5c2e492f03..5c2e492f03 100644
--- a/tools/examples/init.d/xendomains
+++ b/tools/hotplug/Linux/init.d/xendomains
diff --git a/tools/examples/locking.sh b/tools/hotplug/Linux/locking.sh
index 6ff58e7e6c..6ff58e7e6c 100644
--- a/tools/examples/locking.sh
+++ b/tools/hotplug/Linux/locking.sh
diff --git a/tools/examples/logging.sh b/tools/hotplug/Linux/logging.sh
index c1bc699c7b..c1bc699c7b 100644
--- a/tools/examples/logging.sh
+++ b/tools/hotplug/Linux/logging.sh
diff --git a/tools/examples/network-bridge b/tools/hotplug/Linux/network-bridge
index 9d7be4e2e5..9d7be4e2e5 100755..100644
--- a/tools/examples/network-bridge
+++ b/tools/hotplug/Linux/network-bridge
diff --git a/tools/examples/network-nat b/tools/hotplug/Linux/network-nat
index d9c62c6160..d9c62c6160 100644
--- a/tools/examples/network-nat
+++ b/tools/hotplug/Linux/network-nat
diff --git a/tools/examples/network-route b/tools/hotplug/Linux/network-route
index 574441e334..574441e334 100755..100644
--- a/tools/examples/network-route
+++ b/tools/hotplug/Linux/network-route
diff --git a/tools/examples/vif-bridge b/tools/hotplug/Linux/vif-bridge
index 1b698d703b..1b698d703b 100755..100644
--- a/tools/examples/vif-bridge
+++ b/tools/hotplug/Linux/vif-bridge
diff --git a/tools/examples/vif-common.sh b/tools/hotplug/Linux/vif-common.sh
index ee67ee2aaa..ee67ee2aaa 100644
--- a/tools/examples/vif-common.sh
+++ b/tools/hotplug/Linux/vif-common.sh
diff --git a/tools/examples/vif-nat b/tools/hotplug/Linux/vif-nat
index 75bdf5c444..75bdf5c444 100644
--- a/tools/examples/vif-nat
+++ b/tools/hotplug/Linux/vif-nat
diff --git a/tools/examples/vif-route b/tools/hotplug/Linux/vif-route
index f5fd88ed5a..f5fd88ed5a 100755..100644
--- a/tools/examples/vif-route
+++ b/tools/hotplug/Linux/vif-route
diff --git a/tools/examples/vscsi b/tools/hotplug/Linux/vscsi
index 5ac26147ec..5ac26147ec 100644
--- a/tools/examples/vscsi
+++ b/tools/hotplug/Linux/vscsi
diff --git a/tools/examples/vtpm b/tools/hotplug/Linux/vtpm
index 38a4532fc2..38a4532fc2 100644
--- a/tools/examples/vtpm
+++ b/tools/hotplug/Linux/vtpm
diff --git a/tools/examples/vtpm-common.sh b/tools/hotplug/Linux/vtpm-common.sh
index a45868eefd..a45868eefd 100644
--- a/tools/examples/vtpm-common.sh
+++ b/tools/hotplug/Linux/vtpm-common.sh
diff --git a/tools/examples/vtpm-delete b/tools/hotplug/Linux/vtpm-delete
index b75b95bf0a..b75b95bf0a 100644
--- a/tools/examples/vtpm-delete
+++ b/tools/hotplug/Linux/vtpm-delete
diff --git a/tools/examples/vtpm-hotplug-common.sh b/tools/hotplug/Linux/vtpm-hotplug-common.sh
index 9fd35e7402..9fd35e7402 100644
--- a/tools/examples/vtpm-hotplug-common.sh
+++ b/tools/hotplug/Linux/vtpm-hotplug-common.sh
diff --git a/tools/examples/vtpm-impl b/tools/hotplug/Linux/vtpm-impl
index 4f9a1fd85e..4f9a1fd85e 100644
--- a/tools/examples/vtpm-impl
+++ b/tools/hotplug/Linux/vtpm-impl
diff --git a/tools/examples/vtpm-migration.sh b/tools/hotplug/Linux/vtpm-migration.sh
index 7e38ae26f0..7e38ae26f0 100644
--- a/tools/examples/vtpm-migration.sh
+++ b/tools/hotplug/Linux/vtpm-migration.sh
diff --git a/tools/examples/xen-backend.agent b/tools/hotplug/Linux/xen-backend.agent
index 5cb536a6a9..5cb536a6a9 100755..100644
--- a/tools/examples/xen-backend.agent
+++ b/tools/hotplug/Linux/xen-backend.agent
diff --git a/tools/examples/xen-backend.rules b/tools/hotplug/Linux/xen-backend.rules
index fe21fc1357..fe21fc1357 100644
--- a/tools/examples/xen-backend.rules
+++ b/tools/hotplug/Linux/xen-backend.rules
diff --git a/tools/examples/xen-hotplug-cleanup b/tools/hotplug/Linux/xen-hotplug-cleanup
index f7337e45bf..f7337e45bf 100644
--- a/tools/examples/xen-hotplug-cleanup
+++ b/tools/hotplug/Linux/xen-hotplug-cleanup
diff --git a/tools/examples/xen-hotplug-common.sh b/tools/hotplug/Linux/xen-hotplug-common.sh
index 980a62704e..980a62704e 100644
--- a/tools/examples/xen-hotplug-common.sh
+++ b/tools/hotplug/Linux/xen-hotplug-common.sh
diff --git a/tools/examples/xen-network-common.sh b/tools/hotplug/Linux/xen-network-common.sh
index 7014333df0..7014333df0 100644
--- a/tools/examples/xen-network-common.sh
+++ b/tools/hotplug/Linux/xen-network-common.sh
diff --git a/tools/examples/xen-script-common.sh b/tools/hotplug/Linux/xen-script-common.sh
index f6841acffa..f6841acffa 100644
--- a/tools/examples/xen-script-common.sh
+++ b/tools/hotplug/Linux/xen-script-common.sh
diff --git a/tools/hotplug/Makefile b/tools/hotplug/Makefile
new file mode 100644
index 0000000000..979e916d7f
--- /dev/null
+++ b/tools/hotplug/Makefile
@@ -0,0 +1,9 @@
+XEN_ROOT = ../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+SUBDIRS-y := common
+SUBDIRS-$(CONFIG_NetBSD) += NetBSD
+SUBDIRS-$(CONFIG_Linux) += Linux
+
+.PHONY: all clean install
+all clean install: %: subdirs-%
diff --git a/tools/hotplug/NetBSD/Makefile b/tools/hotplug/NetBSD/Makefile
new file mode 100644
index 0000000000..1d369eaf9a
--- /dev/null
+++ b/tools/hotplug/NetBSD/Makefile
@@ -0,0 +1,41 @@
+XEN_ROOT = ../../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+# Xen configuration dir and configs to go there.
+XEN_CONFIG_DIR = $(PREFIX)/etc/xen
+
+# Xen script dir and scripts to go there.
+XEN_SCRIPT_DIR = $(PREFIX)/etc/xen/scripts
+XEN_SCRIPTS =
+XEN_SCRIPTS += block-nbsd
+XEN_SCRIPTS += hvm-nbsd
+XEN_SCRIPTS += netbsd1-nbsd
+XEN_SCRIPTS += qemu-ifup-nbsd
+XEN_SCRIPTS += vif-bridge-nbsd
+XEN_SCRIPTS += vif-ip-nbsd
+
+XEN_SCRIPT_DATA =
+
+.PHONY: all
+all:
+
+.PHONY: build
+build:
+
+.PHONY: install
+install: all install-scripts
+
+.PHONY: install-scripts
+install-scripts:
+ $(INSTALL_DATA_DIR) $(DESTDIR)$(XEN_SCRIPT_DIR)
+ set -e; for i in $(XEN_SCRIPTS); \
+ do \
+ $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
+ done
+ set -e; for i in $(XEN_SCRIPT_DATA); \
+ do \
+ $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
+ done
+
+.PHONY: clean
+clean:
diff --git a/tools/hotplug/NetBSD/block-nbsd b/tools/hotplug/NetBSD/block-nbsd
new file mode 100644
index 0000000000..915ddb755a
--- /dev/null
+++ b/tools/hotplug/NetBSD/block-nbsd
@@ -0,0 +1,88 @@
+#!/bin/sh -e
+
+# $NetBSD: block-nbsd,v 1.1.1.1 2008/08/07 20:26:57 cegger Exp $
+# Called by xenbackendd
+# Usage: block xsdir_backend_path state
+
+PATH=/bin:/usr/bin:/sbin:/usr/sbin
+export PATH
+
+error() {
+ echo "$@" >&2
+ xenstore-write $xpath/hotplug-status error
+ exit 1
+}
+
+
+xpath=$1
+xstatus=$2
+xtype=$(xenstore-read "$xpath/type")
+xparams=$(xenstore-read "$xpath/params")
+
+case $xstatus in
+6)
+ # device removed
+ case $xtype in
+ file)
+ vnd=$(xenstore-read "$xpath/vnd" || echo none)
+ if [ $vnd != none ]; then
+ vnconfig -u $vnd
+ fi
+ ;;
+ phy)
+ ;;
+ *)
+ echo "unknown type $xtype" >&2
+ ;;
+ esac
+ xenstore-rm $xpath
+ exit 0
+ ;;
+2)
+ case $xtype in
+ file)
+ # Store the list of available vnd(4) devices in
+ # ``available_disks'', and mark them as ``free''.
+ list=`ls -1 /dev/vnd[0-9]*d | sed "s,/dev/vnd,,;s,d,," | sort -n`
+ for i in $list; do
+ disk="vnd$i"
+ available_disks="$available_disks $disk"
+ eval $disk=free
+ done
+ # Mark the used vnd(4) devices as ``used''.
+ for disk in `sysctl hw.disknames`; do
+ case $disk in
+ vnd[0-9]*) eval $disk=used ;;
+ esac
+ done
+ # Configure the first free vnd(4) device.
+ for disk in $available_disks; do
+ eval status=\$$disk
+ if [ "$status" = "free" ] && \
+ vnconfig /dev/${disk}d $xparams >/dev/null; then
+ device=/dev/${disk}d
+ echo vnconfig /dev/${disk}d $xparams
+ break
+ fi
+ done
+ if [ x$device = x ] ; then
+ error "no available vnd device"
+ fi
+ echo xenstore-write $xpath/vnd $device
+ xenstore-write $xpath/vnd $device
+ ;;
+ phy)
+ device=$xparams
+ ;;
+ esac
+ physical_device=$(stat -f '%r' "$device")
+ echo xenstore-write $xpath/physical-device $physical_device
+ xenstore-write $xpath/physical-device $physical_device
+ echo xenstore-write $xpath/hotplug-status connected
+ xenstore-write $xpath/hotplug-status connected
+ exit 0
+ ;;
+*)
+ exit 0
+ ;;
+esac
diff --git a/tools/hotplug/NetBSD/qemu-ifup-nbsd b/tools/hotplug/NetBSD/qemu-ifup-nbsd
new file mode 100644
index 0000000000..eee78765d6
--- /dev/null
+++ b/tools/hotplug/NetBSD/qemu-ifup-nbsd
@@ -0,0 +1,3 @@
+#!/bin/sh
+ifconfig $1 up
+exec /sbin/brconfig $2 add $1
diff --git a/tools/hotplug/NetBSD/vif-bridge-nbsd b/tools/hotplug/NetBSD/vif-bridge-nbsd
new file mode 100644
index 0000000000..bedb387953
--- /dev/null
+++ b/tools/hotplug/NetBSD/vif-bridge-nbsd
@@ -0,0 +1,35 @@
+#!/bin/sh -e
+
+# $NetBSD: vif-bridge-nbsd,v 1.1.1.1 2008/08/07 20:26:57 cegger Exp $
+# Called by xenbackendd
+# Usage: vif-bridge xsdir_backend_path state
+
+PATH=/bin:/usr/bin:/sbin:/usr/sbin
+export PATH
+
+xpath=$1
+xstatus=$2
+
+case $xstatus in
+6)
+ # device removed
+ xenstore-rm $xpath
+ exit 0
+ ;;
+2)
+ xbridge=$(xenstore-read "$xpath/bridge")
+ xfid=$(xenstore-read "$xpath/frontend-id")
+ xhandle=$(xenstore-read "$xpath/handle")
+ iface=xvif$xfid.$xhandle
+ echo ifconfig $iface up
+ ifconfig $iface up
+ brconfig $xbridge add $iface
+ echo brconfig $xbridge add $iface
+ xenstore-write $xpath/hotplug-status connected
+ echo xenstore-write $xpath/hotplug-status connected
+ exit 0
+ ;;
+*)
+ exit 0
+ ;;
+esac
diff --git a/tools/hotplug/NetBSD/vif-ip-nbsd b/tools/hotplug/NetBSD/vif-ip-nbsd
new file mode 100644
index 0000000000..d8b5bb9759
--- /dev/null
+++ b/tools/hotplug/NetBSD/vif-ip-nbsd
@@ -0,0 +1,33 @@
+#!/bin/sh -e
+
+# $NetBSD: vif-ip-nbsd,v 1.1.1.1 2008/08/07 20:26:57 cegger Exp $
+# Called by xenbackendd
+# Usage: vif-ip xsdir_backend_path state
+
+PATH=/bin:/usr/bin:/sbin:/usr/sbin
+export PATH
+
+xpath=$1
+xstatus=$2
+
+case $xstatus in
+6)
+ # device removed
+ xenstore-rm $xpath
+ exit 0
+ ;;
+2)
+ xip=$(xenstore-read "$xpath/ip")
+ xfid=$(xenstore-read "$xpath/frontend-id")
+ xhandle=$(xenstore-read "$xpath/handle")
+ iface=xvif$xfid.$xhandle
+ echo ifconfig $iface $xip up
+ ifconfig $iface $xip up
+ xenstore-write $xpath/hotplug-status connected
+ echo xenstore-write $xpath/hotplug-status connected
+ exit 0
+ ;;
+*)
+ exit 0
+ ;;
+esac
diff --git a/tools/hotplug/common/Makefile b/tools/hotplug/common/Makefile
new file mode 100644
index 0000000000..b69b9991af
--- /dev/null
+++ b/tools/hotplug/common/Makefile
@@ -0,0 +1,37 @@
+XEN_ROOT = ../../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+# OS-independent hotplug scripts go in this directory
+
+# Xen configuration dir and configs to go there.
+XEN_CONFIG_DIR = /etc/xen
+
+# Xen script dir and scripts to go there.
+XEN_SCRIPT_DIR = /etc/xen/scripts
+XEN_SCRIPTS =
+XEN_SCRIPT_DATA =
+
+.PHONY: all
+all:
+
+.PHONY: build
+build:
+
+.PHONY: install
+install: all install-scripts
+
+.PHONY: install-scripts
+install-scripts:
+ [ -d $(DESTDIR)$(XEN_SCRIPT_DIR) ] || \
+ $(INSTALL_DIR) $(DESTDIR)$(XEN_SCRIPT_DIR)
+ set -e; for i in $(XEN_SCRIPTS); \
+ do \
+ $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
+ done
+ set -e; for i in $(XEN_SCRIPT_DATA); \
+ do \
+ $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
+ done
+
+.PHONY: clean
+clean:
diff --git a/tools/libxc/xc_cpufeature.h b/tools/libxc/xc_cpufeature.h
index 6cd442cfe6..047a6c9fc7 100644
--- a/tools/libxc/xc_cpufeature.h
+++ b/tools/libxc/xc_cpufeature.h
@@ -83,6 +83,7 @@
#define X86_FEATURE_SSE4_1 (4*32+19) /* Streaming SIMD Extensions 4.1 */
#define X86_FEATURE_SSE4_2 (4*32+20) /* Streaming SIMD Extensions 4.2 */
#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */
+#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running under some hypervisor */
/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
#define X86_FEATURE_XSTORE (5*32+ 2) /* on-CPU RNG present (xstore insn) */
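X86_FEATURE_HYPERVISOR (4*32+31) corresponds to CPUID leaf 1, ECX bit 31, which the xc_cpuid_x86.c hunk below now advertises to both HVM and PV guests. A guest-side sketch of testing the bit (GCC-style inline assembly assumed; not part of the patch):

    #include <stdint.h>

    static int running_on_hypervisor(void)
    {
        uint32_t eax, ebx, ecx, edx;

        /* CPUID leaf 1: standard feature flags; ECX bit 31 is reserved
         * on bare metal and set when running under a hypervisor. */
        asm volatile ( "cpuid"
                       : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                       : "a" (1) );
        return (ecx >> 31) & 1;
    }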
diff --git a/tools/libxc/xc_cpuid_x86.c b/tools/libxc/xc_cpuid_x86.c
index 6a8e7594c8..d75fe2b013 100644
--- a/tools/libxc/xc_cpuid_x86.c
+++ b/tools/libxc/xc_cpuid_x86.c
@@ -194,6 +194,8 @@ static void xc_cpuid_hvm_policy(
bitmaskof(X86_FEATURE_SSE4_2) |
bitmaskof(X86_FEATURE_POPCNT));
+ regs[2] |= bitmaskof(X86_FEATURE_HYPERVISOR);
+
regs[3] &= (bitmaskof(X86_FEATURE_FPU) |
bitmaskof(X86_FEATURE_VME) |
bitmaskof(X86_FEATURE_DE) |
@@ -309,6 +311,7 @@ static void xc_cpuid_pv_policy(
clear_bit(X86_FEATURE_XTPR, regs[2]);
clear_bit(X86_FEATURE_PDCM, regs[2]);
clear_bit(X86_FEATURE_DCA, regs[2]);
+ set_bit(X86_FEATURE_HYPERVISOR, regs[2]);
break;
case 0x80000001:
if ( !guest_64bit )
diff --git a/tools/misc/xenpm.c b/tools/misc/xenpm.c
index 618aa27a84..ace72b4216 100644
--- a/tools/misc/xenpm.c
+++ b/tools/misc/xenpm.c
@@ -170,7 +170,7 @@ int main(int argc, char **argv)
if ( !pxstat->pt )
{
fprintf(stderr, "failed to malloc for P-states table\n");
- free(pxstat->pt);
+ free(pxstat->trans_pt);
break;
}
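The one-line fix above corrects an error path: when the second allocation fails, the cleanup must free the allocation that succeeded earlier (trans_pt), not the pointer that is known to be NULL. A minimal sketch of the intended pattern (variable and type names are assumptions based on the hunk, not copied from xenpm.c):

    pxstat->trans_pt = malloc(max_px * max_px * sizeof(uint64_t));
    pxstat->pt = malloc(max_px * sizeof(*pxstat->pt));
    if ( !pxstat->pt )
    {
        fprintf(stderr, "failed to malloc for P-states table\n");
        free(pxstat->trans_pt);   /* release the earlier allocation */
        break;
    }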
diff --git a/tools/python/xen/util/rwlock.py b/tools/python/xen/util/rwlock.py
new file mode 100644
index 0000000000..e79a82f8e8
--- /dev/null
+++ b/tools/python/xen/util/rwlock.py
@@ -0,0 +1,137 @@
+""" Reader-writer lock implementation based on a condition variable """
+
+#============================================================================
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#============================================================================
+# Copyright (C) 2008 International Business Machines Corp.
+# Author: Stefan Berger <stefanb@us.ibm.com>
+#============================================================================
+
+from threading import Condition
+
+class RWLock:
+
+ RWLOCK_STATE_WRITER = -1
+ RWLOCK_STATE_UNUSED = 0
+
+ def __init__(self):
+ self.__condition = Condition()
+ self.__state = RWLock.RWLOCK_STATE_UNUSED
+ self.__blocked_writers = 0
+
+ def acquire_reader(self):
+ self.__condition.acquire()
+ while True:
+ if self.__state == RWLock.RWLOCK_STATE_WRITER:
+ self.__condition.wait()
+ else:
+ break
+ self.__state += 1
+ self.__condition.release()
+
+ def acquire_writer(self):
+ self.__condition.acquire()
+ self.__acquire_writer(RWLock.RWLOCK_STATE_UNUSED)
+ self.__condition.release()
+
+ def __acquire_writer(self, wait_for_state):
+ while True:
+ if self.__state == wait_for_state:
+ self.__state = RWLock.RWLOCK_STATE_WRITER
+ break
+ else:
+ self.__blocked_writers += 1
+ self.__condition.wait()
+ self.__blocked_writers -= 1
+
+ def release(self):
+ self.__condition.acquire()
+ if self.__state == RWLock.RWLOCK_STATE_WRITER:
+ self.__state = RWLock.RWLOCK_STATE_UNUSED
+ elif self.__state == RWLock.RWLOCK_STATE_UNUSED:
+ assert False, 'Lock not in use.'
+ else:
+ self.__state -= 1
+ self.__condition.notifyAll()
+ self.__condition.release()
+
+
+if __name__ == '__main__':
+ from threading import Thread
+ from time import sleep
+
+ rwlock = RWLock()
+
+ class Base(Thread):
+ def __init__(self, name, timeout):
+ self.name = name
+ self.timeout = timeout
+ Thread.__init__(self)
+
+ class Reader(Base):
+ def __init__(self, name = 'Reader', timeout = 10):
+ Base.__init__(self, name, timeout)
+
+ def run(self):
+ print '%s begin' % self.name
+ rwlock.acquire_reader()
+ print '%s acquired' % self.name
+ sleep(self.timeout)
+ rwlock.release()
+ print '%s end' % self.name
+
+ class ReaderTwice(Base):
+ def __init__(self, name = 'Reader', timeout = 10):
+ Base.__init__(self, name, timeout)
+
+ def run(self):
+ print '%s begin' % self.name
+ rwlock.acquire_reader()
+ print '%s acquired once' % self.name
+ sleep(self.timeout)
+ rwlock.acquire_reader()
+ print '%s acquired twice' % self.name
+ sleep(self.timeout)
+ rwlock.release()
+ rwlock.release()
+ print '%s end' % self.name
+
+ class Writer(Base):
+ def __init__(self, name = 'Writer', timeout = 10):
+ Base.__init__(self, name, timeout)
+
+ def run(self):
+ print '%s begin' % self.name
+ rwlock.acquire_writer()
+ print '%s acquired' % self.name
+ sleep(self.timeout)
+ rwlock.release()
+ print '%s end' % self.name
+
+ def run_test(threadlist, msg):
+ print msg
+ for t in threadlist:
+ t.start()
+ sleep(1)
+ for t in threadlist:
+ t.join()
+ print 'Done\n\n'
+
+ threads = []
+ threads.append( Reader('R1', 4) )
+ threads.append( Reader('R2', 4) )
+ threads.append( Writer('W1', 4) )
+ threads.append( Reader('R3', 4) )
+ run_test(threads,
+ 'Test: readers may bypass blocked writers')
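For reference, the canonical caller pattern for this lock, as the XendDomain.py hunk below adopts it around XendCheckpoint.restore(), is acquire / try / release-in-finally (a sketch, not part of the patch):

    from xen.util.rwlock import RWLock

    policy_lock = RWLock()

    def read_side_operation():
        policy_lock.acquire_reader()
        try:
            pass  # any number of readers may run here concurrently
        finally:
            policy_lock.release()  # also wakes any blocked writer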
diff --git a/tools/python/xen/xend/XendAPI.py b/tools/python/xen/xend/XendAPI.py
index 42e131be37..71aac01f90 100644
--- a/tools/python/xen/xend/XendAPI.py
+++ b/tools/python/xen/xend/XendAPI.py
@@ -431,7 +431,7 @@ def valid_object(class_name):
lambda *args, **kwargs: \
_check_ref(lambda r: \
XendAPIStore.get(r, class_name) is not None,
- 'PIF', func, *args, **kwargs)
+ class_name, func, *args, **kwargs)
# -----------------------------
# Bridge to Legacy XM API calls
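The one-word fix above matters more than it looks: valid_object() builds a reference-validation decorator for each XendAPI class, but previously reported every failure as class 'PIF'. Roughly (a sketch reconstructed from the hunk, not the full source):

    def valid_object(class_name):
        """Decorator: reject calls whose ref is not a known class_name object."""
        return lambda func: \
               lambda *args, **kwargs: \
                   _check_ref(lambda r:
                                  XendAPIStore.get(r, class_name) is not None,
                              class_name, func, *args, **kwargs)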
diff --git a/tools/python/xen/xend/XendConfig.py b/tools/python/xen/xend/XendConfig.py
index eb868e7b29..0773fa5e57 100644
--- a/tools/python/xen/xend/XendConfig.py
+++ b/tools/python/xen/xend/XendConfig.py
@@ -1032,8 +1032,6 @@ class XendConfig(dict):
sxpr.append([name, s])
for xenapi, legacy in XENAPI_CFG_TO_LEGACY_CFG.items():
- if legacy in ('cpus'): # skip this
- continue
if self.has_key(xenapi) and self[xenapi] not in (None, []):
if type(self[xenapi]) == bool:
# convert booleans to ints before making an sxp item
diff --git a/tools/python/xen/xend/XendDomain.py b/tools/python/xen/xend/XendDomain.py
index 9faebe95aa..d5195c421e 100644
--- a/tools/python/xen/xend/XendDomain.py
+++ b/tools/python/xen/xend/XendDomain.py
@@ -50,7 +50,7 @@ from xen.xend.XendAPIConstants import *
from xen.xend.xenstore.xstransact import xstransact
from xen.xend.xenstore.xswatch import xswatch
-from xen.util import mkdir
+from xen.util import mkdir, rwlock
from xen.xend import uuid
xc = xen.lowlevel.xc.xc()
@@ -93,6 +93,8 @@ class XendDomain:
self.managed_domains = {}
self.domains_lock = threading.RLock()
+ self.policy_lock = rwlock.RWLock()
+
# xen api instance vars
# TODO: nothing uses this at the moment
self._allow_new_domains = True
@@ -1139,16 +1141,21 @@ class XendDomain:
"""
try:
- return XendCheckpoint.restore(self, fd, paused=paused, relocating=relocating)
- except XendError, e:
- log.exception("Restore failed")
- raise
- except:
- # I don't really want to log this exception here, but the error
- # handling in the relocation-socket handling code (relocate.py) is
- # poor, so we need to log this for debugging.
- log.exception("Restore failed")
- raise XendError("Restore failed")
+ self.policy_lock.acquire_reader()
+
+ try:
+ return XendCheckpoint.restore(self, fd, paused=paused, relocating=relocating)
+ except XendError, e:
+ log.exception("Restore failed")
+ raise
+ except:
+ # I don't really want to log this exception here, but the error
+ # handling in the relocation-socket handling code (relocate.py) is
+ # poor, so we need to log this for debugging.
+ log.exception("Restore failed")
+ raise XendError("Restore failed")
+ finally:
+ self.policy_lock.release()
def domain_unpause(self, domid):
"""Unpause domain execution.
diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py
index d0ade8c858..bab9d3aef2 100644
--- a/tools/python/xen/xend/XendDomainInfo.py
+++ b/tools/python/xen/xend/XendDomainInfo.py
@@ -479,6 +479,14 @@ class XendDomainInfo:
if state in (DOM_STATE_SUSPENDED, DOM_STATE_HALTED):
try:
self._constructDomain()
+
+ try:
+ self._setCPUAffinity()
+ except:
+ # usually a CPU we want to set affinity to does not exist
+ # we just ignore it so that the domain can still be restored
+ log.warn("Cannot restore CPU affinity")
+
self._storeVmDetails()
self._createChannels()
self._createDevices()
@@ -2166,6 +2174,64 @@ class XendDomainInfo:
raise XendError(str(exn))
+ def _setCPUAffinity(self):
+ """ Repin domain vcpus if a restricted cpus list is provided
+ """
+
+ def has_cpus():
+ if self.info['cpus'] is not None:
+ for c in self.info['cpus']:
+ if c:
+ return True
+ return False
+
+ if has_cpus():
+ for v in range(0, self.info['VCPUs_max']):
+ if self.info['cpus'][v]:
+ xc.vcpu_setaffinity(self.domid, v, self.info['cpus'][v])
+ else:
+ def find_relaxed_node(node_list):
+ import sys
+ nr_nodes = info['nr_nodes']
+ if node_list is None:
+ node_list = range(0, nr_nodes)
+ nodeload = [0]
+ nodeload = nodeload * nr_nodes
+ from xen.xend import XendDomain
+ doms = XendDomain.instance().list('all')
+ for dom in filter (lambda d: d.domid != self.domid, doms):
+ cpuinfo = dom.getVCPUInfo()
+ for vcpu in sxp.children(cpuinfo, 'vcpu'):
+ if sxp.child_value(vcpu, 'online') == 0: continue
+ cpumap = list(sxp.child_value(vcpu,'cpumap'))
+ for i in range(0, nr_nodes):
+ node_cpumask = info['node_to_cpu'][i]
+ for j in node_cpumask:
+ if j in cpumap:
+ nodeload[i] += 1
+ break
+ for i in range(0, nr_nodes):
+ if len(info['node_to_cpu'][i]) > 0 and i in node_list:
+ nodeload[i] = int(nodeload[i] * 16 / len(info['node_to_cpu'][i]))
+ else:
+ nodeload[i] = sys.maxint
+ index = nodeload.index( min(nodeload) )
+ return index
+
+ info = xc.physinfo()
+ if info['nr_nodes'] > 1:
+ node_memory_list = info['node_to_memory']
+ needmem = self.image.getRequiredAvailableMemory(self.info['memory_dynamic_max']) / 1024
+ candidate_node_list = []
+ for i in range(0, info['nr_nodes']):
+ if node_memory_list[i] >= needmem and len(info['node_to_cpu'][i]) > 0:
+ candidate_node_list.append(i)
+ index = find_relaxed_node(candidate_node_list)
+ cpumask = info['node_to_cpu'][index]
+ for v in range(0, self.info['VCPUs_max']):
+ xc.vcpu_setaffinity(self.domid, v, cpumask)
+
+
def _initDomain(self):
log.debug('XendDomainInfo.initDomain: %s %s',
self.domid,
@@ -2185,58 +2251,7 @@ class XendDomainInfo:
# repin domain vcpus if a restricted cpus list is provided
# this is done prior to memory allocation to aid in memory
# distribution for NUMA systems.
- def has_cpus():
- if self.info['cpus'] is not None:
- for c in self.info['cpus']:
- if c:
- return True
- return False
-
- if has_cpus():
- for v in range(0, self.info['VCPUs_max']):
- if self.info['cpus'][v]:
- xc.vcpu_setaffinity(self.domid, v, self.info['cpus'][v])
- else:
- def find_relaxed_node(node_list):
- import sys
- nr_nodes = info['nr_nodes']
- if node_list is None:
- node_list = range(0, nr_nodes)
- nodeload = [0]
- nodeload = nodeload * nr_nodes
- from xen.xend import XendDomain
- doms = XendDomain.instance().list('all')
- for dom in filter (lambda d: d.domid != self.domid, doms):
- cpuinfo = dom.getVCPUInfo()
- for vcpu in sxp.children(cpuinfo, 'vcpu'):
- if sxp.child_value(vcpu, 'online') == 0: continue
- cpumap = list(sxp.child_value(vcpu,'cpumap'))
- for i in range(0, nr_nodes):
- node_cpumask = info['node_to_cpu'][i]
- for j in node_cpumask:
- if j in cpumap:
- nodeload[i] += 1
- break
- for i in range(0, nr_nodes):
- if len(info['node_to_cpu'][i]) > 0 and i in node_list:
- nodeload[i] = int(nodeload[i] * 16 / len(info['node_to_cpu'][i]))
- else:
- nodeload[i] = sys.maxint
- index = nodeload.index( min(nodeload) )
- return index
-
- info = xc.physinfo()
- if info['nr_nodes'] > 1:
- node_memory_list = info['node_to_memory']
- needmem = self.image.getRequiredAvailableMemory(self.info['memory_dynamic_max']) / 1024
- candidate_node_list = []
- for i in range(0, info['nr_nodes']):
- if node_memory_list[i] >= needmem and len(info['node_to_cpu'][i]) > 0:
- candidate_node_list.append(i)
- index = find_relaxed_node(candidate_node_list)
- cpumask = info['node_to_cpu'][index]
- for v in range(0, self.info['VCPUs_max']):
- xc.vcpu_setaffinity(self.domid, v, cpumask)
+ self._setCPUAffinity()
# Use architecture- and image-specific calculations to determine
# the various headrooms necessary, given the raw configured
@@ -3011,64 +3026,69 @@ class XendDomainInfo:
if not xspol:
xspol = poladmin.get_policy_by_name(policy)
- if state in [ DOM_STATE_RUNNING, DOM_STATE_PAUSED ]:
- #if domain is running or paused try to relabel in hypervisor
- if not xspol:
- return (-xsconstants.XSERR_POLICY_NOT_LOADED, "", "", 0)
-
- if typ != xspol.get_type_name() or \
- policy != xspol.get_name():
- return (-xsconstants.XSERR_BAD_LABEL, "", "", 0)
+ try:
+ xen.xend.XendDomain.instance().policy_lock.acquire_writer()
- if typ == xsconstants.ACM_POLICY_ID:
- new_ssidref = xspol.vmlabel_to_ssidref(label)
- if new_ssidref == xsconstants.INVALID_SSIDREF:
- return (-xsconstants.XSERR_BAD_LABEL, "", "", 0)
+ if state in [ DOM_STATE_RUNNING, DOM_STATE_PAUSED ]:
+ #if domain is running or paused try to relabel in hypervisor
+ if not xspol:
+ return (-xsconstants.XSERR_POLICY_NOT_LOADED, "", "", 0)
- # Check that all used resources are accessible under the
- # new label
- if not is_policy_update and \
- not security.resources_compatible_with_vmlabel(xspol,
- self, label):
+ if typ != xspol.get_type_name() or \
+ policy != xspol.get_name():
return (-xsconstants.XSERR_BAD_LABEL, "", "", 0)
- #Check label against expected one. Can only do this
- # if the policy hasn't changed underneath in the meantime
- if xspol_old == None:
- old_label = self.get_security_label()
- if old_label != old_seclab:
- log.info("old_label != old_seclab: %s != %s" %
- (old_label, old_seclab))
+ if typ == xsconstants.ACM_POLICY_ID:
+ new_ssidref = xspol.vmlabel_to_ssidref(label)
+ if new_ssidref == xsconstants.INVALID_SSIDREF:
return (-xsconstants.XSERR_BAD_LABEL, "", "", 0)
- # relabel domain in the hypervisor
- rc, errors = security.relabel_domains([[domid, new_ssidref]])
- log.info("rc from relabeling in HV: %d" % rc)
- else:
- return (-xsconstants.XSERR_POLICY_TYPE_UNSUPPORTED, "", "", 0)
+ # Check that all used resources are accessible under the
+ # new label
+ if not is_policy_update and \
+ not security.resources_compatible_with_vmlabel(xspol,
+ self, label):
+ return (-xsconstants.XSERR_BAD_LABEL, "", "", 0)
- if rc == 0:
- # HALTED, RUNNING or PAUSED
- if domid == 0:
- if xspol:
- self.info['security_label'] = seclab
- ssidref = poladmin.set_domain0_bootlabel(xspol, label)
+ #Check label against expected one. Can only do this
+ # if the policy hasn't changed underneath in the meantime
+ if xspol_old == None:
+ old_label = self.get_security_label()
+ if old_label != old_seclab:
+ log.info("old_label != old_seclab: %s != %s" %
+ (old_label, old_seclab))
+ return (-xsconstants.XSERR_BAD_LABEL, "", "", 0)
+
+ # relabel domain in the hypervisor
+ rc, errors = security.relabel_domains([[domid, new_ssidref]])
+ log.info("rc from relabeling in HV: %d" % rc)
else:
- return (-xsconstants.XSERR_POLICY_NOT_LOADED, "", "", 0)
- else:
- if self.info.has_key('security_label'):
- old_label = self.info['security_label']
- # Check label against expected one, unless wildcard
- if old_label != old_seclab:
- return (-xsconstants.XSERR_BAD_LABEL, "", "", 0)
+ return (-xsconstants.XSERR_POLICY_TYPE_UNSUPPORTED, "", "", 0)
+
+ if rc == 0:
+ # HALTED, RUNNING or PAUSED
+ if domid == 0:
+ if xspol:
+ self.info['security_label'] = seclab
+ ssidref = poladmin.set_domain0_bootlabel(xspol, label)
+ else:
+ return (-xsconstants.XSERR_POLICY_NOT_LOADED, "", "", 0)
+ else:
+ if self.info.has_key('security_label'):
+ old_label = self.info['security_label']
+ # Check label against expected one, unless wildcard
+ if old_label != old_seclab:
+ return (-xsconstants.XSERR_BAD_LABEL, "", "", 0)
- self.info['security_label'] = seclab
+ self.info['security_label'] = seclab
- try:
- xen.xend.XendDomain.instance().managed_config_save(self)
- except:
- pass
- return (rc, errors, old_label, new_ssidref)
+ try:
+ xen.xend.XendDomain.instance().managed_config_save(self)
+ except:
+ pass
+ return (rc, errors, old_label, new_ssidref)
+ finally:
+ xen.xend.XendDomain.instance().policy_lock.release()
def get_on_shutdown(self):
after_shutdown = self.info.get('actions_after_shutdown')
diff --git a/tools/python/xen/xend/osdep.py b/tools/python/xen/xend/osdep.py
index a026c85277..6636797b44 100644
--- a/tools/python/xen/xend/osdep.py
+++ b/tools/python/xen/xend/osdep.py
@@ -38,7 +38,10 @@ _vif_script = {
"SunOS": "vif-vnic"
}
-def _linux_balloon_stat(label):
+PROC_XEN_BALLOON = '/proc/xen/balloon'
+SYSFS_XEN_MEMORY = '/sys/devices/system/xen_memory/xen_memory0'
+
+def _linux_balloon_stat_proc(label):
"""Returns the value for the named label, or None if an error occurs."""
xend2linux_labels = { 'current' : 'Current allocation',
@@ -47,7 +50,6 @@ def _linux_balloon_stat(label):
'high-balloon' : 'High-mem balloon',
'limit' : 'Xen hard limit' }
- PROC_XEN_BALLOON = '/proc/xen/balloon'
f = file(PROC_XEN_BALLOON, 'r')
try:
for line in f:
@@ -62,6 +64,29 @@ def _linux_balloon_stat(label):
finally:
f.close()
+def _linux_balloon_stat_sysfs(label):
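+ """Returns the value for the named label, or None if an error occurs."""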
+ sysfiles = { 'target' : 'target_kb',
+ 'current' : 'info/current_kb',
+ 'low-balloon' : 'info/low_kb',
+ 'high-balloon' : 'info/high_kb',
+ 'limit' : 'info/hard_limit_kb' }
+
+ name = os.path.join(SYSFS_XEN_MEMORY, sysfiles[label])
+ f = file(name, 'r')
+ try:
+ val = f.read().strip()
+ finally:
+ f.close()
+ if val.isdigit():
+ return int(val)
+ return None
+
+def _linux_balloon_stat(label):
+ if os.access(PROC_XEN_BALLOON, os.F_OK):
+ return _linux_balloon_stat_proc(label)
+ elif os.access(SYSFS_XEN_MEMORY, os.F_OK):
+ return _linux_balloon_stat_sysfs(label)
+
+ return None
+
def _solaris_balloon_stat(label):
"""Returns the value for the named label, or None if an error occurs."""
diff --git a/tools/python/xen/xend/server/pciif.py b/tools/python/xen/xend/server/pciif.py
index d8df297f80..1051450a08 100644
--- a/tools/python/xen/xend/server/pciif.py
+++ b/tools/python/xen/xend/server/pciif.py
@@ -35,6 +35,8 @@ import resource
import re
from xen.xend.server.pciquirk import *
+from xen.xend.xenstore.xstransact import xstransact
+from xen.xend.xenstore.xswatch import xswatch
xc = xen.lowlevel.xc.xc()
@@ -58,6 +60,7 @@ def parse_hex(val):
class PciController(DevController):
def __init__(self, vm):
+ self.aerStateWatch = None
DevController.__init__(self, vm)
@@ -333,12 +336,6 @@ class PciController(DevController):
if rc<0:
raise VmError(('pci: failed to configure I/O memory on device '+
'%s - errno=%d')%(dev.name,rc))
- rc = xc.physdev_map_pirq(domid = fe_domid,
- index = dev.irq,
- pirq = dev.irq)
- if rc < 0:
- raise VmError(('pci: failed to map irq on device '+
- '%s - errno=%d')%(dev.name,rc))
if dev.msix:
for (start, size) in dev.msix_iomem:
@@ -353,6 +350,12 @@ class PciController(DevController):
if rc<0:
raise VmError(('pci: failed to remove msi-x iomem'))
+ rc = xc.physdev_map_pirq(domid = fe_domid,
+ index = dev.irq,
+ pirq = dev.irq)
+ if rc < 0:
+ raise VmError(('pci: failed to map irq on device '+
+ '%s - errno=%d')%(dev.name,rc))
if dev.irq>0:
log.debug('pci: enabling irq %d'%dev.irq)
rc = xc.domain_irq_permission(domid = fe_domid, pirq = dev.irq,
@@ -431,9 +434,23 @@ class PciController(DevController):
for (domain, bus, slot, func) in pci_dev_list:
self.setupOneDevice(domain, bus, slot, func)
-
+ wPath = '/local/domain/0/backend/pci/%u/0/aerState' % (self.getDomid())
+ self.aerStateWatch = xswatch(wPath, self._handleAerStateWatch)
+ log.debug('pci: register aer watch %s', wPath)
return
+ def _handleAerStateWatch(self, _):
+ log.debug('PciController._handleAerStateWatch')
+ if self.getDomid() == 0:
+ raise XendError('Domain 0 cannot be shut down')
+ readPath = '/local/domain/0/backend/pci/%u/0/aerState' % (self.getDomid())
+ action = xstransact.Read(readPath)
+ if action == 'aerfail':
+ log.debug('shutting down domain because of an AER handling failure')
+ self.vm.shutdown('poweroff')
+ return True
+
+
def cleanupOneDevice(self, domain, bus, slot, func):
""" Detach I/O resources for device from frontend domain
"""
@@ -545,6 +562,22 @@ class PciController(DevController):
return new_num_devs
+ def destroyDevice(self, devid, force):
+ DevController.destroyDevice(self, devid, True)
+ log.debug('pci: unregister aer watch')
+ self.unwatchAerState()
+
+ def unwatchAerState(self):
+ """Remove the watch on the domain's aerState node, if any."""
+ try:
+ try:
+ if self.aerStateWatch:
+ self.aerStateWatch.unwatch()
+ finally:
+ self.aerStateWatch = None
+ except:
+ log.exception("Unwatching aerState failed.")
+
def waitForBackend(self,devid):
return (0, "ok - no hotplug")
diff --git a/tools/python/xen/xm/create.py b/tools/python/xen/xm/create.py
index eb8f3e237c..3620e4968d 100644
--- a/tools/python/xen/xm/create.py
+++ b/tools/python/xen/xm/create.py
@@ -1202,8 +1202,9 @@ def make_domain(opts, config):
except:
server.xend.domain.destroy(dom)
err("Failed to unpause domain %s" % dom)
- opts.info("Started domain %s" % (dom))
- return int(sxp.child_value(dominfo, 'domid'))
+ domid = int(sxp.child_value(dominfo, 'domid'))
+ opts.info("Started domain %s (id=%d)" % (dom, domid))
+ return domid
def get_xauthority():
diff --git a/unmodified_drivers/linux-2.6/balloon/Kbuild b/unmodified_drivers/linux-2.6/balloon/Kbuild
index bcc8b05207..316592d83a 100644
--- a/unmodified_drivers/linux-2.6/balloon/Kbuild
+++ b/unmodified_drivers/linux-2.6/balloon/Kbuild
@@ -4,6 +4,5 @@ obj-m = xen-balloon.o
EXTRA_CFLAGS += -I$(M)/platform-pci
-xen-balloon-objs =
-xen-balloon-objs += balloon.o
-xen-balloon-objs += sysfs.o
+xen-balloon-y := balloon.o sysfs.o
+xen-balloon-$(CONFIG_XEN_SCRUB_PAGES) += scrub.o
diff --git a/unmodified_drivers/linux-2.6/mkbuildtree b/unmodified_drivers/linux-2.6/mkbuildtree
index 9d0f04907e..3c1c799c85 100755
--- a/unmodified_drivers/linux-2.6/mkbuildtree
+++ b/unmodified_drivers/linux-2.6/mkbuildtree
@@ -53,6 +53,7 @@ i[34567]86|x86_64)
ln -sf ${XL}/include/asm-x86/mach-xen/asm/synch_bitops*.h include/asm
ln -sf ${XL}/include/asm-x86/mach-xen/asm/maddr*.h include/asm
ln -sf ${XL}/include/asm-x86/mach-xen/asm/gnttab_dma.h include/asm
+ ln -sf ${XL}/arch/x86/lib/scrub.c balloon
else
if [ $uname = x86_64 ]; then
mkdir -p include/asm-i386
diff --git a/unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c b/unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c
index 08bf645ef7..ad667128a2 100644
--- a/unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c
+++ b/unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c
@@ -34,7 +34,11 @@ static void ap_suspend(void *_info)
atomic_dec(&info->nr_spinning);
}
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
#define initiate_ap_suspend(i) smp_call_function(ap_suspend, i, 0, 0)
+#else
+#define initiate_ap_suspend(i) smp_call_function(ap_suspend, i, 0)
+#endif
#else /* !defined(CONFIG_SMP) */
diff --git a/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c b/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c
index 2b35c5c757..e4a766a909 100644
--- a/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c
+++ b/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c
@@ -14,7 +14,11 @@ EXPORT_SYMBOL(system_state);
void ctrl_alt_del(void)
{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
kill_proc(1, SIGINT, 1); /* interrupt init */
+#else
+ kill_cad_pid(SIGINT, 1);
+#endif
}
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8)
diff --git a/xen/Rules.mk b/xen/Rules.mk
index 36292c92e5..43a4f63249 100644
--- a/xen/Rules.mk
+++ b/xen/Rules.mk
@@ -69,6 +69,9 @@ CFLAGS-$(frame_pointer) += -fno-omit-frame-pointer -DCONFIG_FRAME_POINTER
ifneq ($(max_phys_cpus),)
CFLAGS-y += -DMAX_PHYS_CPUS=$(max_phys_cpus)
endif
+ifneq ($(max_phys_irqs),)
+CFLAGS-y += -DMAX_PHYS_IRQS=$(max_phys_irqs)
+endif
AFLAGS-y += -D__ASSEMBLY__
diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
index eb9dd08f47..60ec345073 100644
--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -11,6 +11,7 @@ subdir-$(x86_64) += x86_64
obj-y += apic.o
obj-y += bitops.o
obj-y += clear_page.o
+obj-y += copy_page.o
obj-y += compat.o
obj-y += delay.o
obj-y += dmi_scan.o
diff --git a/xen/arch/x86/acpi/boot.c b/xen/arch/x86/acpi/boot.c
index 055e3d7b51..2ae8cc4157 100644
--- a/xen/arch/x86/acpi/boot.c
+++ b/xen/arch/x86/acpi/boot.c
@@ -601,7 +601,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
count =
acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr,
- NR_IRQ_VECTORS);
+ MAX_IRQ_SOURCES);
if (count < 0) {
printk(KERN_ERR PREFIX
"Error parsing interrupt source overrides entry\n");
@@ -623,7 +623,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
count =
acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src,
- NR_IRQ_VECTORS);
+ MAX_IRQ_SOURCES);
if (count < 0) {
printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
/* TBD: Cleanup to allow fallback to MPS */
diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c
index 7e96bfc796..ccad2ea5ee 100644
--- a/xen/arch/x86/acpi/power.c
+++ b/xen/arch/x86/acpi/power.c
@@ -30,6 +30,8 @@
#include <acpi/cpufreq/cpufreq.h>
+uint32_t system_reset_counter = 1;
+
static char opt_acpi_sleep[20];
string_param("acpi_sleep", opt_acpi_sleep);
@@ -75,19 +77,47 @@ static void device_power_up(void)
static void freeze_domains(void)
{
struct domain *d;
+ struct vcpu *v;
+ rcu_read_lock(&domlist_read_lock);
for_each_domain ( d )
- if ( d->domain_id != 0 )
+ {
+ switch ( d->domain_id )
+ {
+ case 0:
+ for_each_vcpu ( d, v )
+ if ( v != current )
+ vcpu_pause(v);
+ break;
+ default:
domain_pause(d);
+ break;
+ }
+ }
+ rcu_read_unlock(&domlist_read_lock);
}
static void thaw_domains(void)
{
struct domain *d;
+ struct vcpu *v;
+ rcu_read_lock(&domlist_read_lock);
for_each_domain ( d )
- if ( d->domain_id != 0 )
+ {
+ switch ( d->domain_id )
+ {
+ case 0:
+ for_each_vcpu ( d, v )
+ if ( v != current )
+ vcpu_unpause(v);
+ break;
+ default:
domain_unpause(d);
+ break;
+ }
+ }
+ rcu_read_unlock(&domlist_read_lock);
}
static void acpi_sleep_prepare(u32 state)
@@ -163,6 +193,7 @@ static int enter_state(u32 state)
{
case ACPI_STATE_S3:
do_suspend_lowlevel();
+ system_reset_counter++;
break;
case ACPI_STATE_S5:
acpi_enter_sleep_state(ACPI_STATE_S5);
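The new system_reset_counter gives once-per-boot initialisation a way to notice an S3 resume: a consumer latches the counter and reruns its setup whenever the two diverge, which is exactly the pattern the hpet.c hunk below adopts. A minimal sketch of the idiom (foo_setup() is hypothetical):

    /* Hedged sketch of the reset-latch idiom; the consumer is made up. */
    extern uint32_t system_reset_counter;

    void foo_setup(void)
    {
        static uint32_t latch;

        if ( latch == system_reset_counter )
            return;              /* nothing to redo since the last resume */
        latch = system_reset_counter;
        /* ... reprogram hardware state lost across S3 ... */
    }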
diff --git a/xen/arch/x86/copy_page.S b/xen/arch/x86/copy_page.S
new file mode 100644
index 0000000000..2fd3e533c6
--- /dev/null
+++ b/xen/arch/x86/copy_page.S
@@ -0,0 +1,66 @@
+#include <xen/config.h>
+#include <asm/page.h>
+
+#ifdef __i386__
+#define src_reg %esi
+#define dst_reg %edi
+#define WORD_SIZE 4
+#define tmp1_reg %eax
+#define tmp2_reg %edx
+#define tmp3_reg %ebx
+#define tmp4_reg %ebp
+#else
+#define src_reg %rsi
+#define dst_reg %rdi
+#define WORD_SIZE 8
+#define tmp1_reg %r8
+#define tmp2_reg %r9
+#define tmp3_reg %r10
+#define tmp4_reg %r11
+#endif
+
+ENTRY(copy_page_sse2)
+#ifdef __i386__
+ push %ebx
+ push %ebp
+ push %esi
+ push %edi
+ mov 6*4(%esp), src_reg
+ mov 5*4(%esp), dst_reg
+#endif
+ mov $PAGE_SIZE/(4*WORD_SIZE)-3, %ecx
+
+ prefetchnta 2*4*WORD_SIZE(src_reg)
+ mov (src_reg), tmp1_reg
+ mov WORD_SIZE(src_reg), tmp2_reg
+ mov 2*WORD_SIZE(src_reg), tmp3_reg
+ mov 3*WORD_SIZE(src_reg), tmp4_reg
+
+0: prefetchnta 3*4*WORD_SIZE(src_reg)
+1: add $4*WORD_SIZE, src_reg
+ movnti tmp1_reg, (dst_reg)
+ mov (src_reg), tmp1_reg
+ dec %ecx
+ movnti tmp2_reg, WORD_SIZE(dst_reg)
+ mov WORD_SIZE(src_reg), tmp2_reg
+ movnti tmp3_reg, 2*WORD_SIZE(dst_reg)
+ mov 2*WORD_SIZE(src_reg), tmp3_reg
+ movnti tmp4_reg, 3*WORD_SIZE(dst_reg)
+ lea 4*WORD_SIZE(dst_reg), dst_reg
+ mov 3*WORD_SIZE(src_reg), tmp4_reg
+ jg 0b
+ jpe 1b
+
+ movnti tmp1_reg, (dst_reg)
+ movnti tmp2_reg, WORD_SIZE(dst_reg)
+ movnti tmp3_reg, 2*WORD_SIZE(dst_reg)
+ movnti tmp4_reg, 3*WORD_SIZE(dst_reg)
+
+#ifdef __i386__
+ pop %edi
+ pop %esi
+ pop %ebp
+ pop %ebx
+#endif
+ sfence
+ ret
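For reference, the assembly above is a software-pipelined copy: four temporaries stream loads one group ahead of the four movnti stores, dec %ecx feeds both exits (jg keeps the prefetching loop running; jpe, testing the parity flag dec leaves behind, squeezes in the final iterations without prefetch), and the trailing stores drain the pipeline before sfence. A plain-C sketch of the same data movement, minus the non-temporal stores and prefetching that justify the assembly:

    /* Hedged C equivalent of copy_page_sse2's data movement: four
     * machine words per iteration over one page. */
    void copy_page_ref(unsigned long *dst, const unsigned long *src)
    {
        unsigned int i;

        for ( i = 0; i < PAGE_SIZE / sizeof(unsigned long); i += 4 )
        {
            dst[i + 0] = src[i + 0];
            dst[i + 1] = src[i + 1];
            dst[i + 2] = src[i + 2];
            dst[i + 3] = src[i + 3];
        }
    }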
diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
index 42c6dc1210..2fcdee836a 100644
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -564,7 +564,10 @@ void __cpuinit cpu_init(void)
{
int cpu = smp_processor_id();
struct tss_struct *t = &init_tss[cpu];
- char gdt_load[10];
+ struct desc_ptr gdt_desc = {
+ .base = (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY),
+ .limit = LAST_RESERVED_GDT_BYTE
+ };
if (cpu_test_and_set(cpu, cpu_initialized)) {
printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
@@ -578,9 +581,7 @@ void __cpuinit cpu_init(void)
/* Install correct page table. */
write_ptbase(current);
- *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
- *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(current);
- asm volatile ( "lgdt %0" : "=m" (gdt_load) );
+ asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
/* No nested task. */
asm volatile ("pushf ; andw $0xbfff,(%"__OP"sp) ; popf" );
diff --git a/xen/arch/x86/cpu/mcheck/p4.c b/xen/arch/x86/cpu/mcheck/p4.c
index 0b82e65196..65daca06ef 100644
--- a/xen/arch/x86/cpu/mcheck/p4.c
+++ b/xen/arch/x86/cpu/mcheck/p4.c
@@ -51,7 +51,7 @@ static void intel_thermal_interrupt(struct cpu_user_regs *regs)
ack_APIC_irq();
- if (NOW() > next[cpu])
+ if (NOW() < next[cpu])
return;
next[cpu] = NOW() + MILLISECS(5000);
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index b24f04d2bb..6e6e9ebc21 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -184,7 +184,8 @@ static int setup_compat_l4(struct vcpu *v)
/* This page needs to look like a pagetable so that it can be shadowed */
pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
- l4tab = copy_page(page_to_virt(pg), idle_pg_table);
+ l4tab = page_to_virt(pg);
+ copy_page(l4tab, idle_pg_table);
l4tab[0] = l4e_empty();
l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
l4e_from_page(pg, __PAGE_HYPERVISOR);
@@ -310,12 +311,7 @@ int vcpu_initialise(struct vcpu *v)
if ( is_idle_domain(d) )
{
v->arch.schedule_tail = continue_idle_domain;
- if ( v->vcpu_id )
- v->arch.cr3 = d->vcpu[0]->arch.cr3;
- else if ( !*idle_vcpu )
- v->arch.cr3 = __pa(idle_pg_table);
- else if ( !(v->arch.cr3 = clone_idle_pagetable(v)) )
- return -ENOMEM;
+ v->arch.cr3 = __pa(idle_pg_table);
}
v->arch.guest_context.ctrlreg[4] =
@@ -1172,14 +1168,18 @@ static void paravirt_ctxt_switch_to(struct vcpu *v)
}
}
+static inline int need_full_gdt(struct vcpu *v)
+{
+ return (!is_hvm_vcpu(v) && !is_idle_vcpu(v));
+}
+
static void __context_switch(void)
{
struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
- unsigned int i, cpu = smp_processor_id();
+ unsigned int cpu = smp_processor_id();
struct vcpu *p = per_cpu(curr_vcpu, cpu);
struct vcpu *n = current;
struct desc_struct *gdt;
- struct page_info *page;
struct desc_ptr gdt_desc;
ASSERT(p != n);
@@ -1208,16 +1208,19 @@ static void __context_switch(void)
gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
per_cpu(compat_gdt_table, cpu);
- page = virt_to_page(gdt);
- for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+ if ( need_full_gdt(n) )
{
- l1e_write(n->domain->arch.mm_perdomain_pt +
- (n->vcpu_id << GDT_LDT_VCPU_SHIFT) +
- FIRST_RESERVED_GDT_PAGE + i,
- l1e_from_page(page + i, __PAGE_HYPERVISOR));
+ struct page_info *page = virt_to_page(gdt);
+ unsigned int i;
+ for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ )
+ l1e_write(n->domain->arch.mm_perdomain_pt +
+ (n->vcpu_id << GDT_LDT_VCPU_SHIFT) +
+ FIRST_RESERVED_GDT_PAGE + i,
+ l1e_from_page(page + i, __PAGE_HYPERVISOR));
}
- if ( p->vcpu_id != n->vcpu_id )
+ if ( need_full_gdt(p) &&
+ ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(n)) )
{
gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
@@ -1226,8 +1229,10 @@ static void __context_switch(void)
write_ptbase(n);
- if ( p->vcpu_id != n->vcpu_id )
+ if ( need_full_gdt(n) &&
+ ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(p)) )
{
+ gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
gdt_desc.base = GDT_VIRT_START(n);
asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
}
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index e026662b02..1c140c5799 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -194,30 +194,6 @@ static void __init process_dom0_ioports_disable(void)
}
}
-/* We run on dom0's page tables for the final part of the build process. */
-static void dom0_pt_enter(struct vcpu *v)
-{
- struct desc_ptr gdt_desc = {
- .limit = LAST_RESERVED_GDT_BYTE,
- .base = (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY)
- };
-
- asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
- write_ptbase(v);
-}
-
-/* Return to idle domain's page tables. */
-static void dom0_pt_exit(void)
-{
- struct desc_ptr gdt_desc = {
- .limit = LAST_RESERVED_GDT_BYTE,
- .base = GDT_VIRT_START(current)
- };
-
- write_ptbase(current);
- asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
-}
-
int __init construct_dom0(
struct domain *d,
unsigned long _image_start, unsigned long image_len,
@@ -479,8 +455,9 @@ int __init construct_dom0(
/* WARNING: The new domain must have its 'processor' field filled in! */
l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
- memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < L3_PAGETABLE_ENTRIES; i++) {
+ copy_page(l2tab + i * L2_PAGETABLE_ENTRIES,
+ idle_pg_table_l2 + i * L2_PAGETABLE_ENTRIES);
l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
@@ -729,7 +706,8 @@ int __init construct_dom0(
else
update_cr3(v);
- dom0_pt_enter(v);
+ /* We run on dom0's page tables for the final part of the build process. */
+ write_ptbase(v);
/* Copy the OS image and free temporary buffer. */
elf.dest = (void*)vkern_start;
@@ -741,11 +719,11 @@ int __init construct_dom0(
(parms.virt_hypercall >= v_end) )
{
write_ptbase(current);
- local_irq_enable();
printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
return -1;
}
- hypercall_page_initialise(d, (void *)(unsigned long)parms.virt_hypercall);
+ hypercall_page_initialise(
+ d, (void *)(unsigned long)parms.virt_hypercall);
}
/* Copy the initial ramdisk. */
@@ -826,7 +804,8 @@ int __init construct_dom0(
xlat_start_info(si, XLAT_start_info_console_dom0);
#endif
- dom0_pt_exit();
+ /* Return to idle domain's page tables. */
+ write_ptbase(current);
#if defined(__i386__)
/* Destroy low mappings - they were only for our convenience. */
diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index a145583137..7531cfddff 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -1074,11 +1074,24 @@ void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
if ( is_hvm_vcpu(v) )
{
+ struct segment_register sreg;
memset(c.nat->ctrlreg, 0, sizeof(c.nat->ctrlreg));
c.nat->ctrlreg[0] = v->arch.hvm_vcpu.guest_cr[0];
c.nat->ctrlreg[2] = v->arch.hvm_vcpu.guest_cr[2];
c.nat->ctrlreg[3] = v->arch.hvm_vcpu.guest_cr[3];
c.nat->ctrlreg[4] = v->arch.hvm_vcpu.guest_cr[4];
+ hvm_get_segment_register(v, x86_seg_cs, &sreg);
+ c.nat->user_regs.cs = sreg.sel;
+ hvm_get_segment_register(v, x86_seg_ss, &sreg);
+ c.nat->user_regs.ss = sreg.sel;
+ hvm_get_segment_register(v, x86_seg_ds, &sreg);
+ c.nat->user_regs.ds = sreg.sel;
+ hvm_get_segment_register(v, x86_seg_es, &sreg);
+ c.nat->user_regs.es = sreg.sel;
+ hvm_get_segment_register(v, x86_seg_fs, &sreg);
+ c.nat->user_regs.fs = sreg.sel;
+ hvm_get_segment_register(v, x86_seg_gs, &sreg);
+ c.nat->user_regs.gs = sreg.sel;
}
else
{
diff --git a/xen/arch/x86/hpet.c b/xen/arch/x86/hpet.c
index 83ab2f1887..d5e84f4ccd 100644
--- a/xen/arch/x86/hpet.c
+++ b/xen/arch/x86/hpet.c
@@ -265,23 +265,20 @@ int hpet_legacy_irq_tick(void)
u64 hpet_setup(void)
{
static u64 hpet_rate;
- static int initialised;
+ static u32 system_reset_latch;
u32 hpet_id, hpet_period, cfg;
int i;
- if ( initialised )
+ if ( system_reset_latch == system_reset_counter )
return hpet_rate;
- initialised = 1;
-
- if ( hpet_address == 0 )
- return 0;
+ system_reset_latch = system_reset_counter;
set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
hpet_id = hpet_read32(HPET_ID);
- if ( hpet_id == 0 )
+ if ( (hpet_id & HPET_ID_REV) == 0 )
{
- printk("BAD HPET vendor id.\n");
+ printk("BAD HPET revision id.\n");
return 0;
}
@@ -299,9 +296,9 @@ u64 hpet_setup(void)
for ( i = 0; i <= ((hpet_id >> 8) & 31); i++ )
{
- cfg = hpet_read32(HPET_T0_CFG + i*0x20);
+ cfg = hpet_read32(HPET_Tn_CFG(i));
cfg &= ~HPET_TN_ENABLE;
- hpet_write32(cfg & ~HPET_TN_ENABLE, HPET_T0_CFG);
+ hpet_write32(cfg, HPET_Tn_CFG(i));
}
cfg = hpet_read32(HPET_CFG);
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index b87f953af1..9bda106ec2 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1884,6 +1884,25 @@ static long hvm_memory_op(int cmd, XEN_GUEST_HANDLE(void) arg)
return rc;
}
+static long hvm_vcpu_op(
+ int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
+{
+ long rc;
+
+ switch ( cmd )
+ {
+ case VCPUOP_register_runstate_memory_area:
+ case VCPUOP_get_runstate_info:
+ rc = do_vcpu_op(cmd, vcpuid, arg);
+ break;
+ default:
+ rc = -ENOSYS;
+ break;
+ }
+
+ return rc;
+}
+
typedef unsigned long hvm_hypercall_t(
unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
@@ -1895,6 +1914,7 @@ typedef unsigned long hvm_hypercall_t(
static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
[ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
[ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
+ [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op,
HYPERCALL(xen_version),
HYPERCALL(event_channel_op),
HYPERCALL(sched_op),
@@ -1911,9 +1931,29 @@ static long hvm_memory_op_compat32(int cmd, XEN_GUEST_HANDLE(void) arg)
return rc;
}
+static long hvm_vcpu_op_compat32(
+ int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
+{
+ long rc;
+
+ switch ( cmd )
+ {
+ case VCPUOP_register_runstate_memory_area:
+ case VCPUOP_get_runstate_info:
+ rc = compat_vcpu_op(cmd, vcpuid, arg);
+ break;
+ default:
+ rc = -ENOSYS;
+ break;
+ }
+
+ return rc;
+}
+
static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = {
[ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
[ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
+ [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op,
HYPERCALL(xen_version),
HYPERCALL(event_channel_op),
HYPERCALL(sched_op),
@@ -1923,6 +1963,7 @@ static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = {
static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
[ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op_compat32,
[ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
+ [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op_compat32,
HYPERCALL(xen_version),
HYPERCALL(event_channel_op),
HYPERCALL(sched_op),
@@ -2081,7 +2122,7 @@ static int hvmop_set_pci_intx_level(
void hvm_vcpu_reset_state(struct vcpu *v, uint16_t cs, uint16_t ip)
{
- struct domain *d = current->domain;
+ struct domain *d = v->domain;
struct vcpu_guest_context *ctxt;
struct segment_register reg;
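With vcpu_op wired into the HVM hypercall tables, guests can now reach the two whitelisted sub-ops. A hedged guest-side sketch, assuming the standard public vcpu.h definitions and a HYPERVISOR_vcpu_op wrapper such as the Linux PV drivers provide:

    /* Register a runstate area for vcpu 0; the hypervisor then keeps it
     * updated with time spent running/runnable/blocked/offline. */
    static struct vcpu_runstate_info runstate;

    void register_runstate(void)
    {
        struct vcpu_register_runstate_memory_area area;

        area.addr.v = &runstate;
        if ( HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
                                0, &area) )
            ; /* older hypervisor: sub-op not whitelisted for HVM */
    }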
diff --git a/xen/arch/x86/hvm/mtrr.c b/xen/arch/x86/hvm/mtrr.c
index 77f31a6f7f..d6692d2b3d 100644
--- a/xen/arch/x86/hvm/mtrr.c
+++ b/xen/arch/x86/hvm/mtrr.c
@@ -392,12 +392,16 @@ uint32_t get_pat_flags(struct vcpu *v,
*/
if ( pat_entry_value == INVALID_MEM_TYPE )
{
- gdprintk(XENLOG_WARNING,
- "Conflict occurs for a given guest l1e flags:%x "
- "at %"PRIx64" (the effective mm type:%d), "
- "because the host mtrr type is:%d\n",
- gl1e_flags, (uint64_t)gpaddr, guest_eff_mm_type,
- shadow_mtrr_type);
+ struct domain *d = v->domain;
+ p2m_type_t p2mt;
+ gfn_to_mfn(d, paddr_to_pfn(gpaddr), &p2mt);
+ if (p2m_is_ram(p2mt))
+ gdprintk(XENLOG_WARNING,
+ "Conflict occurs for a given guest l1e flags:%x "
+ "at %"PRIx64" (the effective mm type:%d), "
+ "because the host mtrr type is:%d\n",
+ gl1e_flags, (uint64_t)gpaddr, guest_eff_mm_type,
+ shadow_mtrr_type);
pat_entry_value = PAT_TYPE_UNCACHABLE;
}
/* 4. Get the pte flags */
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index c635f5204a..499cd619d4 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -739,6 +739,23 @@ static void svm_inject_exception(
struct vmcb_struct *vmcb = curr->arch.hvm_svm.vmcb;
eventinj_t event = vmcb->eventinj;
+ switch ( trapnr )
+ {
+ case TRAP_debug:
+ if ( guest_cpu_user_regs()->eflags & X86_EFLAGS_TF )
+ {
+ __restore_debug_registers(curr);
+ vmcb->dr6 |= 0x4000;
+ }
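+ /* fall through: #DB also reaches the debugger check below */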
+ case TRAP_int3:
+ if ( curr->domain->debugger_attached )
+ {
+ /* Debug/Int3: Trap to debugger. */
+ domain_pause_for_debugger();
+ return;
+ }
+ }
+
if ( unlikely(event.fields.v) &&
(event.fields.type == X86_EVENTTYPE_HW_EXCEPTION) )
{
@@ -765,13 +782,6 @@ static void svm_inject_exception(
{
HVMTRACE_2D(INJ_EXC, trapnr, errcode);
}
-
- if ( (trapnr == TRAP_debug) &&
- (guest_cpu_user_regs()->eflags & X86_EFLAGS_TF) )
- {
- __restore_debug_registers(curr);
- vmcb->dr6 |= 0x4000;
- }
}
static int svm_event_pending(struct vcpu *v)
diff --git a/xen/arch/x86/hvm/vioapic.c b/xen/arch/x86/hvm/vioapic.c
index 7250de3a7d..7f63699ab2 100644
--- a/xen/arch/x86/hvm/vioapic.c
+++ b/xen/arch/x86/hvm/vioapic.c
@@ -344,8 +344,8 @@ static void vioapic_deliver(struct hvm_hw_vioapic *vioapic, int irq)
}
else
#endif
- target = apic_round_robin(vioapic_domain(vioapic),
- vector, deliver_bitmask);
+ target = apic_lowest_prio(vioapic_domain(vioapic),
+ deliver_bitmask);
if ( target != NULL )
{
ioapic_inj_irq(vioapic, target, vector, trig_mode, delivery_mode);
diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c
index d201af2848..68e9b27632 100644
--- a/xen/arch/x86/hvm/vlapic.c
+++ b/xen/arch/x86/hvm/vlapic.c
@@ -377,26 +377,30 @@ static int vlapic_accept_irq(struct vcpu *v, int delivery_mode,
}
/* This function is used by both ioapic and lapic.The bitmap is for vcpu_id. */
-struct vlapic *apic_round_robin(
- struct domain *d, uint8_t vector, uint32_t bitmap)
+struct vlapic *apic_lowest_prio(struct domain *d, uint32_t bitmap)
{
- int next, old;
- struct vlapic *target = NULL;
+ int old = d->arch.hvm_domain.irq.round_robin_prev_vcpu;
+ uint32_t ppr, target_ppr = UINT_MAX;
+ struct vlapic *vlapic, *target = NULL;
+ struct vcpu *v;
- old = next = d->arch.hvm_domain.irq.round_robin_prev_vcpu;
+ if ( unlikely((v = d->vcpu[old]) == NULL) )
+ return NULL;
do {
- if ( ++next == MAX_VIRT_CPUS )
- next = 0;
- if ( (d->vcpu[next] == NULL) || !test_bit(next, &bitmap) )
- continue;
- target = vcpu_vlapic(d->vcpu[next]);
- if ( vlapic_enabled(target) )
- break;
- target = NULL;
- } while ( next != old );
+ v = v->next_in_list ? : d->vcpu[0];
+ vlapic = vcpu_vlapic(v);
+ if ( test_bit(v->vcpu_id, &bitmap) && vlapic_enabled(vlapic) &&
+ ((ppr = vlapic_get_ppr(vlapic)) < target_ppr) )
+ {
+ target = vlapic;
+ target_ppr = ppr;
+ }
+ } while ( v->vcpu_id != old );
- d->arch.hvm_domain.irq.round_robin_prev_vcpu = next;
+ if ( target != NULL )
+ d->arch.hvm_domain.irq.round_robin_prev_vcpu =
+ vlapic_vcpu(target)->vcpu_id;
return target;
}
@@ -456,7 +460,7 @@ int vlapic_ipi(
if ( delivery_mode == APIC_DM_LOWEST )
{
- target = apic_round_robin(vlapic_domain(v), vector, lpr_map);
+ target = apic_lowest_prio(vlapic_domain(v), lpr_map);
if ( target != NULL )
rc = vlapic_accept_irq(vlapic_vcpu(target), delivery_mode,
vector, level, trig_mode);
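apic_lowest_prio() replaces the round-robin walk with genuine lowest-priority arbitration: it scans the vcpu list starting just after the previous winner and keeps the enabled LAPIC with the smallest processor priority (PPR), so equal-priority candidates still rotate. The selection logic in isolation, as a hedged standalone sketch:

    #include <limits.h>
    #include <stdint.h>

    /* Pick the candidate with the lowest PPR, scanning circularly from
     * just after the previous pick so that ties rotate. */
    int pick_lowest_prio(const uint32_t *ppr, uint32_t bitmap, int n, int prev)
    {
        int i = prev, best = -1;
        uint32_t best_ppr = UINT_MAX;

        do {
            i = (i + 1) % n;
            if ( (bitmap & (1u << i)) && (ppr[i] < best_ppr) )
            {
                best = i;
                best_ppr = ppr[i];
            }
        } while ( i != prev );

        return best;
    }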
diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index 6eefb61bfa..cc9e9adde5 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -152,7 +152,7 @@ int vmsi_deliver(struct domain *d, int pirq)
{
case dest_LowestPrio:
{
- target = apic_round_robin(d, vector, deliver_bitmask);
+ target = apic_lowest_prio(d, deliver_bitmask);
if ( target != NULL )
vmsi_inj_irq(d, target, vector, trig_mode, delivery_mode);
else
diff --git a/xen/arch/x86/hvm/vmx/intr.c b/xen/arch/x86/hvm/vmx/intr.c
index e62c8459f9..daf2e7bcf0 100644
--- a/xen/arch/x86/hvm/vmx/intr.c
+++ b/xen/arch/x86/hvm/vmx/intr.c
@@ -140,12 +140,12 @@ asmlinkage void vmx_intr_assist(void)
if ( intack.source == hvm_intsrc_nmi )
{
- vmx_inject_nmi(v);
+ vmx_inject_nmi();
}
else
{
HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0);
- vmx_inject_extint(v, intack.vector);
+ vmx_inject_extint(intack.vector);
pt_intr_post(v, intack);
}
diff --git a/xen/arch/x86/hvm/vmx/realmode.c b/xen/arch/x86/hvm/vmx/realmode.c
index 5d13f4e60b..4af2848406 100644
--- a/xen/arch/x86/hvm/vmx/realmode.c
+++ b/xen/arch/x86/hvm/vmx/realmode.c
@@ -69,7 +69,8 @@ static void realmode_deliver_exception(
frame[1] = csr->sel;
frame[2] = regs->eflags & ~X86_EFLAGS_RF;
- if ( hvmemul_ctxt->ctxt.addr_size == 32 )
+ /* We can't test hvmemul_ctxt->ctxt.sp_size: it may not be initialised. */
+ if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.db )
{
regs->esp -= 6;
pstk = regs->esp;
@@ -148,17 +149,25 @@ static void realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt)
hvmemul_ctxt->exn_insn_len = 0;
}
- if ( curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE )
+ if ( unlikely(curr->domain->debugger_attached) &&
+ ((hvmemul_ctxt->exn_vector == TRAP_debug) ||
+ (hvmemul_ctxt->exn_vector == TRAP_int3)) )
+ {
+ domain_pause_for_debugger();
+ }
+ else if ( curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE )
{
gdprintk(XENLOG_ERR, "Exception %02x in protected mode.\n",
hvmemul_ctxt->exn_vector);
goto fail;
}
-
- realmode_deliver_exception(
- hvmemul_ctxt->exn_vector,
- hvmemul_ctxt->exn_insn_len,
- hvmemul_ctxt);
+ else
+ {
+ realmode_deliver_exception(
+ hvmemul_ctxt->exn_vector,
+ hvmemul_ctxt->exn_insn_len,
+ hvmemul_ctxt);
+ }
}
return;
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 8fdeb40008..794b9cdeea 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -444,6 +444,8 @@ static void vmx_set_host_env(struct vcpu *v)
{
unsigned int cpu = smp_processor_id();
+ __vmwrite(HOST_GDTR_BASE,
+ (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY));
__vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]);
__vmwrite(HOST_TR_SELECTOR, TSS_ENTRY << 3);
@@ -541,9 +543,6 @@ static int construct_vmcs(struct vcpu *v)
__vmwrite(IO_BITMAP_A, virt_to_maddr((char *)hvm_io_bitmap + 0));
__vmwrite(IO_BITMAP_B, virt_to_maddr((char *)hvm_io_bitmap + PAGE_SIZE));
- /* Host GDTR base. */
- __vmwrite(HOST_GDTR_BASE, GDT_VIRT_START(v));
-
/* Host data selectors. */
__vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
__vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index fb12efe46a..4156f02ad1 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -187,7 +187,7 @@ static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
check_long_mode:
if ( !(hvm_long_mode_enabled(v)) )
{
- vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
+ vmx_inject_hw_exception(TRAP_gp_fault, 0);
return HNDL_exception_raised;
}
break;
@@ -284,7 +284,7 @@ static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
uncanonical_address:
HVM_DBG_LOG(DBG_LEVEL_0, "Not cano address of msr write %x", ecx);
gp_fault:
- vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
+ vmx_inject_hw_exception(TRAP_gp_fault, 0);
exception_raised:
return HNDL_exception_raised;
}
@@ -1094,8 +1094,7 @@ void ept_sync_domain(struct domain *d)
}
}
-static void __vmx_inject_exception(
- struct vcpu *v, int trap, int type, int error_code)
+static void __vmx_inject_exception(int trap, int type, int error_code)
{
unsigned long intr_fields;
@@ -1114,17 +1113,29 @@ static void __vmx_inject_exception(
}
__vmwrite(VM_ENTRY_INTR_INFO, intr_fields);
-
- if ( trap == TRAP_page_fault )
- HVMTRACE_LONG_2D(PF_INJECT, error_code,
- TRC_PAR_LONG(v->arch.hvm_vcpu.guest_cr[2]));
- else
- HVMTRACE_2D(INJ_EXC, trap, error_code);
}
-void vmx_inject_hw_exception(struct vcpu *v, int trap, int error_code)
+void vmx_inject_hw_exception(int trap, int error_code)
{
unsigned long intr_info = __vmread(VM_ENTRY_INTR_INFO);
+ struct vcpu *curr = current;
+
+ switch ( trap )
+ {
+ case TRAP_debug:
+ if ( guest_cpu_user_regs()->eflags & X86_EFLAGS_TF )
+ {
+ __restore_debug_registers(curr);
+ write_debugreg(6, read_debugreg(6) | 0x4000);
+ }
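+ /* fall through: #DB also reaches the debugger check below */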
+ case TRAP_int3:
+ if ( curr->domain->debugger_attached )
+ {
+ /* Debug/Int3: Trap to debugger. */
+ domain_pause_for_debugger();
+ return;
+ }
+ }
if ( unlikely(intr_info & INTR_INFO_VALID_MASK) &&
(((intr_info >> 8) & 7) == X86_EVENTTYPE_HW_EXCEPTION) )
@@ -1134,37 +1145,34 @@ void vmx_inject_hw_exception(struct vcpu *v, int trap, int error_code)
error_code = 0;
}
- __vmx_inject_exception(v, trap, X86_EVENTTYPE_HW_EXCEPTION, error_code);
+ __vmx_inject_exception(trap, X86_EVENTTYPE_HW_EXCEPTION, error_code);
+
+ if ( trap == TRAP_page_fault )
+ HVMTRACE_LONG_2D(PF_INJECT, error_code,
+ TRC_PAR_LONG(current->arch.hvm_vcpu.guest_cr[2]));
+ else
+ HVMTRACE_2D(INJ_EXC, trap, error_code);
}
-void vmx_inject_extint(struct vcpu *v, int trap)
+void vmx_inject_extint(int trap)
{
- __vmx_inject_exception(v, trap, X86_EVENTTYPE_EXT_INTR,
+ __vmx_inject_exception(trap, X86_EVENTTYPE_EXT_INTR,
HVM_DELIVER_NO_ERROR_CODE);
}
-void vmx_inject_nmi(struct vcpu *v)
+void vmx_inject_nmi(void)
{
- __vmx_inject_exception(v, 2, X86_EVENTTYPE_NMI,
+ __vmx_inject_exception(2, X86_EVENTTYPE_NMI,
HVM_DELIVER_NO_ERROR_CODE);
}
static void vmx_inject_exception(
unsigned int trapnr, int errcode, unsigned long cr2)
{
- struct vcpu *curr = current;
-
- vmx_inject_hw_exception(curr, trapnr, errcode);
-
if ( trapnr == TRAP_page_fault )
- curr->arch.hvm_vcpu.guest_cr[2] = cr2;
+ current->arch.hvm_vcpu.guest_cr[2] = cr2;
- if ( (trapnr == TRAP_debug) &&
- (guest_cpu_user_regs()->eflags & X86_EFLAGS_TF) )
- {
- __restore_debug_registers(curr);
- write_debugreg(6, read_debugreg(6) | 0x4000);
- }
+ vmx_inject_hw_exception(trapnr, errcode);
}
static int vmx_event_pending(struct vcpu *v)
@@ -1315,7 +1323,7 @@ static void __update_guest_eip(unsigned long inst_len)
}
if ( regs->eflags & X86_EFLAGS_TF )
- vmx_inject_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE, 0);
+ vmx_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
}
static void vmx_fpu_dirty_intercept(void)
@@ -1636,7 +1644,6 @@ static int vmx_msr_read_intercept(struct cpu_user_regs *regs)
{
u64 msr_content = 0;
u32 ecx = regs->ecx, eax, edx;
- struct vcpu *v = current;
HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
@@ -1712,7 +1719,7 @@ done:
return X86EMUL_OKAY;
gp_fault:
- vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
+ vmx_inject_hw_exception(TRAP_gp_fault, 0);
return X86EMUL_EXCEPTION;
}
@@ -1849,7 +1856,7 @@ static int vmx_msr_write_intercept(struct cpu_user_regs *regs)
if ( (rc < 0) ||
(vmx_add_host_load_msr(ecx) < 0) )
- vmx_inject_hw_exception(v, TRAP_machine_check, 0);
+ vmx_inject_hw_exception(TRAP_machine_check, 0);
else
{
__vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
@@ -1889,7 +1896,7 @@ static int vmx_msr_write_intercept(struct cpu_user_regs *regs)
return X86EMUL_OKAY;
gp_fault:
- vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
+ vmx_inject_hw_exception(TRAP_gp_fault, 0);
return X86EMUL_EXCEPTION;
}
@@ -2197,7 +2204,7 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
}
v->arch.hvm_vcpu.guest_cr[2] = exit_qualification;
- vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
+ vmx_inject_hw_exception(TRAP_page_fault, regs->error_code);
break;
case TRAP_nmi:
if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
@@ -2317,7 +2324,7 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
case EXIT_REASON_VMWRITE:
case EXIT_REASON_VMXOFF:
case EXIT_REASON_VMXON:
- vmx_inject_hw_exception(v, TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
+ vmx_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
break;
case EXIT_REASON_TPR_BELOW_THRESHOLD:
@@ -2326,7 +2333,7 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
case EXIT_REASON_IO_INSTRUCTION:
case EXIT_REASON_APIC_ACCESS:
if ( !handle_mmio() )
- hvm_inject_exception(TRAP_gp_fault, 0, 0);
+ vmx_inject_hw_exception(TRAP_gp_fault, 0);
break;
case EXIT_REASON_INVD:
diff --git a/xen/arch/x86/hvm/vmx/vpmu_core2.c b/xen/arch/x86/hvm/vmx/vpmu_core2.c
index 9e0822ffdf..ff1783b2c0 100644
--- a/xen/arch/x86/hvm/vmx/vpmu_core2.c
+++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c
@@ -335,7 +335,7 @@ static int core2_vpmu_do_wrmsr(struct cpu_user_regs *regs)
case MSR_CORE_PERF_GLOBAL_STATUS:
gdprintk(XENLOG_INFO, "Can not write readonly MSR: "
"MSR_PERF_GLOBAL_STATUS(0x38E)!\n");
- vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
+ vmx_inject_hw_exception(TRAP_gp_fault, 0);
return 1;
case MSR_IA32_PEBS_ENABLE:
if ( msr_content & 1 )
diff --git a/xen/arch/x86/i8259.c b/xen/arch/x86/i8259.c
index 555d937c4d..79317085b7 100644
--- a/xen/arch/x86/i8259.c
+++ b/xen/arch/x86/i8259.c
@@ -390,7 +390,7 @@ void __init init_IRQ(void)
init_8259A(0);
- for ( i = 0; i < NR_IRQS; i++ )
+ for ( i = 0; i < NR_VECTORS; i++ )
{
irq_desc[i].status = IRQ_DISABLED;
irq_desc[i].handler = &no_irq_type;
diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
index aa21f18104..15d2b6b851 100644
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -665,7 +665,7 @@ static inline int IO_APIC_irq_trigger(int irq)
}
/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
+u8 irq_vector[NR_IRQS] __read_mostly;
int free_irq_vector(int vector)
{
@@ -686,7 +686,7 @@ int assign_irq_vector(int irq)
static unsigned current_vector = FIRST_DYNAMIC_VECTOR;
unsigned vector;
- BUG_ON(irq >= NR_IRQ_VECTORS);
+ BUG_ON(irq >= NR_IRQS);
spin_lock(&vector_lock);
@@ -1547,20 +1547,10 @@ static struct hw_interrupt_type ioapic_level_type = {
.set_affinity = set_ioapic_affinity_vector,
};
-static void mask_msi_vector(unsigned int vector)
-{
- mask_msi_irq(vector);
-}
-
-static void unmask_msi_vector(unsigned int vector)
-{
- unmask_msi_irq(vector);
-}
-
static unsigned int startup_msi_vector(unsigned int vector)
{
dprintk(XENLOG_INFO, "startup msi vector %x\n", vector);
- unmask_msi_irq(vector);
+ unmask_msi_vector(vector);
return 0;
}
@@ -1576,13 +1566,13 @@ static void end_msi_vector(unsigned int vector)
static void shutdown_msi_vector(unsigned int vector)
{
dprintk(XENLOG_INFO, "shutdown msi vector %x\n", vector);
- mask_msi_irq(vector);
+ mask_msi_vector(vector);
}
static void set_msi_affinity_vector(unsigned int vector, cpumask_t cpu_mask)
{
set_native_irq_info(vector, cpu_mask);
- set_msi_irq_affinity(vector, cpu_mask);
+ set_msi_affinity(vector, cpu_mask);
}
/*
@@ -2196,7 +2186,7 @@ int ioapic_guest_write(unsigned long physbase, unsigned int reg, u32 val)
if ( new_rte.vector >= FIRST_DYNAMIC_VECTOR )
new_irq = vector_irq[new_rte.vector];
- if ( (old_irq != new_irq) && (old_irq != -1) && IO_APIC_IRQ(old_irq) )
+ if ( (old_irq != new_irq) && (old_irq >= 0) && IO_APIC_IRQ(old_irq) )
{
if ( irq_desc[IO_APIC_VECTOR(old_irq)].action )
{
@@ -2208,7 +2198,7 @@ int ioapic_guest_write(unsigned long physbase, unsigned int reg, u32 val)
remove_pin_at_irq(old_irq, apic, pin);
}
- if ( (new_irq != -1) && IO_APIC_IRQ(new_irq) )
+ if ( (new_irq >= 0) && IO_APIC_IRQ(new_irq) )
{
if ( irq_desc[IO_APIC_VECTOR(new_irq)].action )
{
diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c
index efb73ad011..4e3bed2228 100644
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -24,7 +24,7 @@
int opt_noirqbalance = 0;
boolean_param("noirqbalance", opt_noirqbalance);
-irq_desc_t irq_desc[NR_IRQS];
+irq_desc_t irq_desc[NR_VECTORS];
static void __do_IRQ_guest(int vector);
@@ -206,7 +206,7 @@ struct pending_eoi {
static DEFINE_PER_CPU(struct pending_eoi, pending_eoi[NR_VECTORS]);
#define pending_eoi_sp(p) ((p)[NR_VECTORS-1].vector)
-static struct timer irq_guest_eoi_timer[NR_IRQS];
+static struct timer irq_guest_eoi_timer[NR_VECTORS];
static void irq_guest_eoi_timer_fn(void *data)
{
irq_desc_t *desc = data;
@@ -463,14 +463,19 @@ int pirq_acktype(struct domain *d, int irq)
/*
* Edge-triggered IO-APIC and LAPIC interrupts need no final
* acknowledgement: we ACK early during interrupt processing.
- * MSIs are treated as edge-triggered interrupts.
*/
if ( !strcmp(desc->handler->typename, "IO-APIC-edge") ||
- !strcmp(desc->handler->typename, "local-APIC-edge") ||
- !strcmp(desc->handler->typename, "PCI-MSI") )
+ !strcmp(desc->handler->typename, "local-APIC-edge") )
return ACKTYPE_NONE;
/*
+ * MSIs are treated as edge-triggered interrupts, except
+ * when there is no proper way to mask them.
+ */
+ if ( desc->handler == &pci_msi_type )
+ return msi_maskable_irq(desc->msi_desc) ? ACKTYPE_NONE : ACKTYPE_EOI;
+
+ /*
* Level-triggered IO-APIC interrupts need to be acknowledged on the CPU
* on which they were received. This is because we tickle the LAPIC to EOI.
*/
@@ -765,15 +770,15 @@ int get_free_pirq(struct domain *d, int type, int index)
if ( type == MAP_PIRQ_TYPE_GSI )
{
- for ( i = 16; i < NR_PIRQS; i++ )
+ for ( i = 16; i < NR_IRQS; i++ )
if ( !d->arch.pirq_vector[i] )
break;
- if ( i == NR_PIRQS )
+ if ( i == NR_IRQS )
return -ENOSPC;
}
else
{
- for ( i = NR_PIRQS - 1; i >= 16; i-- )
+ for ( i = NR_IRQS - 1; i >= 16; i-- )
if ( !d->arch.pirq_vector[i] )
break;
if ( i == 16 )
@@ -800,7 +805,7 @@ int map_domain_pirq(
if ( !IS_PRIV(current->domain) )
return -EPERM;
- if ( pirq < 0 || pirq >= NR_PIRQS || vector < 0 || vector >= NR_VECTORS )
+ if ( pirq < 0 || pirq >= NR_IRQS || vector < 0 || vector >= NR_VECTORS )
{
dprintk(XENLOG_G_ERR, "dom%d: invalid pirq %d or vector %d\n",
d->domain_id, pirq, vector);
@@ -857,7 +862,7 @@ int unmap_domain_pirq(struct domain *d, int pirq)
int vector, ret = 0;
bool_t forced_unbind;
- if ( (pirq < 0) || (pirq >= NR_PIRQS) )
+ if ( (pirq < 0) || (pirq >= NR_IRQS) )
return -EINVAL;
if ( !IS_PRIV(current->domain) )
@@ -921,7 +926,7 @@ void free_domain_pirqs(struct domain *d)
spin_lock(&d->event_lock);
- for ( i = 0; i < NR_PIRQS; i++ )
+ for ( i = 0; i < NR_IRQS; i++ )
if ( d->arch.pirq_vector[i] > 0 )
unmap_domain_pirq(d, i);
@@ -1001,28 +1006,30 @@ __initcall(setup_dump_irqs);
void fixup_irqs(cpumask_t map)
{
- unsigned int irq, sp;
+ unsigned int vector, sp;
static int warned;
irq_guest_action_t *action;
struct pending_eoi *peoi;
/* Direct all future interrupts away from this CPU. */
- for ( irq = 0; irq < NR_IRQS; irq++ )
+ for ( vector = 0; vector < NR_VECTORS; vector++ )
{
cpumask_t mask;
- if ( irq == 2 )
+ if ( vector_to_irq(vector) == 2 )
continue;
- cpus_and(mask, irq_desc[irq].affinity, map);
+ cpus_and(mask, irq_desc[vector].affinity, map);
if ( any_online_cpu(mask) == NR_CPUS )
{
- printk("Breaking affinity for irq %i\n", irq);
+ printk("Breaking affinity for vector %u (irq %i)\n",
+ vector, vector_to_irq(vector));
mask = map;
}
- if ( irq_desc[irq].handler->set_affinity )
- irq_desc[irq].handler->set_affinity(irq, mask);
- else if ( irq_desc[irq].action && !(warned++) )
- printk("Cannot set affinity for irq %i\n", irq);
+ if ( irq_desc[vector].handler->set_affinity )
+ irq_desc[vector].handler->set_affinity(vector, mask);
+ else if ( irq_desc[vector].action && !(warned++) )
+ printk("Cannot set affinity for irq %u (irq %i)\n",
+ vector, vector_to_irq(vector));
}
/* Service any interrupts that beat us in the re-direction race. */
@@ -1031,11 +1038,11 @@ void fixup_irqs(cpumask_t map)
local_irq_disable();
/* Clean up cpu_eoi_map of every interrupt to exclude this CPU. */
- for ( irq = 0; irq < NR_IRQS; irq++ )
+ for ( vector = 0; vector < NR_VECTORS; vector++ )
{
- if ( !(irq_desc[irq].status & IRQ_GUEST) )
+ if ( !(irq_desc[vector].status & IRQ_GUEST) )
continue;
- action = (irq_guest_action_t *)irq_desc[irq].action;
+ action = (irq_guest_action_t *)irq_desc[vector].action;
cpu_clear(smp_processor_id(), action->cpu_eoi_map);
}
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 97359e7e94..2adc1ed2c7 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -160,6 +160,9 @@ unsigned long total_pages;
#define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
+int opt_allow_hugepage;
+boolean_param("allowhugepage", opt_allow_hugepage);
+
#define l1_disallow_mask(d) \
((d != dom_io) && \
(rangeset_is_empty((d)->iomem_caps) && \
@@ -586,6 +589,28 @@ static int get_page_and_type_from_pagenr(unsigned long page_nr,
return rc;
}
+static int get_data_page(
+ struct page_info *page, struct domain *d, int writeable)
+{
+ int rc;
+
+ if ( writeable )
+ rc = get_page_and_type(page, d, PGT_writable_page);
+ else
+ rc = get_page(page, d);
+
+ return rc;
+}
+
+static void put_data_page(
+ struct page_info *page, int writeable)
+{
+ if ( writeable )
+ put_page_and_type(page);
+ else
+ put_page(page);
+}
+
/*
* We allow root tables to map each other (a.k.a. linear page tables). It
* needs some special care with reference counts and access permissions:
@@ -700,10 +725,9 @@ get_page_from_l1e(
* contribute to writeable mapping refcounts. (This allows the
* qemu-dm helper process in dom0 to map the domain's memory without
* messing up the count of "real" writable mappings.) */
- okay = (((l1f & _PAGE_RW) &&
- !(unlikely(paging_mode_external(d) && (d != curr->domain))))
- ? get_page_and_type(page, d, PGT_writable_page)
- : get_page(page, d));
+ okay = get_data_page(
+ page, d,
+ (l1f & _PAGE_RW) && !(paging_mode_external(d) && (d != curr->domain)));
if ( !okay )
{
MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
@@ -751,6 +775,7 @@ static int
get_page_from_l2e(
l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
{
+ unsigned long mfn = l2e_get_pfn(l2e);
int rc;
if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
@@ -762,10 +787,37 @@ get_page_from_l2e(
return -EINVAL;
}
- rc = get_page_and_type_from_pagenr(
- l2e_get_pfn(l2e), PGT_l1_page_table, d, 0, 0);
- if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
- rc = 0;
+ if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
+ {
+ rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, 0, 0);
+ if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
+ rc = 0;
+ }
+ else if ( !opt_allow_hugepage || (mfn & (L1_PAGETABLE_ENTRIES-1)) )
+ {
+ rc = -EINVAL;
+ }
+ else
+ {
+ unsigned long m = mfn;
+ int writeable = !!(l2e_get_flags(l2e) & _PAGE_RW);
+
+ do {
+ rc = get_data_page(mfn_to_page(m), d, writeable);
+ if ( unlikely(!rc) )
+ {
+ while ( m-- > mfn )
+ put_data_page(mfn_to_page(m), writeable);
+ return -EINVAL;
+ }
+ } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
+
+#ifdef __x86_64__
+ map_pages_to_xen(
+ (unsigned long)mfn_to_virt(mfn), mfn, L1_PAGETABLE_ENTRIES,
+ PAGE_HYPERVISOR | l2e_get_flags(l2e));
+#endif
+ }
return rc;
}
@@ -954,13 +1006,24 @@ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
*/
static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
{
- if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
- (l2e_get_pfn(l2e) != pfn) )
+ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
+ return 1;
+
+ if ( l2e_get_flags(l2e) & _PAGE_PSE )
+ {
+ unsigned long mfn = l2e_get_pfn(l2e), m = mfn;
+ int writeable = l2e_get_flags(l2e) & _PAGE_RW;
+ ASSERT(opt_allow_hugepage && !(mfn & (L1_PAGETABLE_ENTRIES-1)));
+ do {
+ put_data_page(mfn_to_page(m), writeable);
+ } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
+ }
+ else
{
put_page_and_type(l2e_get_page(l2e));
- return 0;
}
- return 1;
+
+ return 0;
}
static int __put_page_type(struct page_info *, int preemptible);
@@ -1541,6 +1604,7 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
struct domain *d = curr->domain;
unsigned long mfn;
struct page_info *l1pg = mfn_to_page(gl1mfn);
+ p2m_type_t p2mt;
int rc = 1;
page_lock(l1pg);
@@ -1558,8 +1622,8 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
{
/* Translate foreign guest addresses. */
- mfn = gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e));
- if ( unlikely(mfn == INVALID_MFN) )
+ mfn = mfn_x(gfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e), &p2mt));
+ if ( !p2m_is_ram(p2mt) || unlikely(mfn == INVALID_MFN) )
return page_unlock(l1pg), 0;
ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
@@ -3332,6 +3396,10 @@ int create_grant_host_mapping(uint64_t addr, unsigned long frame,
if ( !(flags & GNTMAP_readonly) )
l1e_add_flags(pte,_PAGE_RW);
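+ /* The multiply by _PAGE_AVAIL0 shifts the GNTMAP avail bits up into
+ * the PTE's AVAIL field. */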
+ l1e_add_flags(pte,
+ ((flags >> _GNTMAP_guest_avail0) * _PAGE_AVAIL0)
+ & _PAGE_AVAIL);
+
l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
if ( flags & GNTMAP_contains_pte )
@@ -4227,7 +4295,7 @@ int map_pages_to_xen(
{
if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
flush_flags |= FLUSH_TLB_GLOBAL;
- if ( (l1f_to_lNf(l3e_get_flags(ol3e)) ^ flags) &
+ if ( (lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
PAGE_CACHE_ATTRS )
flush_flags |= FLUSH_CACHE;
flush_area(virt, flush_flags);
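Note that opt_allow_hugepage is off by default; the new paths above only engage when the hypervisor is booted with allowhugepage, and even then an L2 PSE mapping is refused unless its start MFN is aligned to a full L1 table's worth of frames. The alignment test, spelled out as a hedged sketch:

    /* With PAE/64-bit paging an L2 PSE entry spans L1_PAGETABLE_ENTRIES
     * (512) 4 KiB frames, so the start MFN must clear the low nine bits. */
    static inline int superpage_aligned(unsigned long mfn)
    {
        return (mfn & (L1_PAGETABLE_ENTRIES - 1)) == 0;
    }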
diff --git a/xen/arch/x86/mm/Makefile b/xen/arch/x86/mm/Makefile
index 79b25962ac..a399c8ed14 100644
--- a/xen/arch/x86/mm/Makefile
+++ b/xen/arch/x86/mm/Makefile
@@ -3,3 +3,9 @@ subdir-y += hap
obj-y += paging.o
obj-y += p2m.o
+obj-y += guest_walk_2.o
+obj-y += guest_walk_3.o
+obj-$(x86_64) += guest_walk_4.o
+
+guest_walk_%.o: guest_walk.c $(HDRS) Makefile
+ $(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=$* -c $< -o $@
diff --git a/xen/arch/x86/mm/guest_walk.c b/xen/arch/x86/mm/guest_walk.c
new file mode 100644
index 0000000000..5d532800aa
--- /dev/null
+++ b/xen/arch/x86/mm/guest_walk.c
@@ -0,0 +1,260 @@
+/******************************************************************************
+ * arch/x86/mm/guest_walk.c
+ *
+ * Pagetable walker for guest memory accesses.
+ *
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/paging.h>
+#include <xen/domain_page.h>
+#include <xen/sched.h>
+#include <asm/page.h>
+#include <asm/guest_pt.h>
+
+
+/* Flags that are needed in a pagetable entry, with the sense of NX inverted */
+static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec)
+{
+ static uint32_t flags[] = {
+ /* I/F - Usr Wr */
+ /* 0 0 0 0 */ _PAGE_PRESENT,
+ /* 0 0 0 1 */ _PAGE_PRESENT|_PAGE_RW,
+ /* 0 0 1 0 */ _PAGE_PRESENT|_PAGE_USER,
+ /* 0 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
+ /* 0 1 0 0 */ _PAGE_PRESENT,
+ /* 0 1 0 1 */ _PAGE_PRESENT|_PAGE_RW,
+ /* 0 1 1 0 */ _PAGE_PRESENT|_PAGE_USER,
+ /* 0 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
+ /* 1 0 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
+ /* 1 0 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
+ /* 1 0 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
+ /* 1 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
+ /* 1 1 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
+ /* 1 1 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
+ /* 1 1 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
+ /* 1 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
+ };
+
+ /* Don't demand not-NX if the CPU wouldn't enforce it. */
+ if ( !guest_supports_nx(v) )
+ pfec &= ~PFEC_insn_fetch;
+
+ /* Don't demand R/W if the CPU wouldn't enforce it. */
+ if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v))
+ && !(pfec & PFEC_user_mode) )
+ pfec &= ~PFEC_write_access;
+
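+ /* Bits 1..4 of the error code (write / user / reserved / fetch) index
+ * the table; bit 0 (the "present" bit) is shifted out. */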
+ return flags[(pfec & 0x1f) >> 1];
+}
+
+/* Modify a guest pagetable entry to set the Accessed and Dirty bits.
+ * Returns non-zero if it actually writes to guest memory. */
+static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
+{
+ guest_intpte_t old, new;
+
+ old = *(guest_intpte_t *)walk_p;
+ new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
+ if ( old != new )
+ {
+ /* Write the new entry into the walk, and try to write it back
+ * into the guest table as well. If the guest table has changed
+ * under our feet then leave it alone. */
+ *(guest_intpte_t *)walk_p = new;
+ if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old )
+ return 1;
+ }
+ return 0;
+}
+
+
+/* Walk the guest pagetables, after the manner of a hardware walker. */
+uint32_t
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw,
+ uint32_t pfec, mfn_t top_mfn, void *top_map)
+{
+ struct domain *d = v->domain;
+ p2m_type_t p2mt;
+ guest_l1e_t *l1p = NULL;
+ guest_l2e_t *l2p = NULL;
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+ guest_l3e_t *l3p = NULL;
+ guest_l4e_t *l4p;
+#endif
+ uint32_t gflags, mflags, rc = 0;
+ int pse;
+
+ perfc_incr(guest_walk);
+ memset(gw, 0, sizeof(*gw));
+ gw->va = va;
+
+ /* Mandatory bits that must be set in every entry. We invert NX, to
+ * calculate as if there were an "X" bit that allowed access.
+ * We will accumulate, in rc, the set of flags that are missing. */
+ mflags = mandatory_flags(v, pfec);
+
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+
+ /* Get the l4e from the top level table and check its flags */
+ gw->l4mfn = top_mfn;
+ l4p = (guest_l4e_t *) top_map;
+ gw->l4e = l4p[guest_l4_table_offset(va)];
+ gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
+ rc |= ((gflags & mflags) ^ mflags);
+ if ( rc & _PAGE_PRESENT ) goto out;
+
+ /* Map the l3 table */
+ gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
+ if ( !p2m_is_ram(p2mt) )
+ {
+ rc |= _PAGE_PRESENT;
+ goto out;
+ }
+ ASSERT(mfn_valid(mfn_x(gw->l3mfn)));
+
+ /* Get the l3e and check its flags */
+ l3p = map_domain_page(mfn_x(gw->l3mfn));
+ gw->l3e = l3p[guest_l3_table_offset(va)];
+ gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
+ rc |= ((gflags & mflags) ^ mflags);
+ if ( rc & _PAGE_PRESENT )
+ goto out;
+
+#else /* PAE only... */
+
+ /* Get the l3e and check its flags */
+ gw->l3e = ((guest_l3e_t *) top_map)[guest_l3_table_offset(va)];
+ if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) )
+ {
+ rc |= _PAGE_PRESENT;
+ goto out;
+ }
+
+#endif /* PAE or 64... */
+
+ /* Map the l2 table */
+ gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
+ if ( !p2m_is_ram(p2mt) )
+ {
+ rc |= _PAGE_PRESENT;
+ goto out;
+ }
+ ASSERT(mfn_valid(mfn_x(gw->l2mfn)));
+
+ /* Get the l2e */
+ l2p = map_domain_page(mfn_x(gw->l2mfn));
+ gw->l2e = l2p[guest_l2_table_offset(va)];
+
+#else /* 32-bit only... */
+
+ /* Get l2e from the top level table */
+ gw->l2mfn = top_mfn;
+ l2p = (guest_l2e_t *) top_map;
+ gw->l2e = l2p[guest_l2_table_offset(va)];
+
+#endif /* All levels... */
+
+ gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
+ rc |= ((gflags & mflags) ^ mflags);
+ if ( rc & _PAGE_PRESENT )
+ goto out;
+
+ pse = (guest_supports_superpages(v) &&
+ (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE));
+
+ if ( pse )
+ {
+ /* Special case: this guest VA is in a PSE superpage, so there's
+ * no guest l1e. We make one up so that the propagation code
+ * can generate a shadow l1 table. Start with the gfn of the
+ * first 4k-page of the superpage. */
+ gfn_t start = guest_l2e_get_gfn(gw->l2e);
+ /* Grant full access in the l1e, since all the guest entry's
+ * access controls are enforced in the shadow l2e. */
+ int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
+ _PAGE_ACCESSED|_PAGE_DIRTY);
+ /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
+ * of the level 1. */
+ if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) )
+ flags |= _PAGE_PAT;
+ /* Copy the cache-control bits to the l1 as well, because we
+ * can't represent PAT in the (non-PSE) shadow l2e. :(
+ * This could cause problems if a guest ever maps an area of
+ * memory with superpages using more than one caching mode. */
+ flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
+ /* Increment the pfn by the right number of 4k pages.
+ * The ~0x1 is to mask out the PAT bit mentioned above. */
+ start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
+ gw->l1e = guest_l1e_from_gfn(start, flags);
+ gw->l1mfn = _mfn(INVALID_MFN);
+ }
+ else
+ {
+ /* Not a superpage: carry on and find the l1e. */
+ gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
+ if ( !p2m_is_ram(p2mt) )
+ {
+ rc |= _PAGE_PRESENT;
+ goto out;
+ }
+ ASSERT(mfn_valid(mfn_x(gw->l1mfn)));
+ l1p = map_domain_page(mfn_x(gw->l1mfn));
+ gw->l1e = l1p[guest_l1_table_offset(va)];
+ gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
+ rc |= ((gflags & mflags) ^ mflags);
+ }
+
+    /* Go back and set accessed and dirty bits only if the walk was a
+     * success.  Although the PRMs say higher-level _PAGE_ACCESSED bits
+     * get set whenever a lower-level PT is used, at least some hardware
+     * walkers only set them on a successful walk, so we do the same. */
+ if ( rc == 0 )
+ {
+#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
+ if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
+ paging_mark_dirty(d, mfn_x(gw->l4mfn));
+ if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
+ paging_mark_dirty(d, mfn_x(gw->l3mfn));
+#endif
+ if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
+ (pse && (pfec & PFEC_write_access))) )
+ paging_mark_dirty(d, mfn_x(gw->l2mfn));
+ if ( !pse )
+ {
+ if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e,
+ (pfec & PFEC_write_access)) )
+ paging_mark_dirty(d, mfn_x(gw->l1mfn));
+ }
+ }
+
+ out:
+#if GUEST_PAGING_LEVELS == 4
+ if ( l3p ) unmap_domain_page(l3p);
+#endif
+#if GUEST_PAGING_LEVELS >= 3
+ if ( l2p ) unmap_domain_page(l2p);
+#endif
+ if ( l1p ) unmap_domain_page(l1p);
+
+ return rc;
+}
diff --git a/xen/arch/x86/mm/hap/guest_walk.c b/xen/arch/x86/mm/hap/guest_walk.c
index f1c54983d7..425031508d 100644
--- a/xen/arch/x86/mm/hap/guest_walk.c
+++ b/xen/arch/x86/mm/hap/guest_walk.c
@@ -19,160 +19,71 @@
* Place - Suite 330, Boston, MA 02111-1307 USA.
*/
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/mm.h>
+
#include <xen/domain_page.h>
-#include <asm/page.h>
-#include <xen/event.h>
+#include <xen/paging.h>
+#include <xen/config.h>
#include <xen/sched.h>
-#include <asm/hvm/svm/vmcb.h>
-#include <asm/domain.h>
-#include <asm/paging.h>
-#include <asm/p2m.h>
-#include <asm/hap.h>
-
-#include "private.h"
#define _hap_gva_to_gfn(levels) hap_gva_to_gfn_##levels##level
#define hap_gva_to_gfn(levels) _hap_gva_to_gfn(levels)
-#if GUEST_PAGING_LEVELS > CONFIG_PAGING_LEVELS
+#if GUEST_PAGING_LEVELS <= CONFIG_PAGING_LEVELS
-unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)(
- struct vcpu *v, unsigned long gva, uint32_t *pfec)
-{
- gdprintk(XENLOG_ERR,
- "Guest paging level is greater than host paging level!\n");
- domain_crash(v->domain);
- return INVALID_GFN;
-}
-
-#else
-
-#if GUEST_PAGING_LEVELS == 2
-#include "../page-guest32.h"
-#define l1_pgentry_t l1_pgentry_32_t
-#define l2_pgentry_t l2_pgentry_32_t
-#undef l2e_get_flags
-#define l2e_get_flags(x) l2e_get_flags_32(x)
-#undef l1e_get_flags
-#define l1e_get_flags(x) l1e_get_flags_32(x)
-#endif
+#include <asm/guest_pt.h>
unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)(
struct vcpu *v, unsigned long gva, uint32_t *pfec)
{
- unsigned long gcr3 = v->arch.hvm_vcpu.guest_cr[3];
- int mode = GUEST_PAGING_LEVELS;
- int lev, index;
- paddr_t gpa = 0;
- unsigned long gpfn, mfn;
+ unsigned long cr3;
+ uint32_t missing;
+ mfn_t top_mfn;
+ void *top_map;
p2m_type_t p2mt;
- int success = 1;
+ walk_t gw;
- l1_pgentry_t *l1e;
- l2_pgentry_t *l2e;
-#if GUEST_PAGING_LEVELS >= 3
- l3_pgentry_t *l3e;
-#endif
-#if GUEST_PAGING_LEVELS >= 4
- l4_pgentry_t *l4e;
-#endif
-
- gpfn = (gcr3 >> PAGE_SHIFT);
- for ( lev = mode; lev >= 1; lev-- )
+ /* Get the top-level table's MFN */
+ cr3 = v->arch.hvm_vcpu.guest_cr[3];
+ top_mfn = gfn_to_mfn(v->domain, _gfn(cr3 >> PAGE_SHIFT), &p2mt);
+ if ( !p2m_is_ram(p2mt) )
{
- mfn = mfn_x(gfn_to_mfn_current(gpfn, &p2mt));
- if ( !p2m_is_ram(p2mt) )
- {
- HAP_PRINTK("bad pfn=0x%lx from gva=0x%lx at lev%d\n", gpfn, gva,
- lev);
- success = 0;
- break;
- }
- ASSERT(mfn_valid(mfn));
-
- index = (gva >> PT_SHIFT[mode][lev]) & (PT_ENTRIES[mode][lev]-1);
-
-#if GUEST_PAGING_LEVELS >= 4
- if ( lev == 4 )
- {
- l4e = map_domain_page(mfn);
- if ( !(l4e_get_flags(l4e[index]) & _PAGE_PRESENT) )
- {
- HAP_PRINTK("Level 4 entry not present at index = %d\n", index);
- success = 0;
- }
- gpfn = l4e_get_pfn(l4e[index]);
- unmap_domain_page(l4e);
- }
-#endif
+ pfec[0] &= ~PFEC_page_present;
+ return INVALID_GFN;
+ }
-#if GUEST_PAGING_LEVELS >= 3
- if ( lev == 3 )
- {
- l3e = map_domain_page(mfn);
+ /* Map the top-level table and call the tree-walker */
+ ASSERT(mfn_valid(mfn_x(top_mfn)));
+ top_map = map_domain_page(mfn_x(top_mfn));
#if GUEST_PAGING_LEVELS == 3
- index += ((gcr3 >> 5) & 127) * 4;
-#endif
- if ( !(l3e_get_flags(l3e[index]) & _PAGE_PRESENT) )
- {
- HAP_PRINTK("Level 3 entry not present at index = %d\n", index);
- success = 0;
- }
- gpfn = l3e_get_pfn(l3e[index]);
- unmap_domain_page(l3e);
- }
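+    /* A PAE guest's CR3 is only 32-byte aligned, so the PDPT may live
+     * anywhere within its page; add CR3's sub-page offset to the map. */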
+ top_map += (cr3 & ~(PAGE_MASK | 31));
#endif
+ missing = guest_walk_tables(v, gva, &gw, pfec[0], top_mfn, top_map);
+ unmap_domain_page(top_map);
+
+ /* Interpret the answer */
+ if ( missing == 0 )
+ return gfn_x(guest_l1e_get_gfn(gw.l1e));
+
+ if ( missing & _PAGE_PRESENT )
+ pfec[0] &= ~PFEC_page_present;
+
+ return INVALID_GFN;
+}
- if ( lev == 2 )
- {
- l2e = map_domain_page(mfn);
- if ( !(l2e_get_flags(l2e[index]) & _PAGE_PRESENT) )
- {
- HAP_PRINTK("Level 2 entry not present at index = %d\n", index);
- success = 0;
- }
-
- if ( l2e_get_flags(l2e[index]) & _PAGE_PSE )
- {
- paddr_t mask = ((paddr_t)1 << PT_SHIFT[mode][2]) - 1;
- HAP_PRINTK("guest page table is PSE\n");
- gpa = (l2e_get_intpte(l2e[index]) & ~mask) + (gva & mask);
- unmap_domain_page(l2e);
- break; /* last level page table, jump out from here */
- }
-
- gpfn = l2e_get_pfn(l2e[index]);
- unmap_domain_page(l2e);
- }
-
- if ( lev == 1 )
- {
- l1e = map_domain_page(mfn);
- if ( !(l1e_get_flags(l1e[index]) & _PAGE_PRESENT) )
- {
- HAP_PRINTK("Level 1 entry not present at index = %d\n", index);
- success = 0;
- }
- gpfn = l1e_get_pfn(l1e[index]);
- gpa = (l1e_get_intpte(l1e[index]) & PAGE_MASK) + (gva &~PAGE_MASK);
- unmap_domain_page(l1e);
- }
-
- if ( success != 1 ) /* error happened, jump out */
- break;
- }
-
- gpa &= PADDR_MASK;
- HAP_PRINTK("success = %d, gva = %lx, gpa = %lx\n", success, gva, gpa);
+#else
- return (!success ? INVALID_GFN : ((paddr_t)gpa >> PAGE_SHIFT));
+unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)(
+ struct vcpu *v, unsigned long gva, uint32_t *pfec)
+{
+ gdprintk(XENLOG_ERR,
+ "Guest paging level is greater than host paging level!\n");
+ domain_crash(v->domain);
+ return INVALID_GFN;
}
#endif
+
/*
* Local variables:
* mode: C
diff --git a/xen/arch/x86/mm/hap/private.h b/xen/arch/x86/mm/hap/private.h
index 00bed88db4..7b06e7df63 100644
--- a/xen/arch/x86/mm/hap/private.h
+++ b/xen/arch/x86/mm/hap/private.h
@@ -20,9 +20,6 @@
#ifndef __HAP_PRIVATE_H__
#define __HAP_PRIVATE_H__
-#include <asm/flushtlb.h>
-#include <asm/hvm/support.h>
-
/********************************************/
/* GUEST TRANSLATION FUNCS */
/********************************************/
@@ -33,36 +30,5 @@ unsigned long hap_gva_to_gfn_3level(struct vcpu *v, unsigned long gva,
unsigned long hap_gva_to_gfn_4level(struct vcpu *v, unsigned long gva,
uint32_t *pfec);
-/********************************************/
-/* MISC DEFINITIONS */
-/********************************************/
-
-/* PT_SHIFT describes the amount by which a virtual address is shifted right
- * to right justify the portion to be used for indexing into a page
- * table, given the guest memory model (i.e. number of levels) and the level
- * of the page table being accessed. The idea is from Virtual Iron's code.
- */
-static const int PT_SHIFT[][5] =
- { /* ------ level ------ nr_levels */
- /* 1 2 3 4 */
- { 0, 0, 0, 0, 0}, /* 0 not used */
- { 0, 0, 0, 0, 0}, /* 1 not used */
- { 0, 12, 22, 0, 0}, /* 2 */
- { 0, 12, 21, 30, 0}, /* 3 */
- { 0, 12, 21, 30, 39} /* 4 */
- };
-
-/* PT_ENTRIES describes the number of entries in a page table, given the
- * memory model (i.e. number of levels) and the level of the page table
- * being considered. This idea from Virtual Iron's shadow code*/
-static const int PT_ENTRIES[][5] =
- { /* ------ level ------ nr_levels */
- /* 1 2 3 4 */
- { 0, 0, 0, 0, 0}, /* 0 not used */
- { 0, 0, 0, 0, 0}, /* 1 not used */
- { 0, 1024, 1024, 0, 0}, /* 2 */
- { 0, 512, 512, 4, 0}, /* 3 */
- { 0, 512, 512, 512, 512} /* 4 */
- };
#endif /* __SVM_NPT_H__ */
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 93f9489559..d32498092e 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -748,7 +748,7 @@ static void audit_p2m(struct domain *d)
if ( test_linear && (gfn <= d->arch.p2m->max_mapped_pfn) )
{
- lp2mfn = mfn_x(gfn_to_mfn_current(gfn, &type));
+ lp2mfn = mfn_x(gfn_to_mfn(d, gfn, &type));
if ( lp2mfn != mfn_x(p2mfn) )
{
P2M_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
diff --git a/xen/arch/x86/mm/page-guest32.h b/xen/arch/x86/mm/page-guest32.h
deleted file mode 100644
index 5d333bd91b..0000000000
--- a/xen/arch/x86/mm/page-guest32.h
+++ /dev/null
@@ -1,100 +0,0 @@
-
-#ifndef __X86_PAGE_GUEST_H__
-#define __X86_PAGE_GUEST_H__
-
-#ifndef __ASSEMBLY__
-# include <asm/types.h>
-#endif
-
-#define PAGETABLE_ORDER_32 10
-#define L1_PAGETABLE_ENTRIES_32 (1<<PAGETABLE_ORDER_32)
-#define L2_PAGETABLE_ENTRIES_32 (1<<PAGETABLE_ORDER_32)
-#define ROOT_PAGETABLE_ENTRIES_32 L2_PAGETABLE_ENTRIES_32
-
-
-#define L1_PAGETABLE_SHIFT_32 12
-#define L2_PAGETABLE_SHIFT_32 22
-
-/* Extract flags into 12-bit integer, or turn 12-bit flags into a pte mask. */
-
-#ifndef __ASSEMBLY__
-
-typedef u32 intpte_32_t;
-
-typedef struct { intpte_32_t l1; } l1_pgentry_32_t;
-typedef struct { intpte_32_t l2; } l2_pgentry_32_t;
-typedef l2_pgentry_t root_pgentry_32_t;
-#endif
-
-#define get_pte_flags_32(x) ((u32)(x) & 0xFFF)
-#define put_pte_flags_32(x) ((intpte_32_t)(x))
-
-/* Get pte access flags (unsigned int). */
-#define l1e_get_flags_32(x) (get_pte_flags_32((x).l1))
-#define l2e_get_flags_32(x) (get_pte_flags_32((x).l2))
-
-#define l1e_get_paddr_32(x) \
- ((paddr_t)(((x).l1 & (PADDR_MASK&PAGE_MASK))))
-#define l2e_get_paddr_32(x) \
- ((paddr_t)(((x).l2 & (PADDR_MASK&PAGE_MASK))))
-
-/* Construct an empty pte. */
-#define l1e_empty_32() ((l1_pgentry_32_t) { 0 })
-#define l2e_empty_32() ((l2_pgentry_32_t) { 0 })
-
-/* Construct a pte from a pfn and access flags. */
-#define l1e_from_pfn_32(pfn, flags) \
- ((l1_pgentry_32_t) { ((intpte_32_t)(pfn) << PAGE_SHIFT) | put_pte_flags_32(flags) })
-#define l2e_from_pfn_32(pfn, flags) \
- ((l2_pgentry_32_t) { ((intpte_32_t)(pfn) << PAGE_SHIFT) | put_pte_flags_32(flags) })
-
-/* Construct a pte from a physical address and access flags. */
-#ifndef __ASSEMBLY__
-static inline l1_pgentry_32_t l1e_from_paddr_32(paddr_t pa, unsigned int flags)
-{
- ASSERT((pa & ~(PADDR_MASK & PAGE_MASK)) == 0);
- return (l1_pgentry_32_t) { pa | put_pte_flags_32(flags) };
-}
-static inline l2_pgentry_32_t l2e_from_paddr_32(paddr_t pa, unsigned int flags)
-{
- ASSERT((pa & ~(PADDR_MASK & PAGE_MASK)) == 0);
- return (l2_pgentry_32_t) { pa | put_pte_flags_32(flags) };
-}
-#endif /* !__ASSEMBLY__ */
-
-
-/* Construct a pte from a page pointer and access flags. */
-#define l1e_from_page_32(page, flags) (l1e_from_pfn_32(page_to_mfn(page),(flags)))
-#define l2e_from_page_32(page, flags) (l2e_from_pfn_32(page_to_mfn(page),(flags)))
-
-/* Add extra flags to an existing pte. */
-#define l1e_add_flags_32(x, flags) ((x).l1 |= put_pte_flags_32(flags))
-#define l2e_add_flags_32(x, flags) ((x).l2 |= put_pte_flags_32(flags))
-
-/* Remove flags from an existing pte. */
-#define l1e_remove_flags_32(x, flags) ((x).l1 &= ~put_pte_flags_32(flags))
-#define l2e_remove_flags_32(x, flags) ((x).l2 &= ~put_pte_flags_32(flags))
-
-/* Check if a pte's page mapping or significant access flags have changed. */
-#define l1e_has_changed_32(x,y,flags) \
- ( !!(((x).l1 ^ (y).l1) & ((PADDR_MASK&PAGE_MASK)|put_pte_flags_32(flags))) )
-#define l2e_has_changed_32(x,y,flags) \
- ( !!(((x).l2 ^ (y).l2) & ((PADDR_MASK&PAGE_MASK)|put_pte_flags_32(flags))) )
-
-/* Given a virtual address, get an entry offset into a page table. */
-#define l1_table_offset_32(a) \
- (((a) >> L1_PAGETABLE_SHIFT_32) & (L1_PAGETABLE_ENTRIES_32 - 1))
-#define l2_table_offset_32(a) \
- (((a) >> L2_PAGETABLE_SHIFT_32) & (L2_PAGETABLE_ENTRIES_32 - 1))
-
-#endif /* __X86_PAGE_GUEST_H__ */
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index f3ac8bfc39..2f2e3bf29d 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -35,6 +35,7 @@
#include <asm/hvm/hvm.h>
#include <asm/hvm/cacheattr.h>
#include <asm/mtrr.h>
+#include <asm/guest_pt.h>
#include "private.h"
#include "types.h"
@@ -156,95 +157,23 @@ delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
put_page(mfn_to_page(gmfn));
}
-/**************************************************************************/
-/* CPU feature support querying */
-
-static inline int
-guest_supports_superpages(struct vcpu *v)
-{
- /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
- * CR4.PSE is set or the guest is in PAE or long mode.
- * It's also used in the dummy PT for vcpus with CR4.PG cleared. */
- return (is_hvm_vcpu(v) &&
- (GUEST_PAGING_LEVELS != 2
- || !hvm_paging_enabled(v)
- || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
-}
-
-static inline int
-guest_supports_nx(struct vcpu *v)
-{
- if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
- return 0;
- if ( !is_hvm_vcpu(v) )
- return cpu_has_nx;
- return hvm_nx_enabled(v);
-}
-
/**************************************************************************/
/* Functions for walking the guest page tables */
-/* Flags that are needed in a pagetable entry, with the sense of NX inverted */
-static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec)
-{
- static uint32_t flags[] = {
- /* I/F - Usr Wr */
- /* 0 0 0 0 */ _PAGE_PRESENT,
- /* 0 0 0 1 */ _PAGE_PRESENT|_PAGE_RW,
- /* 0 0 1 0 */ _PAGE_PRESENT|_PAGE_USER,
- /* 0 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
- /* 0 1 0 0 */ _PAGE_PRESENT,
- /* 0 1 0 1 */ _PAGE_PRESENT|_PAGE_RW,
- /* 0 1 1 0 */ _PAGE_PRESENT|_PAGE_USER,
- /* 0 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
- /* 1 0 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
- /* 1 0 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
- /* 1 0 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
- /* 1 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
- /* 1 1 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
- /* 1 1 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
- /* 1 1 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
- /* 1 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
- };
-
- /* Don't demand not-NX if the CPU wouldn't enforce it. */
- if ( !guest_supports_nx(v) )
- pfec &= ~PFEC_insn_fetch;
-
- /* Don't demand R/W if the CPU wouldn't enforce it. */
- if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v))
- && !(pfec & PFEC_user_mode) )
- pfec &= ~PFEC_write_access;
-
- return flags[(pfec & 0x1f) >> 1];
-}
-
-/* Modify a guest pagetable entry to set the Accessed and Dirty bits.
- * Returns non-zero if it actually writes to guest memory. */
-static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
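+/* Wrapper around the common guest_walk_tables(): it supplies the
+ * top-level table that the shadow code keeps mapped (for PAE guests,
+ * the cached copy of the four l3 entries, which has no backing mfn). */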
+static inline uint32_t
+sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
+ uint32_t pfec)
{
- guest_intpte_t old, new;
- int ret = 0;
-
- old = *(guest_intpte_t *)walk_p;
- new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
- if ( old != new )
- {
- /* Write the new entry into the walk, and try to write it back
- * into the guest table as well. If the guest table has changed
- * under out feet then leave it alone. */
- *(guest_intpte_t *)walk_p = new;
- if( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old )
- ret = 1;
-
- /* FIXME -- this code is longer than necessary */
- if(set_dirty)
- TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_AD);
- else
- TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_A);
- }
- return ret;
+ return guest_walk_tables(v, va, gw, pfec,
+#if GUEST_PAGING_LEVELS == 3 /* PAE */
+ _mfn(INVALID_MFN),
+ v->arch.paging.shadow.gl3e
+#else /* 32 or 64 */
+ pagetable_get_mfn(v->arch.guest_table),
+ v->arch.paging.shadow.guest_vtable
+#endif
+ );
}
/* This validation is called with lock held, and after write permission
@@ -254,7 +183,7 @@ static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
* Return 1 to indicate success and 0 for inconsistency
*/
static inline uint32_t
-shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw)
+shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw, int version)
{
struct domain *d = v->domain;
guest_l1e_t *l1p;
@@ -267,9 +196,8 @@ shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw)
ASSERT(shadow_locked_by_me(d));
- if ( gw->version ==
- atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
- return 1;
+ if ( version == atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
+ return 1;
/* We may consider caching guest page mapping from last
* guest table walk. However considering this check happens
@@ -364,239 +292,6 @@ gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
return rc;
}
-/* Walk the guest pagetables, after the manner of a hardware walker.
- *
- * Inputs: a vcpu, a virtual address, a walk_t to fill, a
- * pointer to a pagefault code
- *
- * We walk the vcpu's guest pagetables, filling the walk_t with what we
- * see and adding any Accessed and Dirty bits that are needed in the
- * guest entries. Using the pagefault code, we check the permissions as
- * we go. For the purposes of reading pagetables we treat all non-RAM
- * memory as contining zeroes.
- *
- * The walk is done in a lock-free style, with some sanity check postponed
- * after grabbing shadow lock later. Those delayed checks will make sure
- * no inconsistent mapping being translated into shadow page table.
- *
- * Returns 0 for success, or the set of permission bits that we failed on
- * if the walk did not complete.
- * N.B. This is different from the old return code but almost no callers
- * checked the old return code anyway.
- */
-static uint32_t
-guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec)
-{
- struct domain *d = v->domain;
- p2m_type_t p2mt;
- guest_l1e_t *l1p = NULL;
- guest_l2e_t *l2p = NULL;
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
- guest_l3e_t *l3p = NULL;
- guest_l4e_t *l4p;
-#endif
- uint32_t gflags, mflags, rc = 0;
- int pse;
-
- perfc_incr(shadow_guest_walk);
- memset(gw, 0, sizeof(*gw));
- gw->va = va;
-
- gw->version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
- rmb();
-
- /* Mandatory bits that must be set in every entry. We invert NX, to
- * calculate as if there were an "X" bit that allowed access.
- * We will accumulate, in rc, the set of flags that are missing. */
- mflags = mandatory_flags(v, pfec);
-
-#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-
- /* Get the l4e from the top level table and check its flags*/
- gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
- l4p = ((guest_l4e_t *)v->arch.paging.shadow.guest_vtable);
- gw->l4e = l4p[guest_l4_table_offset(va)];
- gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
- rc |= ((gflags & mflags) ^ mflags);
- if ( rc & _PAGE_PRESENT ) goto out;
-
- /* Map the l3 table */
- gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
- if ( !p2m_is_ram(p2mt) )
- {
- rc |= _PAGE_PRESENT;
- goto out;
- }
- ASSERT(mfn_valid(gw->l3mfn));
-
- /* Get the l3e and check its flags*/
- l3p = sh_map_domain_page(gw->l3mfn);
- gw->l3e = l3p[guest_l3_table_offset(va)];
- gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
- rc |= ((gflags & mflags) ^ mflags);
- if ( rc & _PAGE_PRESENT )
- goto out;
-
-#else /* PAE only... */
-
- /* Get l3e from the cache of the top level table and check its flag */
- gw->l3e = v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
- if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) )
- {
- rc |= _PAGE_PRESENT;
- goto out;
- }
-
-#endif /* PAE or 64... */
-
- /* Map the l2 table */
- gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
- if ( !p2m_is_ram(p2mt) )
- {
- rc |= _PAGE_PRESENT;
- goto out;
- }
- ASSERT(mfn_valid(gw->l2mfn));
-
- /* Get the l2e */
- l2p = sh_map_domain_page(gw->l2mfn);
- gw->l2e = l2p[guest_l2_table_offset(va)];
-
-#else /* 32-bit only... */
-
- /* Get l2e from the top level table */
- gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
- l2p = ((guest_l2e_t *)v->arch.paging.shadow.guest_vtable);
- gw->l2e = l2p[guest_l2_table_offset(va)];
-
-#endif /* All levels... */
-
- gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
- rc |= ((gflags & mflags) ^ mflags);
- if ( rc & _PAGE_PRESENT )
- goto out;
-
- pse = (guest_supports_superpages(v) &&
- (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE));
-
- if ( pse )
- {
- /* Special case: this guest VA is in a PSE superpage, so there's
- * no guest l1e. We make one up so that the propagation code
- * can generate a shadow l1 table. Start with the gfn of the
- * first 4k-page of the superpage. */
- gfn_t start = guest_l2e_get_gfn(gw->l2e);
- /* Grant full access in the l1e, since all the guest entry's
- * access controls are enforced in the shadow l2e. */
- int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
- _PAGE_ACCESSED|_PAGE_DIRTY);
- /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
- * of the level 1. */
- if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) )
- flags |= _PAGE_PAT;
- /* Copy the cache-control bits to the l1 as well, because we
- * can't represent PAT in the (non-PSE) shadow l2e. :(
- * This could cause problems if a guest ever maps an area of
- * memory with superpages using more than one caching mode. */
- flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
- /* Increment the pfn by the right number of 4k pages.
- * The ~0x1 is to mask out the PAT bit mentioned above. */
- start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
- gw->l1e = guest_l1e_from_gfn(start, flags);
- gw->l1mfn = _mfn(INVALID_MFN);
- }
- else
- {
- /* Not a superpage: carry on and find the l1e. */
- gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
- if ( !p2m_is_ram(p2mt) )
- {
- rc |= _PAGE_PRESENT;
- goto out;
- }
- ASSERT(mfn_valid(gw->l1mfn));
- l1p = sh_map_domain_page(gw->l1mfn);
- gw->l1e = l1p[guest_l1_table_offset(va)];
- gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
- rc |= ((gflags & mflags) ^ mflags);
- }
-
- /* Go back and set accessed and dirty bits only if the walk was a
- * success. Although the PRMs say higher-level _PAGE_ACCESSED bits
- * get set whenever a lower-level PT is used, at least some hardware
- * walkers behave this way. */
- if ( rc == 0 )
- {
-#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
- if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
- paging_mark_dirty(d, mfn_x(gw->l4mfn));
- if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
- paging_mark_dirty(d, mfn_x(gw->l3mfn));
-#endif
- if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
- (pse && (pfec & PFEC_write_access))) )
- paging_mark_dirty(d, mfn_x(gw->l2mfn));
- if ( !pse )
- {
- if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e,
- (pfec & PFEC_write_access)) )
- paging_mark_dirty(d, mfn_x(gw->l1mfn));
- }
- }
-
- out:
-#if GUEST_PAGING_LEVELS == 4
- if ( l3p ) sh_unmap_domain_page(l3p);
-#endif
-#if GUEST_PAGING_LEVELS >= 3
- if ( l2p ) sh_unmap_domain_page(l2p);
-#endif
- if ( l1p ) sh_unmap_domain_page(l1p);
-
- return rc;
-}
-
-/* Given a walk_t, translate the gw->va into the guest's notion of the
- * corresponding frame number. */
-static inline gfn_t
-guest_walk_to_gfn(walk_t *gw)
-{
- if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
- return _gfn(INVALID_GFN);
- return guest_l1e_get_gfn(gw->l1e);
-}
-
-/* Given a walk_t, translate the gw->va into the guest's notion of the
- * corresponding physical address. */
-static inline paddr_t
-guest_walk_to_gpa(walk_t *gw)
-{
- if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
- return 0;
- return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
-}
-
-#if 0 /* Keep for debugging */
-/* Pretty-print the contents of a guest-walk */
-static inline void print_gw(walk_t *gw)
-{
- SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
-#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
- SHADOW_PRINTK(" l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
- SHADOW_PRINTK(" l4e=%" SH_PRI_gpte "\n", gw->l4e.l4);
- SHADOW_PRINTK(" l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
-#endif /* PAE or 64... */
- SHADOW_PRINTK(" l3e=%" SH_PRI_gpte "\n", gw->l3e.l3);
-#endif /* All levels... */
- SHADOW_PRINTK(" l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
- SHADOW_PRINTK(" l2e=%" SH_PRI_gpte "\n", gw->l2e.l2);
- SHADOW_PRINTK(" l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
- SHADOW_PRINTK(" l1e=%" SH_PRI_gpte "\n", gw->l1e.l1);
-}
-#endif /* 0 */
-
#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
/* Lightweight audit: pass all the shadows associated with this guest walk
* through the audit mechanisms */
@@ -657,7 +352,7 @@ sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
// XXX -- this is expensive, but it's easy to cobble together...
// FIXME!
- if ( guest_walk_tables(v, addr, &gw, PFEC_page_present) == 0
+ if ( sh_walk_guest_tables(v, addr, &gw, PFEC_page_present) == 0
&& mfn_valid(gw.l1mfn) )
{
if ( gl1mfn )
@@ -679,7 +374,7 @@ sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
// XXX -- this is expensive, but it's easy to cobble together...
// FIXME!
- (void) guest_walk_tables(v, addr, &gw, PFEC_page_present);
+ (void) sh_walk_guest_tables(v, addr, &gw, PFEC_page_present);
*(guest_l1e_t *)eff_l1e = gw.l1e;
}
#endif /* CONFIG == GUEST (== SHADOW) */
@@ -1171,9 +866,6 @@ static int shadow_set_l4e(struct vcpu *v,
domain_crash(v->domain);
return SHADOW_SET_ERROR;
}
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
- shadow_resync_all(v, 0);
-#endif
}
/* Write the new entry */
@@ -1219,9 +911,6 @@ static int shadow_set_l3e(struct vcpu *v,
domain_crash(v->domain);
return SHADOW_SET_ERROR;
}
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
- shadow_resync_all(v, 0);
-#endif
}
/* Write the new entry */
@@ -2021,7 +1710,8 @@ static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
walk_t *gw,
mfn_t *sl3mfn,
- fetch_type_t ft)
+ fetch_type_t ft,
+ int *resync)
{
mfn_t sl4mfn;
shadow_l4e_t *sl4e;
@@ -2051,6 +1741,11 @@ static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
ASSERT((r & SHADOW_SET_FLUSH) == 0);
if ( r & SHADOW_SET_ERROR )
return NULL;
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+ *resync |= 1;
+#endif
+
}
/* Now follow it down a level. Guaranteed to succeed. */
return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
@@ -2061,14 +1756,15 @@ static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
walk_t *gw,
mfn_t *sl2mfn,
- fetch_type_t ft)
+ fetch_type_t ft,
+ int *resync)
{
#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
mfn_t sl3mfn = _mfn(INVALID_MFN);
shadow_l3e_t *sl3e;
if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
/* Get the l3e */
- sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
+ sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft, resync);
if ( sl3e == NULL ) return NULL;
if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
{
@@ -2100,6 +1796,11 @@ static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
ASSERT((r & SHADOW_SET_FLUSH) == 0);
if ( r & SHADOW_SET_ERROR )
return NULL;
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+ *resync |= 1;
+#endif
+
}
/* Now follow it down a level. Guaranteed to succeed. */
return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
@@ -2132,11 +1833,13 @@ static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
fetch_type_t ft)
{
mfn_t sl2mfn;
+ int resync = 0;
shadow_l2e_t *sl2e;
/* Get the l2e */
- sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
+ sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft, &resync);
if ( sl2e == NULL ) return NULL;
+
/* Install the sl1 in the l2e if it wasn't there or if we need to
* re-do it to fix a PSE dirty bit. */
if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT
@@ -2182,6 +1885,14 @@ static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
ASSERT((r & SHADOW_SET_FLUSH) == 0);
if ( r & SHADOW_SET_ERROR )
return NULL;
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+    /* All pages walked are now pagetables.  It is safe to resync pages
+     * in case level 4 or 3 shadows were set. */
+ if ( resync )
+ shadow_resync_all(v, 0);
+#endif
+
/* This next line is important: in 32-on-PAE and 32-on-64 modes,
* the guest l1 table has an 8k shadow, and we need to return
* the right mfn of the pair. This call will set it for us as a
@@ -2463,6 +2174,10 @@ static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
else
result |= SHADOW_SET_ERROR;
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+ shadow_resync_all(v, 0);
+#endif
}
l4e_propagate_from_guest(v, new_gl4e, sl3mfn, &new_sl4e, ft_prefetch);
@@ -2515,6 +2230,10 @@ static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
else
result |= SHADOW_SET_ERROR;
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+ shadow_resync_all(v, 0);
+#endif
}
l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch);
result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
@@ -3173,6 +2892,7 @@ static int sh_page_fault(struct vcpu *v,
fetch_type_t ft = 0;
p2m_type_t p2mt;
uint32_t rc;
+ int version;
#if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
int fast_emul = 0;
#endif
@@ -3316,7 +3036,14 @@ static int sh_page_fault(struct vcpu *v,
}
rewalk:
- rc = guest_walk_tables(v, va, &gw, regs->error_code);
+
+    /* The walk is done in a lock-free style, with some sanity checks
+     * postponed until after the shadow lock is taken.  Those delayed
+     * checks will make sure that no inconsistent mapping is translated
+     * into the shadow page tables. */
+ version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
+ rmb();
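+    /* The version must be read before any of the guest entries: if it is
+     * still current when re-checked under the shadow lock, the walk saw a
+     * consistent set of tables. */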
+ rc = sh_walk_guest_tables(v, va, &gw, regs->error_code);
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
regs->error_code &= ~PFEC_page_present;
@@ -3392,7 +3119,7 @@ static int sh_page_fault(struct vcpu *v,
}
#endif /* OOS */
- if ( !shadow_check_gwalk(v, va, &gw) )
+ if ( !shadow_check_gwalk(v, va, &gw, version) )
{
perfc_incr(shadow_inconsistent_gwalk);
shadow_unlock(d);
@@ -3869,7 +3596,7 @@ sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
return vtlb_gfn;
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
- if ( guest_walk_tables(v, va, &gw, pfec[0]) != 0 )
+ if ( sh_walk_guest_tables(v, va, &gw, pfec[0]) != 0 )
{
if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
pfec[0] &= ~PFEC_page_present;
diff --git a/xen/arch/x86/mm/shadow/types.h b/xen/arch/x86/mm/shadow/types.h
index 440d2d31fb..f9f0554c47 100644
--- a/xen/arch/x86/mm/shadow/types.h
+++ b/xen/arch/x86/mm/shadow/types.h
@@ -191,169 +191,13 @@ static inline shadow_l4e_t shadow_l4e_from_mfn(mfn_t mfn, u32 flags)
})
#endif
-
-/* Type of the guest's frame numbers */
-TYPE_SAFE(unsigned long,gfn)
-#define SH_PRI_gfn "05lx"
-
-#define VALID_GFN(m) (m != INVALID_GFN)
-
-static inline int
-valid_gfn(gfn_t m)
-{
- return VALID_GFN(gfn_x(m));
-}
-
-static inline paddr_t
-gfn_to_paddr(gfn_t gfn)
-{
- return ((paddr_t)gfn_x(gfn)) << PAGE_SHIFT;
-}
-
-/* Override gfn_to_mfn to work with gfn_t */
-#undef gfn_to_mfn
-#define gfn_to_mfn(d, g, t) _gfn_to_mfn((d), gfn_x(g), (t))
+/* The shadow types needed for the various levels. */
#if GUEST_PAGING_LEVELS == 2
-
-#include "../page-guest32.h"
-
-#define GUEST_L1_PAGETABLE_ENTRIES 1024
-#define GUEST_L2_PAGETABLE_ENTRIES 1024
-#define GUEST_L1_PAGETABLE_SHIFT 12
-#define GUEST_L2_PAGETABLE_SHIFT 22
-
-/* Types of the guest's page tables */
-typedef l1_pgentry_32_t guest_l1e_t;
-typedef l2_pgentry_32_t guest_l2e_t;
-typedef intpte_32_t guest_intpte_t;
-
-/* Access functions for them */
-static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e)
-{ return l1e_get_paddr_32(gl1e); }
-static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e)
-{ return l2e_get_paddr_32(gl2e); }
-
-static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e)
-{ return _gfn(l1e_get_paddr_32(gl1e) >> PAGE_SHIFT); }
-static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e)
-{ return _gfn(l2e_get_paddr_32(gl2e) >> PAGE_SHIFT); }
-
-static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e)
-{ return l1e_get_flags_32(gl1e); }
-static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e)
-{ return l2e_get_flags_32(gl2e); }
-
-static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags)
-{ l1e_add_flags_32(gl1e, flags); return gl1e; }
-static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags)
-{ l2e_add_flags_32(gl2e, flags); return gl2e; }
-
-static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
-{ return l1e_from_pfn_32(gfn_x(gfn), flags); }
-static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
-{ return l2e_from_pfn_32(gfn_x(gfn), flags); }
-
-#define guest_l1_table_offset(a) l1_table_offset_32(a)
-#define guest_l2_table_offset(a) l2_table_offset_32(a)
-
-/* The shadow types needed for the various levels. */
#define SH_type_l1_shadow SH_type_l1_32_shadow
#define SH_type_l2_shadow SH_type_l2_32_shadow
#define SH_type_fl1_shadow SH_type_fl1_32_shadow
-
-#else /* GUEST_PAGING_LEVELS != 2 */
-
-#if GUEST_PAGING_LEVELS == 3
-#define GUEST_L1_PAGETABLE_ENTRIES 512
-#define GUEST_L2_PAGETABLE_ENTRIES 512
-#define GUEST_L3_PAGETABLE_ENTRIES 4
-#define GUEST_L1_PAGETABLE_SHIFT 12
-#define GUEST_L2_PAGETABLE_SHIFT 21
-#define GUEST_L3_PAGETABLE_SHIFT 30
-#else /* GUEST_PAGING_LEVELS == 4 */
-#define GUEST_L1_PAGETABLE_ENTRIES 512
-#define GUEST_L2_PAGETABLE_ENTRIES 512
-#define GUEST_L3_PAGETABLE_ENTRIES 512
-#define GUEST_L4_PAGETABLE_ENTRIES 512
-#define GUEST_L1_PAGETABLE_SHIFT 12
-#define GUEST_L2_PAGETABLE_SHIFT 21
-#define GUEST_L3_PAGETABLE_SHIFT 30
-#define GUEST_L4_PAGETABLE_SHIFT 39
-#endif
-
-/* Types of the guest's page tables */
-typedef l1_pgentry_t guest_l1e_t;
-typedef l2_pgentry_t guest_l2e_t;
-typedef l3_pgentry_t guest_l3e_t;
-#if GUEST_PAGING_LEVELS >= 4
-typedef l4_pgentry_t guest_l4e_t;
-#endif
-typedef intpte_t guest_intpte_t;
-
-/* Access functions for them */
-static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e)
-{ return l1e_get_paddr(gl1e); }
-static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e)
-{ return l2e_get_paddr(gl2e); }
-static inline paddr_t guest_l3e_get_paddr(guest_l3e_t gl3e)
-{ return l3e_get_paddr(gl3e); }
-#if GUEST_PAGING_LEVELS >= 4
-static inline paddr_t guest_l4e_get_paddr(guest_l4e_t gl4e)
-{ return l4e_get_paddr(gl4e); }
-#endif
-
-static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e)
-{ return _gfn(l1e_get_paddr(gl1e) >> PAGE_SHIFT); }
-static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e)
-{ return _gfn(l2e_get_paddr(gl2e) >> PAGE_SHIFT); }
-static inline gfn_t guest_l3e_get_gfn(guest_l3e_t gl3e)
-{ return _gfn(l3e_get_paddr(gl3e) >> PAGE_SHIFT); }
-#if GUEST_PAGING_LEVELS >= 4
-static inline gfn_t guest_l4e_get_gfn(guest_l4e_t gl4e)
-{ return _gfn(l4e_get_paddr(gl4e) >> PAGE_SHIFT); }
-#endif
-
-static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e)
-{ return l1e_get_flags(gl1e); }
-static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e)
-{ return l2e_get_flags(gl2e); }
-static inline u32 guest_l3e_get_flags(guest_l3e_t gl3e)
-{ return l3e_get_flags(gl3e); }
-#if GUEST_PAGING_LEVELS >= 4
-static inline u32 guest_l4e_get_flags(guest_l4e_t gl4e)
-{ return l4e_get_flags(gl4e); }
-#endif
-
-static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags)
-{ l1e_add_flags(gl1e, flags); return gl1e; }
-static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags)
-{ l2e_add_flags(gl2e, flags); return gl2e; }
-static inline guest_l3e_t guest_l3e_add_flags(guest_l3e_t gl3e, u32 flags)
-{ l3e_add_flags(gl3e, flags); return gl3e; }
-#if GUEST_PAGING_LEVELS >= 4
-static inline guest_l4e_t guest_l4e_add_flags(guest_l4e_t gl4e, u32 flags)
-{ l4e_add_flags(gl4e, flags); return gl4e; }
-#endif
-
-static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
-{ return l1e_from_pfn(gfn_x(gfn), flags); }
-static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
-{ return l2e_from_pfn(gfn_x(gfn), flags); }
-static inline guest_l3e_t guest_l3e_from_gfn(gfn_t gfn, u32 flags)
-{ return l3e_from_pfn(gfn_x(gfn), flags); }
-#if GUEST_PAGING_LEVELS >= 4
-static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags)
-{ return l4e_from_pfn(gfn_x(gfn), flags); }
-#endif
-
-#define guest_l1_table_offset(a) l1_table_offset(a)
-#define guest_l2_table_offset(a) l2_table_offset(a)
-#define guest_l3_table_offset(a) l3_table_offset(a)
-#define guest_l4_table_offset(a) l4_table_offset(a)
-
-/* The shadow types needed for the various levels. */
-#if GUEST_PAGING_LEVELS == 3
+#elif GUEST_PAGING_LEVELS == 3
#define SH_type_l1_shadow SH_type_l1_pae_shadow
#define SH_type_fl1_shadow SH_type_fl1_pae_shadow
#define SH_type_l2_shadow SH_type_l2_pae_shadow
@@ -367,35 +211,6 @@ static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags)
#define SH_type_l4_shadow SH_type_l4_64_shadow
#endif
-#endif /* GUEST_PAGING_LEVELS != 2 */
-
-
-/* Type used for recording a walk through guest pagetables. It is
- * filled in by the pagetable walk function, and also used as a cache
- * for later walks. When we encounter a suporpage l2e, we fabricate an
- * l1e for propagation to the shadow (for splintering guest superpages
- * into many shadow l1 entries). */
-typedef struct shadow_walk_t walk_t;
-struct shadow_walk_t
-{
- unsigned long va; /* Address we were looking for */
-#if GUEST_PAGING_LEVELS >= 3
-#if GUEST_PAGING_LEVELS >= 4
- guest_l4e_t l4e; /* Guest's level 4 entry */
-#endif
- guest_l3e_t l3e; /* Guest's level 3 entry */
-#endif
- guest_l2e_t l2e; /* Guest's level 2 entry */
- guest_l1e_t l1e; /* Guest's level 1 entry (or fabrication) */
-#if GUEST_PAGING_LEVELS >= 4
- mfn_t l4mfn; /* MFN that the level 4 entry was in */
- mfn_t l3mfn; /* MFN that the level 3 entry was in */
-#endif
- mfn_t l2mfn; /* MFN that the level 2 entry was in */
- mfn_t l1mfn; /* MFN that the level 1 entry was in */
- int version; /* Saved guest dirty version */
-};
-
/* macros for dealing with the naming of the internal function names of the
* shadow code's external entry points.
*/
@@ -460,17 +275,9 @@ struct shadow_walk_t
#define MFN_FITS_IN_HVM_CR3(_MFN) !(mfn_x(_MFN) >> 20)
#endif
-#define SH_PRI_pte PRIpte
-
-#if GUEST_PAGING_LEVELS == 2
-#define SH_PRI_gpte "08x"
-#else /* GUEST_PAGING_LEVELS >= 3 */
-#ifndef __x86_64__
-#define SH_PRI_gpte "016llx"
-#else
-#define SH_PRI_gpte "016lx"
-#endif
-#endif /* GUEST_PAGING_LEVELS >= 3 */
+#define SH_PRI_pte PRIpte
+#define SH_PRI_gpte PRI_gpte
+#define SH_PRI_gfn PRI_gfn
#if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c
index 8bf5b4a8e3..05a3e47d90 100644
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -212,9 +212,9 @@ static void write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
entry->msg = *msg;
}
-void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+void set_msi_affinity(unsigned int vector, cpumask_t mask)
{
- struct msi_desc *desc = irq_desc[irq].msi_desc;
+ struct msi_desc *desc = irq_desc[vector].msi_desc;
struct msi_msg msg;
unsigned int dest;
@@ -227,7 +227,7 @@ void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
if ( !desc )
return;
- ASSERT(spin_is_locked(&irq_desc[irq].lock));
+ ASSERT(spin_is_locked(&irq_desc[vector].lock));
spin_lock(&desc->dev->lock);
read_msi_msg(desc, &msg);
@@ -276,9 +276,9 @@ static void msix_set_enable(struct pci_dev *dev, int enable)
}
}
-static void msix_flush_writes(unsigned int irq)
+static void msix_flush_writes(unsigned int vector)
{
- struct msi_desc *entry = irq_desc[irq].msi_desc;
+ struct msi_desc *entry = irq_desc[vector].msi_desc;
BUG_ON(!entry || !entry->dev);
switch (entry->msi_attrib.type) {
@@ -298,11 +298,18 @@ static void msix_flush_writes(unsigned int irq)
}
}
-static void msi_set_mask_bit(unsigned int irq, int flag)
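+/* An MSI-X vector is always maskable; a plain MSI vector is maskable
+ * only if the device advertises the per-vector mask capability. */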
+int msi_maskable_irq(const struct msi_desc *entry)
{
- struct msi_desc *entry = irq_desc[irq].msi_desc;
+ BUG_ON(!entry);
+ return entry->msi_attrib.type != PCI_CAP_ID_MSI
+ || entry->msi_attrib.maskbit;
+}
- ASSERT(spin_is_locked(&irq_desc[irq].lock));
+static void msi_set_mask_bit(unsigned int vector, int flag)
+{
+ struct msi_desc *entry = irq_desc[vector].msi_desc;
+
+ ASSERT(spin_is_locked(&irq_desc[vector].lock));
BUG_ON(!entry || !entry->dev);
switch (entry->msi_attrib.type) {
case PCI_CAP_ID_MSI:
@@ -318,8 +325,6 @@ static void msi_set_mask_bit(unsigned int irq, int flag)
mask_bits &= ~(1);
mask_bits |= flag;
pci_conf_write32(bus, slot, func, pos, mask_bits);
- } else {
- msi_set_enable(entry->dev, !flag);
}
break;
case PCI_CAP_ID_MSIX:
@@ -337,16 +342,16 @@ static void msi_set_mask_bit(unsigned int irq, int flag)
entry->msi_attrib.masked = !!flag;
}
-void mask_msi_irq(unsigned int irq)
+void mask_msi_vector(unsigned int vector)
{
- msi_set_mask_bit(irq, 1);
- msix_flush_writes(irq);
+ msi_set_mask_bit(vector, 1);
+ msix_flush_writes(vector);
}
-void unmask_msi_irq(unsigned int irq)
+void unmask_msi_vector(unsigned int vector)
{
- msi_set_mask_bit(irq, 0);
- msix_flush_writes(irq);
+ msi_set_mask_bit(vector, 0);
+ msix_flush_writes(vector);
}
static struct msi_desc* alloc_msi_entry(void)
@@ -649,7 +654,7 @@ static int __pci_enable_msix(struct msi_info *msi)
pos = pci_find_cap_offset(msi->bus, slot, func, PCI_CAP_ID_MSIX);
control = pci_conf_read16(msi->bus, slot, func, msi_control_reg(pos));
nr_entries = multi_msix_capable(control);
- if (msi->entry_nr > nr_entries)
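+    /* entry_nr is a zero-based table index, so it must be strictly less
+     * than the number of entries. */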
+ if (msi->entry_nr >= nr_entries)
{
spin_unlock(&pdev->lock);
return -EINVAL;
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 4bdae0fcd1..833f22e5ba 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -230,7 +230,6 @@ static void __init percpu_init_areas(void)
static void __init init_idle_domain(void)
{
struct domain *idle_domain;
- unsigned int i;
/* Domain creation requires that scheduler structures are initialised. */
scheduler_init();
@@ -243,12 +242,6 @@ static void __init init_idle_domain(void)
idle_vcpu[0] = this_cpu(curr_vcpu) = current;
setup_idle_pagetable();
-
- for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
- idle_domain->arch.mm_perdomain_pt[FIRST_RESERVED_GDT_PAGE + i] =
- l1e_from_page(virt_to_page(boot_cpu_gdt_table) + i,
- __PAGE_HYPERVISOR);
-
}
static void __init srat_detect_node(int cpu)
@@ -456,6 +449,7 @@ void __init __start_xen(unsigned long mbi_p)
parse_video_info();
set_current((struct vcpu *)0xfffff000); /* debug sanity */
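+    /* Point idle_vcpu[0] at the same sentinel; presumably so that early
+     * comparisons against idle_vcpu[0] are consistent before the real
+     * idle vcpu is created. */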
+ idle_vcpu[0] = current;
set_processor_id(0); /* needed early, for smp_processor_id() */
if ( cpu_has_efer )
rdmsrl(MSR_EFER, this_cpu(efer));
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 73c115d44a..2382fc3da7 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -821,7 +821,7 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
*/
{
unsigned long boot_error;
- unsigned int i;
+ unsigned int order;
int timeout;
unsigned long start_eip;
unsigned short nmi_high = 0, nmi_low = 0;
@@ -857,21 +857,21 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
gdt = per_cpu(gdt_table, cpu);
if (gdt == boot_cpu_gdt_table) {
- i = get_order_from_pages(NR_RESERVED_GDT_PAGES);
+ order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
#ifdef __x86_64__
#ifdef CONFIG_COMPAT
- page = alloc_domheap_pages(NULL, i,
+ page = alloc_domheap_pages(NULL, order,
MEMF_node(cpu_to_node(cpu)));
per_cpu(compat_gdt_table, cpu) = gdt = page_to_virt(page);
memcpy(gdt, boot_cpu_compat_gdt_table,
NR_RESERVED_GDT_PAGES * PAGE_SIZE);
gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
#endif
- page = alloc_domheap_pages(NULL, i,
+ page = alloc_domheap_pages(NULL, order,
MEMF_node(cpu_to_node(cpu)));
per_cpu(gdt_table, cpu) = gdt = page_to_virt(page);
#else
- per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(i);
+ per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order);
#endif
memcpy(gdt, boot_cpu_gdt_table,
NR_RESERVED_GDT_PAGES * PAGE_SIZE);
@@ -879,13 +879,6 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
}
- for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
- v->domain->arch.mm_perdomain_pt
- [(v->vcpu_id << GDT_LDT_VCPU_SHIFT) +
- FIRST_RESERVED_GDT_PAGE + i]
- = l1e_from_page(virt_to_page(gdt) + i,
- __PAGE_HYPERVISOR);
-
#ifdef __i386__
if (!per_cpu(doublefault_tss, cpu)) {
per_cpu(doublefault_tss, cpu) = alloc_xenheap_page();
diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c
index 8e86e7180c..037811fde8 100644
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -56,9 +56,12 @@ struct cpu_time {
};
struct platform_timesource {
+ char *id;
char *name;
u64 frequency;
u64 (*read_counter)(void);
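+    /* init() probes the timer, returning nonzero if it is usable (and may
+     * update the frequency); resume(), if present, re-initialises the
+     * timer on return from S3. */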
+ int (*init)(struct platform_timesource *);
+ void (*resume)(struct platform_timesource *);
int counter_bits;
};
@@ -360,15 +363,22 @@ static u64 read_pit_count(void)
return count32;
}
-static void init_pit(struct platform_timesource *pts)
+static int init_pit(struct platform_timesource *pts)
{
- pts->name = "PIT";
- pts->frequency = CLOCK_TICK_RATE;
- pts->read_counter = read_pit_count;
- pts->counter_bits = 32;
using_pit = 1;
+ return 1;
}
+static struct platform_timesource plt_pit =
+{
+ .id = "pit",
+ .name = "PIT",
+ .frequency = CLOCK_TICK_RATE,
+ .read_counter = read_pit_count,
+ .counter_bits = 32,
+ .init = init_pit
+};
+
/************************************************************
* PLATFORM TIMER 2: HIGH PRECISION EVENT TIMER (HPET)
*/
@@ -385,14 +395,28 @@ static int init_hpet(struct platform_timesource *pts)
if ( hpet_rate == 0 )
return 0;
- pts->name = "HPET";
pts->frequency = hpet_rate;
- pts->read_counter = read_hpet_count;
- pts->counter_bits = 32;
-
return 1;
}
+static void resume_hpet(struct platform_timesource *pts)
+{
+ u64 hpet_rate = hpet_setup();
+
+ BUG_ON(hpet_rate == 0);
+ pts->frequency = hpet_rate;
+}
+
+static struct platform_timesource plt_hpet =
+{
+ .id = "hpet",
+ .name = "HPET",
+ .read_counter = read_hpet_count,
+ .counter_bits = 32,
+ .init = init_hpet,
+ .resume = resume_hpet
+};
+
/************************************************************
* PLATFORM TIMER 3: IBM 'CYCLONE' TIMER
*/
@@ -440,20 +464,24 @@ static int init_cyclone(struct platform_timesource *pts)
printk(KERN_ERR "Cyclone: Could not find valid CBAR value.\n");
return 0;
}
-
+
/* Enable timer and map the counter register. */
*(map_cyclone_reg(base + CYCLONE_PMCC_OFFSET)) = 1;
*(map_cyclone_reg(base + CYCLONE_MPCS_OFFSET)) = 1;
cyclone_timer = map_cyclone_reg(base + CYCLONE_MPMC_OFFSET);
-
- pts->name = "IBM Cyclone";
- pts->frequency = CYCLONE_TIMER_FREQ;
- pts->read_counter = read_cyclone_count;
- pts->counter_bits = 32;
-
return 1;
}
+static struct platform_timesource plt_cyclone =
+{
+ .id = "cyclone",
+ .name = "IBM Cyclone",
+ .frequency = CYCLONE_TIMER_FREQ,
+ .read_counter = read_cyclone_count,
+ .counter_bits = 32,
+ .init = init_cyclone
+};
+
/************************************************************
* PLATFORM TIMER 4: ACPI PM TIMER
*/
@@ -473,14 +501,19 @@ static int init_pmtimer(struct platform_timesource *pts)
if ( pmtmr_ioport == 0 )
return 0;
- pts->name = "ACPI PM Timer";
- pts->frequency = ACPI_PM_FREQUENCY;
- pts->read_counter = read_pmtimer_count;
- pts->counter_bits = 24;
-
return 1;
}
+static struct platform_timesource plt_pmtimer =
+{
+ .id = "acpi",
+ .name = "ACPI PM Timer",
+ .frequency = ACPI_PM_FREQUENCY,
+ .read_counter = read_pmtimer_count,
+ .counter_bits = 24,
+ .init = init_pmtimer
+};
+
/************************************************************
* GENERIC PLATFORM TIMER INFRASTRUCTURE
*/
@@ -548,26 +581,34 @@ static void platform_time_calibration(void)
static void resume_platform_timer(void)
{
- /* No change in platform_stime across suspend/resume. */
- platform_timer_stamp = plt_stamp64;
+    /* The timer source may have been reset on the way back from S3 to S0. */
+ if ( plt_src.resume )
+ plt_src.resume(&plt_src);
+
+ plt_stamp64 = platform_timer_stamp;
plt_stamp = plt_src.read_counter();
}
static void init_platform_timer(void)
{
- struct platform_timesource *pts = &plt_src;
- int rc = -1;
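+    /* Listed in order of preference: unless one is named explicitly via
+     * "clocksource=", the first timesource whose init() succeeds is used. */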
+ static struct platform_timesource * const plt_timers[] = {
+ &plt_cyclone, &plt_hpet, &plt_pmtimer, &plt_pit
+ };
+
+ struct platform_timesource *pts = NULL;
+ int i, rc = -1;
if ( opt_clocksource[0] != '\0' )
{
- if ( !strcmp(opt_clocksource, "pit") )
- rc = (init_pit(pts), 1);
- else if ( !strcmp(opt_clocksource, "hpet") )
- rc = init_hpet(pts);
- else if ( !strcmp(opt_clocksource, "cyclone") )
- rc = init_cyclone(pts);
- else if ( !strcmp(opt_clocksource, "acpi") )
- rc = init_pmtimer(pts);
+ for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
+ {
+ pts = plt_timers[i];
+ if ( !strcmp(opt_clocksource, pts->id) )
+ {
+ rc = pts->init(pts);
+ break;
+ }
+ }
if ( rc <= 0 )
printk("WARNING: %s clocksource '%s'.\n",
@@ -575,11 +616,17 @@ static void init_platform_timer(void)
opt_clocksource);
}
- if ( (rc <= 0) &&
- !init_cyclone(pts) &&
- !init_hpet(pts) &&
- !init_pmtimer(pts) )
- init_pit(pts);
+ if ( rc <= 0 )
+ {
+ for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
+ {
+ pts = plt_timers[i];
+ if ( (rc = pts->init(pts)) > 0 )
+ break;
+ }
+ }
+
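+    /* init_pit() always succeeds, so the fallback loop cannot fail. */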
+ BUG_ON(rc <= 0);
plt_mask = (u64)~0ull >> (64 - pts->counter_bits);
@@ -588,6 +635,7 @@ static void init_platform_timer(void)
plt_overflow_period = scale_delta(
1ull << (pts->counter_bits-1), &plt_scale);
init_timer(&plt_overflow_timer, plt_overflow, NULL, 0);
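+    /* Commit the chosen timesource to the global plt_src; the overflow
+     * handler and the resume path read it from there. */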
+ plt_src = *pts;
plt_overflow(NULL);
platform_timer_stamp = plt_stamp64;
@@ -1172,6 +1220,9 @@ int time_suspend(void)
cmos_utc_offset = -get_cmos_time();
cmos_utc_offset += (wc_sec + (wc_nsec + NOW()) / 1000000000ULL);
kill_timer(&calibration_timer);
+
+ /* Sync platform timer stamps. */
+ platform_time_calibration();
}
/* Better to cancel calibration timer for accuracy. */
@@ -1184,19 +1235,18 @@ int time_resume(void)
{
/*u64 tmp = */init_pit_and_calibrate_tsc();
- disable_pit_irq();
-
/* Disable this while calibrate_tsc_ap() also is skipped. */
/*set_time_scale(&this_cpu(cpu_time).tsc_scale, tmp);*/
resume_platform_timer();
+ disable_pit_irq();
+
init_percpu_time();
do_settime(get_cmos_time() + cmos_utc_offset, 0, NOW());
- if ( !is_idle_vcpu(current) )
- update_vcpu_system_time(current);
+ update_vcpu_system_time(current);
return 0;
}
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index c4e9d30597..a315ea1c8e 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -723,7 +723,8 @@ static void pv_cpuid(struct cpu_user_regs *regs)
{
/* Modify Feature Information. */
__clear_bit(X86_FEATURE_VME, &d);
- __clear_bit(X86_FEATURE_PSE, &d);
+ if ( !opt_allow_hugepage )
+ __clear_bit(X86_FEATURE_PSE, &d);
__clear_bit(X86_FEATURE_PGE, &d);
__clear_bit(X86_FEATURE_MCE, &d);
__clear_bit(X86_FEATURE_MCA, &d);
@@ -754,6 +755,7 @@ static void pv_cpuid(struct cpu_user_regs *regs)
__clear_bit(X86_FEATURE_XTPR % 32, &c);
__clear_bit(X86_FEATURE_PDCM % 32, &c);
__clear_bit(X86_FEATURE_DCA % 32, &c);
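+        /* Advertise to the guest that it is running on a hypervisor. */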
+ __set_bit(X86_FEATURE_HYPERVISOR % 32, &c);
break;
case 0x80000001:
/* Modify Feature Information. */
@@ -2003,9 +2005,12 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
case 4: /* Read CR4 */
/*
* Guests can read CR4 to see what features Xen has enabled. We
- * therefore lie about PGE & PSE as they are unavailable to guests.
+ * therefore lie about PGE as it is unavailable to guests.
+ * Also disallow PSE if hugepages are not enabled.
*/
- *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
+ *reg = read_cr4() & ~X86_CR4_PGE;
+ if ( !opt_allow_hugepage )
+ *reg &= ~X86_CR4_PSE;
break;
default:
diff --git a/xen/arch/x86/x86_32/mm.c b/xen/arch/x86/x86_32/mm.c
index ea3c18ca88..d19f04f9f0 100644
--- a/xen/arch/x86/x86_32/mm.c
+++ b/xen/arch/x86/x86_32/mm.c
@@ -132,30 +132,6 @@ void __init setup_idle_pagetable(void)
__PAGE_HYPERVISOR));
}
-unsigned long clone_idle_pagetable(struct vcpu *v)
-{
- unsigned int i;
- struct domain *d = v->domain;
- l3_pgentry_t *l3_table = v->arch.pae_l3_cache.table[0];
- l2_pgentry_t *l2_table = alloc_xenheap_page();
-
- if ( !l2_table )
- return 0;
-
- memcpy(l3_table, idle_pg_table, L3_PAGETABLE_ENTRIES * sizeof(*l3_table));
- l3_table[l3_table_offset(PERDOMAIN_VIRT_START)] =
- l3e_from_page(virt_to_page(l2_table), _PAGE_PRESENT);
-
- copy_page(l2_table, idle_pg_table_l2 +
- l3_table_offset(PERDOMAIN_VIRT_START) * L2_PAGETABLE_ENTRIES);
- for ( i = 0; i < PDPT_L2_ENTRIES; ++i )
- l2_table[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
- l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
- __PAGE_HYPERVISOR);
-
- return __pa(l3_table);
-}
-
void __init zap_low_mappings(l2_pgentry_t *dom0_l2)
{
int i;
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 49ee4565e0..bc2302b6a3 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -21,7 +21,6 @@
#include <xen/lib.h>
#include <xen/init.h>
#include <xen/mm.h>
-#include <xen/numa.h>
#include <xen/sched.h>
#include <xen/guest_access.h>
#include <asm/current.h>
@@ -207,24 +206,6 @@ void __init setup_idle_pagetable(void)
__PAGE_HYPERVISOR));
}
-unsigned long clone_idle_pagetable(struct vcpu *v)
-{
- struct domain *d = v->domain;
- struct page_info *page = alloc_domheap_page(NULL,
- MEMF_node(vcpu_to_node(v)));
- l4_pgentry_t *l4_table = page_to_virt(page);
-
- if ( !page )
- return 0;
-
- copy_page(l4_table, idle_pg_table);
- l4_table[l4_table_offset(PERDOMAIN_VIRT_START)] =
- l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
- __PAGE_HYPERVISOR);
-
- return __pa(l4_table);
-}
-
void __init zap_low_mappings(void)
{
BUG_ON(num_online_cpus() != 1);
diff --git a/xen/common/kernel.c b/xen/common/kernel.c
index f3af91f2ed..9137671817 100644
--- a/xen/common/kernel.c
+++ b/xen/common/kernel.c
@@ -222,7 +222,8 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDLE(void) arg)
#ifdef CONFIG_X86
if ( !is_hvm_vcpu(current) )
fi.submap |= (1U << XENFEAT_mmu_pt_update_preserve_ad) |
- (1U << XENFEAT_highmem_assist);
+ (1U << XENFEAT_highmem_assist) |
+ (1U << XENFEAT_gnttab_map_avail_bits);
#endif
break;
default:
diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
index 4229e2355f..fa41b210e0 100644
--- a/xen/drivers/passthrough/io.c
+++ b/xen/drivers/passthrough/io.c
@@ -62,7 +62,7 @@ int pt_irq_create_bind_vtd(
struct dev_intx_gsi_link *digl;
int pirq = pt_irq_bind->machine_irq;
- if ( pirq < 0 || pirq >= NR_PIRQS )
+ if ( pirq < 0 || pirq >= NR_IRQS )
return -EINVAL;
spin_lock(&d->event_lock);
@@ -261,7 +261,7 @@ void hvm_dpci_msi_eoi(struct domain *d, int vector)
spin_lock(&d->event_lock);
pirq = hvm_irq_dpci->msi_gvec_pirq[vector];
- if ( ( pirq >= 0 ) && (pirq < NR_PIRQS) &&
+ if ( ( pirq >= 0 ) && (pirq < NR_IRQS) &&
test_bit(pirq, hvm_irq_dpci->mapping) &&
(test_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[pirq].flags)))
{
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 72c123ada4..f75a6132d7 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -171,9 +171,9 @@ static void pci_clean_dpci_irqs(struct domain *d)
hvm_irq_dpci = domain_get_irq_dpci(d);
if ( hvm_irq_dpci != NULL )
{
- for ( i = find_first_bit(hvm_irq_dpci->mapping, NR_PIRQS);
- i < NR_PIRQS;
- i = find_next_bit(hvm_irq_dpci->mapping, NR_PIRQS, i + 1) )
+ for ( i = find_first_bit(hvm_irq_dpci->mapping, NR_IRQS);
+ i < NR_IRQS;
+ i = find_next_bit(hvm_irq_dpci->mapping, NR_IRQS, i + 1) )
{
pirq_guest_unbind(d, i);
kill_timer(&hvm_irq_dpci->hvm_timer[irq_to_vector(i)]);
diff --git a/xen/drivers/passthrough/vtd/dmar.c b/xen/drivers/passthrough/vtd/dmar.c
index 43107b3ae3..93531c65a2 100644
--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -351,7 +351,9 @@ acpi_parse_one_rmrr(struct acpi_dmar_entry_header *header)
if ( rmrr->base_address >= rmrr->end_address )
{
- dprintk(XENLOG_ERR VTDPREFIX, "RMRR is incorrect.\n");
+ dprintk(XENLOG_ERR VTDPREFIX,
+ "RMRR error: base_addr %"PRIx64" end_address %"PRIx64"\n",
+ rmrr->base_address, rmrr->end_address);
return -EFAULT;
}
diff --git a/xen/drivers/passthrough/vtd/ia64/vtd.c b/xen/drivers/passthrough/vtd/ia64/vtd.c
index 42e94f7ef5..b0abcf6929 100644
--- a/xen/drivers/passthrough/vtd/ia64/vtd.c
+++ b/xen/drivers/passthrough/vtd/ia64/vtd.c
@@ -21,6 +21,7 @@
#include <xen/sched.h>
#include <xen/domain_page.h>
#include <xen/iommu.h>
+#include <xen/numa.h>
#include <asm/xensystem.h>
#include <asm/sal.h>
#include "../iommu.h"
@@ -44,12 +45,12 @@ void unmap_vtd_domain_page(void *va)
}
/* Allocate page table, return its machine address */
-u64 alloc_pgtable_maddr(void)
+u64 alloc_pgtable_maddr(struct domain *d)
{
struct page_info *pg;
u64 *vaddr;
- pg = alloc_domheap_page(NULL, 0);
+ pg = alloc_domheap_page(NULL, d ? MEMF_node(domain_to_node(d)) : 0);
vaddr = map_domain_page(page_to_mfn(pg));
if ( !vaddr )
return 0;
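[This change is the recurring theme of the VT-d hunks that follow: alloc_pgtable_maddr() now takes the owning domain so page tables land on that domain's NUMA node, while domain-less callers (root/context tables, interrupt remapping, queued invalidation) pass NULL and accept any node. The idiom, as a hedged helper with an illustrative name:]

/* Sketch only: the node-aware allocation idiom shared by the ia64 and
 * x86 implementations in this series. */
static struct page_info *vtd_alloc_pgtable_page(struct domain *d)
{
    return alloc_domheap_page(NULL, d ? MEMF_node(domain_to_node(d)) : 0);
}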
diff --git a/xen/drivers/passthrough/vtd/intremap.c b/xen/drivers/passthrough/vtd/intremap.c
index 059ebf5a24..c9a73f50c4 100644
--- a/xen/drivers/passthrough/vtd/intremap.c
+++ b/xen/drivers/passthrough/vtd/intremap.c
@@ -30,6 +30,10 @@
#include "vtd.h"
#include "extern.h"
+#ifndef dest_SMI
+#define dest_SMI -1
+#endif
+
u16 apicid_to_bdf(int apic_id)
{
struct acpi_drhd_unit *drhd = ioapic_to_drhd(apic_id);
@@ -207,7 +211,7 @@ unsigned int io_apic_read_remap_rte(
remap_rte = (struct IO_APIC_route_remap_entry *) &old_rte;
- if ( remap_rte->format == 0 )
+ if ( (remap_rte->format == 0) || (old_rte.delivery_mode == dest_SMI) )
{
*IO_APIC_BASE(apic) = rte_upper ? (reg + 1) : reg;
return *(IO_APIC_BASE(apic)+4);
@@ -253,6 +257,31 @@ void io_apic_write_remap_rte(
remap_rte = (struct IO_APIC_route_remap_entry *) &old_rte;
+ if ( old_rte.delivery_mode == dest_SMI )
+ {
+ /* Some BIOSes do not zero out reserved fields in IOAPIC
+ * RTEs. clear_IO_APIC() zeroes out all RTEs except those with
+ * the MSI delivery type. This is a problem when the host OS
+ * converts an SMI delivery type to some other type but leaves
+ * the reserved field uninitialized. This can cause an
+ * out-of-bounds interrupt remapping table access if the
+ * "format" field is 1 and the "index" field holds a value
+ * larger than the maximum index of the remapping table.
+ */
+ if ( remap_rte->format == 1 )
+ {
+ remap_rte->format = 0;
+ *IO_APIC_BASE(apic) = reg;
+ *(IO_APIC_BASE(apic)+4) = *(((u32 *)&old_rte)+0);
+ *IO_APIC_BASE(apic) = reg + 1;
+ *(IO_APIC_BASE(apic)+4) = *(((u32 *)&old_rte)+1);
+ }
+
+ *IO_APIC_BASE(apic) = rte_upper ? (reg + 1) : reg;
+ *(IO_APIC_BASE(apic)+4) = value;
+ return;
+ }
+
/* mask the interrupt while we change the intremap table */
saved_mask = remap_rte->mask;
remap_rte->mask = 1;
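[The raw accesses in this hunk use the IOAPIC's indirect interface: writing a register number to IO_APIC_BASE(apic) selects it, and the data window sits four 32-bit slots (16 bytes) beyond the base. A hedged helper showing the same remapping-bypass pattern for a full 64-bit RTE; the function name is illustrative:]

static void ioapic_write_rte_raw(unsigned int apic, unsigned int reg, u64 rte)
{
    *IO_APIC_BASE(apic) = reg;                    /* select low dword */
    *(IO_APIC_BASE(apic) + 4) = (u32)rte;
    *IO_APIC_BASE(apic) = reg + 1;                /* select high dword */
    *(IO_APIC_BASE(apic) + 4) = (u32)(rte >> 32);
}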
@@ -473,7 +502,7 @@ int intremap_setup(struct iommu *iommu)
ir_ctrl = iommu_ir_ctrl(iommu);
if ( ir_ctrl->iremap_maddr == 0 )
{
- ir_ctrl->iremap_maddr = alloc_pgtable_maddr();
+ ir_ctrl->iremap_maddr = alloc_pgtable_maddr(NULL);
if ( ir_ctrl->iremap_maddr == 0 )
{
dprintk(XENLOG_WARNING VTDPREFIX,
diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
index 28ce715c6b..2a3310fadf 100644
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -148,7 +148,7 @@ static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
root = &root_entries[bus];
if ( !root_present(*root) )
{
- maddr = alloc_pgtable_maddr();
+ maddr = alloc_pgtable_maddr(NULL);
if ( maddr == 0 )
{
unmap_vtd_domain_page(root_entries);
@@ -205,7 +205,7 @@ static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
addr &= (((u64)1) << addr_width) - 1;
spin_lock_irqsave(&hd->mapping_lock, flags);
if ( hd->pgd_maddr == 0 )
- if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr()) == 0) )
+ if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain)) == 0) )
goto out;
parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr);
@@ -218,7 +218,7 @@ static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
{
if ( !alloc )
break;
- maddr = alloc_pgtable_maddr();
+ maddr = alloc_pgtable_maddr(domain);
if ( !maddr )
break;
dma_set_pte_addr(*pte, maddr);
@@ -605,7 +605,7 @@ static int iommu_set_root_entry(struct iommu *iommu)
spin_lock_irqsave(&iommu->register_lock, flags);
if ( iommu->root_maddr == 0 )
- iommu->root_maddr = alloc_pgtable_maddr();
+ iommu->root_maddr = alloc_pgtable_maddr(NULL);
if ( iommu->root_maddr == 0 )
{
spin_unlock_irqrestore(&iommu->register_lock, flags);
@@ -634,7 +634,7 @@ static int iommu_set_root_entry(struct iommu *iommu)
return 0;
}
-static int iommu_enable_translation(struct iommu *iommu)
+static void iommu_enable_translation(struct iommu *iommu)
{
u32 sts;
unsigned long flags;
@@ -661,7 +661,6 @@ static int iommu_enable_translation(struct iommu *iommu)
/* Disable PMRs when VT-d engine takes effect per spec definition */
disable_pmr(iommu);
spin_unlock_irqrestore(&iommu->register_lock, flags);
- return 0;
}
int iommu_disable_translation(struct iommu *iommu)
@@ -1046,8 +1045,7 @@ static int intel_iommu_domain_init(struct domain *d)
for_each_drhd_unit ( drhd )
{
iommu = drhd->iommu;
- if ( iommu_enable_translation(iommu) )
- return -EIO;
+ iommu_enable_translation(iommu);
}
}
@@ -1799,14 +1797,14 @@ static int intel_iommu_group_id(u8 bus, u8 devfn)
}
static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
-int iommu_suspend(void)
+void iommu_suspend(void)
{
struct acpi_drhd_unit *drhd;
struct iommu *iommu;
u32 i;
if ( !vtd_enabled )
- return 0;
+ return;
iommu_flush_all();
@@ -1824,18 +1822,16 @@ int iommu_suspend(void)
iommu_state[i][DMAR_FEUADDR_REG] =
(u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
}
-
- return 0;
}
-int iommu_resume(void)
+void iommu_resume(void)
{
struct acpi_drhd_unit *drhd;
struct iommu *iommu;
u32 i;
if ( !vtd_enabled )
- return 0;
+ return;
iommu_flush_all();
@@ -1855,12 +1851,8 @@ int iommu_resume(void)
(u32) iommu_state[i][DMAR_FEADDR_REG]);
dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
(u32) iommu_state[i][DMAR_FEUADDR_REG]);
-
- if ( iommu_enable_translation(iommu) )
- return -EIO;
+ iommu_enable_translation(iommu);
}
-
- return 0;
}
struct iommu_ops intel_iommu_ops = {
diff --git a/xen/drivers/passthrough/vtd/qinval.c b/xen/drivers/passthrough/vtd/qinval.c
index d90242d708..048089350d 100644
--- a/xen/drivers/passthrough/vtd/qinval.c
+++ b/xen/drivers/passthrough/vtd/qinval.c
@@ -426,7 +426,7 @@ int qinval_setup(struct iommu *iommu)
if ( qi_ctrl->qinval_maddr == 0 )
{
- qi_ctrl->qinval_maddr = alloc_pgtable_maddr();
+ qi_ctrl->qinval_maddr = alloc_pgtable_maddr(NULL);
if ( qi_ctrl->qinval_maddr == 0 )
{
dprintk(XENLOG_WARNING VTDPREFIX,
diff --git a/xen/drivers/passthrough/vtd/vtd.h b/xen/drivers/passthrough/vtd/vtd.h
index 84cd2e5f8a..ec02d129d8 100644
--- a/xen/drivers/passthrough/vtd/vtd.h
+++ b/xen/drivers/passthrough/vtd/vtd.h
@@ -101,7 +101,7 @@ unsigned int get_cache_line_size(void);
void cacheline_flush(char *);
void flush_all_cache(void);
void *map_to_nocache_virt(int nr_iommus, u64 maddr);
-u64 alloc_pgtable_maddr(void);
+u64 alloc_pgtable_maddr(struct domain *d);
void free_pgtable_maddr(u64 maddr);
void *map_vtd_domain_page(u64 maddr);
void unmap_vtd_domain_page(void *va);
diff --git a/xen/drivers/passthrough/vtd/x86/vtd.c b/xen/drivers/passthrough/vtd/x86/vtd.c
index 83d7704256..31dc561881 100644
--- a/xen/drivers/passthrough/vtd/x86/vtd.c
+++ b/xen/drivers/passthrough/vtd/x86/vtd.c
@@ -22,6 +22,7 @@
#include <xen/domain_page.h>
#include <asm/paging.h>
#include <xen/iommu.h>
+#include <xen/numa.h>
#include "../iommu.h"
#include "../dmar.h"
#include "../vtd.h"
@@ -37,13 +38,13 @@ void unmap_vtd_domain_page(void *va)
}
/* Allocate page table, return its machine address */
-u64 alloc_pgtable_maddr(void)
+u64 alloc_pgtable_maddr(struct domain *d)
{
struct page_info *pg;
u64 *vaddr;
unsigned long mfn;
- pg = alloc_domheap_page(NULL, 0);
+ pg = alloc_domheap_page(NULL, d ? MEMF_node(domain_to_node(d)) : 0);
if ( !pg )
return 0;
mfn = page_to_mfn(pg);
@@ -121,9 +122,9 @@ void hvm_dpci_isairq_eoi(struct domain *d, unsigned int isairq)
return;
}
/* Multiple mirqs may be mapped to one ISA IRQ */
- for ( i = find_first_bit(dpci->mapping, NR_PIRQS);
- i < NR_PIRQS;
- i = find_next_bit(dpci->mapping, NR_PIRQS, i + 1) )
+ for ( i = find_first_bit(dpci->mapping, NR_IRQS);
+ i < NR_IRQS;
+ i = find_next_bit(dpci->mapping, NR_IRQS, i + 1) )
{
list_for_each_entry_safe ( digl, tmp,
&dpci->mirq[i].digl_list, list )
diff --git a/xen/include/asm-ia64/hvm/irq.h b/xen/include/asm-ia64/hvm/irq.h
index d163e56e36..32d0164101 100644
--- a/xen/include/asm-ia64/hvm/irq.h
+++ b/xen/include/asm-ia64/hvm/irq.h
@@ -24,9 +24,7 @@
#include <xen/irq.h>
-#define NR_VECTORS 256
#define VIOAPIC_NUM_PINS 48
-#define NR_PIRQS 256
#include <xen/hvm/irq.h>
diff --git a/xen/include/asm-ia64/linux/asm/irq.h b/xen/include/asm-ia64/linux/asm/irq.h
index 687f49fefa..c0fd725c0d 100644
--- a/xen/include/asm-ia64/linux/asm/irq.h
+++ b/xen/include/asm-ia64/linux/asm/irq.h
@@ -11,6 +11,7 @@
* 02/29/00 D.Mosberger moved most things into hw_irq.h
*/
+#define NR_VECTORS 256
#define NR_IRQS 256
#define NR_IRQ_VECTORS NR_IRQS
diff --git a/xen/include/asm-x86/acpi.h b/xen/include/asm-x86/acpi.h
index 5ae806faa9..459978e491 100644
--- a/xen/include/asm-x86/acpi.h
+++ b/xen/include/asm-x86/acpi.h
@@ -166,4 +166,7 @@ extern u8 x86_acpiid_to_apicid[];
extern int acpi_dmar_init(void);
+/* Incremented whenever we transition through S3. Value is 1 during boot. */
+extern uint32_t system_reset_counter;
+
#endif /*__X86_ASM_ACPI_H*/
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index 815eb4a95d..aa9b234370 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -51,6 +51,12 @@
#define NR_CPUS 32
#endif
+#ifdef MAX_PHYS_IRQS
+#define NR_IRQS MAX_PHYS_IRQS
+#else
+#define NR_IRQS 256
+#endif
+
#if defined(__i386__) && (NR_CPUS > 32)
#error "Maximum of 32 physical processors supported by Xen on x86_32"
#endif
diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
index 18f6aff015..481236c833 100644
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -94,6 +94,7 @@
#define X86_FEATURE_SSE4_2 (4*32+20) /* Streaming SIMD Extensions 4.2 */
#define X86_FEATURE_X2APIC (4*32+21) /* Extended xAPIC */
#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */
+#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running under some hypervisor */
/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
#define X86_FEATURE_XSTORE (5*32+ 2) /* on-CPU RNG present (xstore insn) */
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 3acab04f27..1589615363 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -6,7 +6,6 @@
#include <asm/hvm/vcpu.h>
#include <asm/hvm/domain.h>
#include <asm/e820.h>
-#include <asm/pirq.h>
#define has_32bit_shinfo(d) ((d)->arch.has_32bit_shinfo)
#define is_pv_32bit_domain(d) ((d)->arch.is_32bit_pv)
@@ -237,7 +236,7 @@ struct arch_domain
/* NB. protected by d->event_lock and by irq_desc[vector].lock */
int vector_pirq[NR_VECTORS];
- int pirq_vector[NR_PIRQS];
+ s16 pirq_vector[NR_IRQS];
/* Pseudophysical e820 map (XENMEM_memory_map). */
struct e820entry e820[3];
diff --git a/xen/include/asm-x86/guest_pt.h b/xen/include/asm-x86/guest_pt.h
new file mode 100644
index 0000000000..d44e622ddc
--- /dev/null
+++ b/xen/include/asm-x86/guest_pt.h
@@ -0,0 +1,291 @@
+/******************************************************************************
+ * xen/asm-x86/guest_pt.h
+ *
+ * Types and accessors for guest pagetable entries, as distinct from
+ * Xen's pagetable types.
+ *
+ * Users must #define GUEST_PAGING_LEVELS to 2, 3 or 4 before including
+ * this file.
+ *
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _XEN_ASM_GUEST_PT_H
+#define _XEN_ASM_GUEST_PT_H
+
+/* Type of the guest's frame numbers */
+TYPE_SAFE(unsigned long,gfn)
+#define PRI_gfn "05lx"
+
+#define VALID_GFN(m) ((m) != INVALID_GFN)
+
+static inline int
+valid_gfn(gfn_t m)
+{
+ return VALID_GFN(gfn_x(m));
+}
+
+static inline paddr_t
+gfn_to_paddr(gfn_t gfn)
+{
+ return ((paddr_t)gfn_x(gfn)) << PAGE_SHIFT;
+}
+
+/* Override gfn_to_mfn to work with gfn_t */
+#undef gfn_to_mfn
+#define gfn_to_mfn(d, g, t) _gfn_to_mfn((d), gfn_x(g), (t))
+
+
+/* Types of the guest's page tables and access functions for them */
+
+#if GUEST_PAGING_LEVELS == 2
+
+#define GUEST_L1_PAGETABLE_ENTRIES 1024
+#define GUEST_L2_PAGETABLE_ENTRIES 1024
+#define GUEST_L1_PAGETABLE_SHIFT 12
+#define GUEST_L2_PAGETABLE_SHIFT 22
+
+typedef uint32_t guest_intpte_t;
+typedef struct { guest_intpte_t l1; } guest_l1e_t;
+typedef struct { guest_intpte_t l2; } guest_l2e_t;
+
+#define PRI_gpte "08x"
+
+static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e)
+{ return ((paddr_t) gl1e.l1) & (PADDR_MASK & PAGE_MASK); }
+static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e)
+{ return ((paddr_t) gl2e.l2) & (PADDR_MASK & PAGE_MASK); }
+
+static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e)
+{ return _gfn(guest_l1e_get_paddr(gl1e) >> PAGE_SHIFT); }
+static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e)
+{ return _gfn(guest_l2e_get_paddr(gl2e) >> PAGE_SHIFT); }
+
+static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e)
+{ return gl1e.l1 & 0xfff; }
+static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e)
+{ return gl2e.l2 & 0xfff; }
+
+static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
+{ return (guest_l1e_t) { (gfn_x(gfn) << PAGE_SHIFT) | flags }; }
+static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
+{ return (guest_l2e_t) { (gfn_x(gfn) << PAGE_SHIFT) | flags }; }
+
+#define guest_l1_table_offset(_va) \
+ (((_va) >> GUEST_L1_PAGETABLE_SHIFT) & (GUEST_L1_PAGETABLE_ENTRIES - 1))
+#define guest_l2_table_offset(_va) \
+ (((_va) >> GUEST_L2_PAGETABLE_SHIFT) & (GUEST_L2_PAGETABLE_ENTRIES - 1))
+
+#else /* GUEST_PAGING_LEVELS != 2 */
+
+#if GUEST_PAGING_LEVELS == 3
+#define GUEST_L1_PAGETABLE_ENTRIES 512
+#define GUEST_L2_PAGETABLE_ENTRIES 512
+#define GUEST_L3_PAGETABLE_ENTRIES 4
+#define GUEST_L1_PAGETABLE_SHIFT 12
+#define GUEST_L2_PAGETABLE_SHIFT 21
+#define GUEST_L3_PAGETABLE_SHIFT 30
+#else /* GUEST_PAGING_LEVELS == 4 */
+#define GUEST_L1_PAGETABLE_ENTRIES 512
+#define GUEST_L2_PAGETABLE_ENTRIES 512
+#define GUEST_L3_PAGETABLE_ENTRIES 512
+#define GUEST_L4_PAGETABLE_ENTRIES 512
+#define GUEST_L1_PAGETABLE_SHIFT 12
+#define GUEST_L2_PAGETABLE_SHIFT 21
+#define GUEST_L3_PAGETABLE_SHIFT 30
+#define GUEST_L4_PAGETABLE_SHIFT 39
+#endif
+
+typedef l1_pgentry_t guest_l1e_t;
+typedef l2_pgentry_t guest_l2e_t;
+typedef l3_pgentry_t guest_l3e_t;
+#if GUEST_PAGING_LEVELS >= 4
+typedef l4_pgentry_t guest_l4e_t;
+#endif
+typedef intpte_t guest_intpte_t;
+
+#define PRI_gpte "016"PRIx64
+
+static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e)
+{ return l1e_get_paddr(gl1e); }
+static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e)
+{ return l2e_get_paddr(gl2e); }
+static inline paddr_t guest_l3e_get_paddr(guest_l3e_t gl3e)
+{ return l3e_get_paddr(gl3e); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline paddr_t guest_l4e_get_paddr(guest_l4e_t gl4e)
+{ return l4e_get_paddr(gl4e); }
+#endif
+
+static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e)
+{ return _gfn(l1e_get_paddr(gl1e) >> PAGE_SHIFT); }
+static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e)
+{ return _gfn(l2e_get_paddr(gl2e) >> PAGE_SHIFT); }
+static inline gfn_t guest_l3e_get_gfn(guest_l3e_t gl3e)
+{ return _gfn(l3e_get_paddr(gl3e) >> PAGE_SHIFT); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline gfn_t guest_l4e_get_gfn(guest_l4e_t gl4e)
+{ return _gfn(l4e_get_paddr(gl4e) >> PAGE_SHIFT); }
+#endif
+
+static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e)
+{ return l1e_get_flags(gl1e); }
+static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e)
+{ return l2e_get_flags(gl2e); }
+static inline u32 guest_l3e_get_flags(guest_l3e_t gl3e)
+{ return l3e_get_flags(gl3e); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline u32 guest_l4e_get_flags(guest_l4e_t gl4e)
+{ return l4e_get_flags(gl4e); }
+#endif
+
+static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
+{ return l1e_from_pfn(gfn_x(gfn), flags); }
+static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
+{ return l2e_from_pfn(gfn_x(gfn), flags); }
+static inline guest_l3e_t guest_l3e_from_gfn(gfn_t gfn, u32 flags)
+{ return l3e_from_pfn(gfn_x(gfn), flags); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags)
+{ return l4e_from_pfn(gfn_x(gfn), flags); }
+#endif
+
+#define guest_l1_table_offset(a) l1_table_offset(a)
+#define guest_l2_table_offset(a) l2_table_offset(a)
+#define guest_l3_table_offset(a) l3_table_offset(a)
+#define guest_l4_table_offset(a) l4_table_offset(a)
+
+#endif /* GUEST_PAGING_LEVELS != 2 */
+
+
+/* Which pagetable features are supported on this vcpu? */
+
+static inline int
+guest_supports_superpages(struct vcpu *v)
+{
+ /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
+ * CR4.PSE is set or the guest is in PAE or long mode.
+ * It's also used in the dummy PT for vcpus with CR0.PG cleared. */
+ return (is_hvm_vcpu(v) &&
+ (GUEST_PAGING_LEVELS != 2
+ || !hvm_paging_enabled(v)
+ || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
+}
+
+static inline int
+guest_supports_nx(struct vcpu *v)
+{
+ if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
+ return 0;
+ if ( !is_hvm_vcpu(v) )
+ return cpu_has_nx;
+ return hvm_nx_enabled(v);
+}
+
+
+
+/* Type used for recording a walk through guest pagetables. It is
+ * filled in by the pagetable walk function, and also used as a cache
+ * for later walks. When we encounter a superpage l2e, we fabricate an
+ * l1e for propagation to the shadow (for splintering guest superpages
+ * into many shadow l1 entries). */
+typedef struct guest_pagetable_walk walk_t;
+struct guest_pagetable_walk
+{
+ unsigned long va; /* Address we were looking for */
+#if GUEST_PAGING_LEVELS >= 3
+#if GUEST_PAGING_LEVELS >= 4
+ guest_l4e_t l4e; /* Guest's level 4 entry */
+#endif
+ guest_l3e_t l3e; /* Guest's level 3 entry */
+#endif
+ guest_l2e_t l2e; /* Guest's level 2 entry */
+ guest_l1e_t l1e; /* Guest's level 1 entry (or fabrication) */
+#if GUEST_PAGING_LEVELS >= 4
+ mfn_t l4mfn; /* MFN that the level 4 entry was in */
+ mfn_t l3mfn; /* MFN that the level 3 entry was in */
+#endif
+ mfn_t l2mfn; /* MFN that the level 2 entry was in */
+ mfn_t l1mfn; /* MFN that the level 1 entry was in */
+};
+
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding frame number. */
+static inline gfn_t
+guest_walk_to_gfn(walk_t *gw)
+{
+ if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
+ return _gfn(INVALID_GFN);
+ return guest_l1e_get_gfn(gw->l1e);
+}
+
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding physical address. */
+static inline paddr_t
+guest_walk_to_gpa(walk_t *gw)
+{
+ if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
+ return 0;
+ return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
+}
+
+/* Walk the guest pagetables, after the manner of a hardware walker.
+ *
+ * Inputs: a vcpu, a virtual address, a walk_t to fill, a
+ * pointer to a pagefault code, the MFN of the guest's
+ * top-level pagetable, and a mapping of the
+ * guest's top-level pagetable.
+ *
+ * We walk the vcpu's guest pagetables, filling the walk_t with what we
+ * see and adding any Accessed and Dirty bits that are needed in the
+ * guest entries. Using the pagefault code, we check the permissions as
+ * we go. For the purposes of reading pagetables we treat all non-RAM
+ * memory as containing zeroes.
+ *
+ * Returns 0 for success, or the set of permission bits that we failed on
+ * if the walk did not complete. */
+
+/* Macro-fu so you can call guest_walk_tables() and get the right one. */
+#define GPT_RENAME2(_n, _l) _n ## _ ## _l ## _levels
+#define GPT_RENAME(_n, _l) GPT_RENAME2(_n, _l)
+#define guest_walk_tables GPT_RENAME(guest_walk_tables, GUEST_PAGING_LEVELS)
+
+extern uint32_t
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw,
+ uint32_t pfec, mfn_t top_mfn, void *top_map);
+
+/* Pretty-print the contents of a guest-walk */
+static inline void print_gw(walk_t *gw)
+{
+ gdprintk(XENLOG_INFO, "GUEST WALK TO %#lx:\n", gw->va);
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+ gdprintk(XENLOG_INFO, " l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
+ gdprintk(XENLOG_INFO, " l4e=%" PRI_gpte "\n", gw->l4e.l4);
+ gdprintk(XENLOG_INFO, " l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
+#endif /* 64-bit only... */
+ gdprintk(XENLOG_INFO, " l3e=%" PRI_gpte "\n", gw->l3e.l3);
+#endif /* PAE or 64... */
+ gdprintk(XENLOG_INFO, " l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
+ gdprintk(XENLOG_INFO, " l2e=%" PRI_gpte "\n", gw->l2e.l2);
+ gdprintk(XENLOG_INFO, " l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
+ gdprintk(XENLOG_INFO, " l1e=%" PRI_gpte "\n", gw->l1e.l1);
+}
+
+#endif /* _XEN_ASM_GUEST_PT_H */
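[Usage sketch for the new header: each translation unit defines GUEST_PAGING_LEVELS before inclusion, and the GPT_RENAME macro-fu binds guest_walk_tables to the matching per-level implementation. The caller below is hypothetical, assuming PFEC_page_present from asm/processor.h:]

#define GUEST_PAGING_LEVELS 3
#include <asm/guest_pt.h>

/* Hypothetical: translate a guest VA to a gfn, or INVALID_GFN on fault. */
static gfn_t example_va_to_gfn(struct vcpu *v, unsigned long va,
                               mfn_t top_mfn, void *top_map)
{
    walk_t gw;

    /* A non-zero return is the set of PFEC bits the walk failed on. */
    if ( guest_walk_tables(v, va, &gw, PFEC_page_present, top_mfn, top_map) )
        return _gfn(INVALID_GFN);
    return guest_walk_to_gfn(&gw);
}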
diff --git a/xen/include/asm-x86/hpet.h b/xen/include/asm-x86/hpet.h
index b63f56805d..82c08bc3c5 100644
--- a/xen/include/asm-x86/hpet.h
+++ b/xen/include/asm-x86/hpet.h
@@ -24,6 +24,10 @@
#define HPET_T2_CMP 0x148
#define HPET_T2_ROUTE 0x150
+#define HPET_Tn_CFG(n) (HPET_T0_CFG + (n) * 0x20)
+#define HPET_Tn_CMP(n) (HPET_T0_CMP + (n) * 0x20)
+#define HPET_Tn_ROUTE(n) (HPET_T0_ROUTE + (n) * 0x20)
+
#define HPET_ID_VENDOR 0xffff0000
#define HPET_ID_LEGSUP 0x00008000
#define HPET_ID_NUMBER 0x00001f00
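[The indexed forms rely on each HPET timer's register block sitting 0x20 bytes after the previous one, so they must reduce to the fixed offsets above (assuming the standard 0x108/0x110 values for HPET_T0_CMP/HPET_T0_ROUTE earlier in this header). A build-time sanity sketch, to be placed inside some init function, e.g. in hpet.c:]

BUILD_BUG_ON(HPET_Tn_CMP(2)   != HPET_T2_CMP);    /* 0x108 + 2*0x20 == 0x148 */
BUILD_BUG_ON(HPET_Tn_ROUTE(2) != HPET_T2_ROUTE);  /* 0x110 + 2*0x20 == 0x150 */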
diff --git a/xen/include/asm-x86/hvm/irq.h b/xen/include/asm-x86/hvm/irq.h
index f2220526fd..1f23124279 100644
--- a/xen/include/asm-x86/hvm/irq.h
+++ b/xen/include/asm-x86/hvm/irq.h
@@ -22,7 +22,6 @@
#ifndef __ASM_X86_HVM_IRQ_H__
#define __ASM_X86_HVM_IRQ_H__
-#include <asm/pirq.h>
#include <xen/hvm/irq.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/vpic.h>
diff --git a/xen/include/asm-x86/hvm/vlapic.h b/xen/include/asm-x86/hvm/vlapic.h
index 3f34e47950..8c36ed5a00 100644
--- a/xen/include/asm-x86/hvm/vlapic.h
+++ b/xen/include/asm-x86/hvm/vlapic.h
@@ -93,8 +93,7 @@ void vlapic_msr_set(struct vlapic *vlapic, uint64_t value);
int vlapic_accept_pic_intr(struct vcpu *v);
-struct vlapic *apic_round_robin(
- struct domain *d, uint8_t vector, uint32_t bitmap);
+struct vlapic *apic_lowest_prio(struct domain *d, uint32_t bitmap);
int vlapic_match_logical_addr(struct vlapic *vlapic, uint8_t mda);
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
index 0430a46c1b..563e8aea36 100644
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -351,9 +351,9 @@ static inline int __vmxon(u64 addr)
return rc;
}
-void vmx_inject_hw_exception(struct vcpu *v, int trap, int error_code);
-void vmx_inject_extint(struct vcpu *v, int trap);
-void vmx_inject_nmi(struct vcpu *v);
+void vmx_inject_hw_exception(int trap, int error_code);
+void vmx_inject_extint(int trap);
+void vmx_inject_nmi(void);
void ept_p2m_init(struct domain *d);
diff --git a/xen/include/asm-x86/irq.h b/xen/include/asm-x86/irq.h
index 982f99f3c4..920ac7f85e 100644
--- a/xen/include/asm-x86/irq.h
+++ b/xen/include/asm-x86/irq.h
@@ -18,7 +18,7 @@
#define vector_to_irq(vec) (vector_irq[vec])
extern int vector_irq[NR_VECTORS];
-extern u8 irq_vector[NR_IRQ_VECTORS];
+extern u8 irq_vector[NR_IRQS];
#define AUTO_ASSIGN -1
#define NEVER_ASSIGN -2
#define FREE_TO_ASSIGN -3
diff --git a/xen/include/asm-x86/mach-default/irq_vectors.h b/xen/include/asm-x86/mach-default/irq_vectors.h
index 90b4e1ef0e..057b2a35b8 100644
--- a/xen/include/asm-x86/mach-default/irq_vectors.h
+++ b/xen/include/asm-x86/mach-default/irq_vectors.h
@@ -30,8 +30,4 @@
#define NR_VECTORS 256
-/* Limited by number of trap vectors. */
-#define NR_IRQS NR_VECTORS
-#define NR_IRQ_VECTORS NR_IRQS
-
#endif /* _ASM_IRQ_VECTORS_H */
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 005b6603e2..d017c4cb56 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -263,6 +263,7 @@ pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab);
int check_descriptor(const struct domain *, struct desc_struct *d);
+extern int opt_allow_hugepage;
/******************************************************************************
* With shadow pagetables, the different kinds of address start
diff --git a/xen/include/asm-x86/msi.h b/xen/include/asm-x86/msi.h
index c72f9d69c5..6ca1a76898 100644
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -69,9 +69,9 @@ struct msi_msg {
};
/* Helper functions */
-extern void mask_msi_irq(unsigned int irq);
-extern void unmask_msi_irq(unsigned int irq);
-extern void set_msi_irq_affinity(unsigned int irq, cpumask_t mask);
+extern void mask_msi_vector(unsigned int vector);
+extern void unmask_msi_vector(unsigned int vector);
+extern void set_msi_affinity(unsigned int vector, cpumask_t mask);
extern int pci_enable_msi(struct msi_info *msi);
extern void pci_disable_msi(int vector);
extern void pci_cleanup_msi(struct pci_dev *pdev);
@@ -97,6 +97,8 @@ struct msi_desc {
int remap_index; /* index in interrupt remapping table */
};
+int msi_maskable_irq(const struct msi_desc *);
+
/*
* Assume the maximum number of hot plug slots supported by the system is about
* ten. The worst case is that each of these slots is hot-added with a device,
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index e17a9469e2..9ccfdb8502 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -215,7 +215,10 @@ void clear_page_sse2(void *);
#define clear_page(_p) (cpu_has_xmm2 ? \
clear_page_sse2((void *)(_p)) : \
(void)memset((void *)(_p), 0, PAGE_SIZE))
-#define copy_page(_t,_f) memcpy((void *)(_t), (void *)(_f), PAGE_SIZE)
+void copy_page_sse2(void *, const void *);
+#define copy_page(_t,_f) (cpu_has_xmm2 ? \
+ copy_page_sse2(_t, _f) : \
+ (void)memcpy(_t, _f, PAGE_SIZE))
#define mfn_valid(mfn) ((mfn) < max_page)
@@ -278,7 +281,6 @@ extern unsigned int m2p_compat_vstart;
#endif
void paging_init(void);
void setup_idle_pagetable(void);
-unsigned long clone_idle_pagetable(struct vcpu *);
#endif /* !defined(__ASSEMBLY__) */
#define _PAGE_PRESENT 0x001U
diff --git a/xen/include/asm-x86/perfc_defn.h b/xen/include/asm-x86/perfc_defn.h
index 784aa9eb5a..47739fa19b 100644
--- a/xen/include/asm-x86/perfc_defn.h
+++ b/xen/include/asm-x86/perfc_defn.h
@@ -33,6 +33,7 @@ PERFCOUNTER(ptwr_emulations, "writable pt emulations")
PERFCOUNTER(exception_fixed, "pre-exception fixed")
+PERFCOUNTER(guest_walk, "guest pagetable walks")
/* Shadow counters */
PERFCOUNTER(shadow_alloc, "calls to shadow_alloc")
@@ -92,7 +93,6 @@ PERFCOUNTER(shadow_unshadow, "shadow unshadows a page")
PERFCOUNTER(shadow_up_pointer, "shadow unshadow by up-pointer")
PERFCOUNTER(shadow_unshadow_bf, "shadow unshadow brute-force")
PERFCOUNTER(shadow_get_page_fail, "shadow_get_page_from_l1e failed")
-PERFCOUNTER(shadow_guest_walk, "shadow walks guest tables")
PERFCOUNTER(shadow_check_gwalk, "shadow checks gwalk")
PERFCOUNTER(shadow_inconsistent_gwalk, "shadow check inconsistent gwalk")
PERFCOUNTER(shadow_rm_write_flush_tlb,
diff --git a/xen/include/asm-x86/pirq.h b/xen/include/asm-x86/pirq.h
deleted file mode 100644
index 2041262134..0000000000
--- a/xen/include/asm-x86/pirq.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef __XEN_PIRQ_H
-#define __XEN_PIRQ_H
-
-#define PIRQ_BASE 0
-#define NR_PIRQS 256
-
-#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
-#define NR_DYNIRQS 256
-
-#endif /* __XEN_PIRQ_H */
-
diff --git a/xen/include/asm-x86/x86_32/page.h b/xen/include/asm-x86/x86_32/page.h
index 16659a1ae3..aef51f51af 100644
--- a/xen/include/asm-x86/x86_32/page.h
+++ b/xen/include/asm-x86/x86_32/page.h
@@ -112,7 +112,7 @@ extern unsigned int PAGE_HYPERVISOR_NOCACHE;
#define BASE_DISALLOW_MASK (0xFFFFF198U & ~_PAGE_NX)
#define L1_DISALLOW_MASK (BASE_DISALLOW_MASK | _PAGE_GNTTAB)
-#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK)
+#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK & ~_PAGE_PSE)
#define L3_DISALLOW_MASK 0xFFFFF1FEU /* must-be-zero */
#endif /* __X86_32_PAGE_H__ */
diff --git a/xen/include/asm-x86/x86_64/page.h b/xen/include/asm-x86/x86_64/page.h
index 948cd656f0..ac44a9a1c1 100644
--- a/xen/include/asm-x86/x86_64/page.h
+++ b/xen/include/asm-x86/x86_64/page.h
@@ -115,7 +115,7 @@ typedef l4_pgentry_t root_pgentry_t;
#define BASE_DISALLOW_MASK (0xFF800198U & ~_PAGE_NX)
#define L1_DISALLOW_MASK (BASE_DISALLOW_MASK | _PAGE_GNTTAB)
-#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK)
+#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK & ~_PAGE_PSE)
#define L3_DISALLOW_MASK (BASE_DISALLOW_MASK)
#define L4_DISALLOW_MASK (BASE_DISALLOW_MASK)
diff --git a/xen/include/public/features.h b/xen/include/public/features.h
index 16e5ee4d49..879131cda1 100644
--- a/xen/include/public/features.h
+++ b/xen/include/public/features.h
@@ -62,6 +62,12 @@
/* x86: Does this Xen host support the MMU_{CLEAR,COPY}_PAGE hypercall? */
#define XENFEAT_highmem_assist 6
+/*
+ * If set, GNTTABOP_map_grant_ref honours flags to be placed into the
+ * guest-kernel-available PTE bits of the resulting mapping.
+ */
+#define XENFEAT_gnttab_map_avail_bits 7
+
#define XENFEAT_NR_SUBMAPS 1
#endif /* __XEN_PUBLIC_FEATURES_H__ */
diff --git a/xen/include/public/grant_table.h b/xen/include/public/grant_table.h
index 26f2c35b18..ad116e71e1 100644
--- a/xen/include/public/grant_table.h
+++ b/xen/include/public/grant_table.h
@@ -360,7 +360,7 @@ DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_and_replace_t);
/*
- * Bitfield values for update_pin_status.flags.
+ * Bitfield values for gnttab_map_grant_ref.flags.
*/
/* Map the grant entry for access by I/O devices. */
#define _GNTMAP_device_map (0)
@@ -388,6 +388,13 @@ DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_and_replace_t);
#define GNTMAP_contains_pte (1<<_GNTMAP_contains_pte)
/*
+ * Bits to be placed in guest kernel available PTE bits (architecture
+ * dependent; only supported when XENFEAT_gnttab_map_avail_bits is set).
+ */
+#define _GNTMAP_guest_avail0 (16)
+#define GNTMAP_guest_avail_mask ((uint32_t)~0 << _GNTMAP_guest_avail0)
+
+/*
* Values for error status returns. All errors are -ve.
*/
#define GNTST_okay (0) /* Normal return. */
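[A hedged guest-side sketch of the new interface: a kernel tags a grant mapping with one of its available PTE bits, which the hypervisor honours only when XENFEAT_gnttab_map_avail_bits is advertised (see the xen/common/kernel.c hunk above). The helper name and tag choice are illustrative:]

static int16_t map_grant_tagged(unsigned long host_va, grant_ref_t gref,
                                domid_t remote)
{
    struct gnttab_map_grant_ref op = {
        .host_addr = host_va,
        /* Stash a private marker in the lowest guest-available bit. */
        .flags     = GNTMAP_host_map | (1U << _GNTMAP_guest_avail0),
        .ref       = gref,
        .dom       = remote,
    };

    if ( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) )
        return GNTST_general_error;
    return op.status;   /* GNTST_okay on success */
}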
diff --git a/xen/include/public/io/pciif.h b/xen/include/public/io/pciif.h
index 0a0ffcc6e2..7e75392599 100644
--- a/xen/include/public/io/pciif.h
+++ b/xen/include/public/io/pciif.h
@@ -30,14 +30,22 @@
/* xen_pci_sharedinfo flags */
#define _XEN_PCIF_active (0)
#define XEN_PCIF_active (1<<_XEN_PCI_active)
+#define _XEN_PCIB_AERHANDLER (1)
+#define XEN_PCIB_AERHANDLER (1<<_XEN_PCIB_AERHANDLER)
+#define _XEN_PCIB_active (2)
+#define XEN_PCIB_active (1<<_XEN_PCIB_active)
/* xen_pci_op commands */
-#define XEN_PCI_OP_conf_read (0)
-#define XEN_PCI_OP_conf_write (1)
-#define XEN_PCI_OP_enable_msi (2)
-#define XEN_PCI_OP_disable_msi (3)
-#define XEN_PCI_OP_enable_msix (4)
-#define XEN_PCI_OP_disable_msix (5)
+#define XEN_PCI_OP_conf_read (0)
+#define XEN_PCI_OP_conf_write (1)
+#define XEN_PCI_OP_enable_msi (2)
+#define XEN_PCI_OP_disable_msi (3)
+#define XEN_PCI_OP_enable_msix (4)
+#define XEN_PCI_OP_disable_msix (5)
+#define XEN_PCI_OP_aer_detected (6)
+#define XEN_PCI_OP_aer_resume (7)
+#define XEN_PCI_OP_aer_mmio (8)
+#define XEN_PCI_OP_aer_slotreset (9)
/* xen_pci_op error numbers */
#define XEN_PCI_ERR_success (0)
@@ -82,10 +90,25 @@ struct xen_pci_op {
struct xen_msix_entry msix_entries[SH_INFO_MAX_VEC];
};
+/* Used for PCIe AER handling. */
+struct xen_pcie_aer_op
+{
+ /* IN: what action to perform: XEN_PCI_OP_* */
+ uint32_t cmd;
+ /* IN/OUT: returns aer_op result, or carries error_detected state as input */
+ int32_t err;
+
+ /* IN: which device to touch */
+ uint32_t domain; /* PCI domain/segment */
+ uint32_t bus;
+ uint32_t devfn;
+};
struct xen_pci_sharedinfo {
/* flags - XEN_PCIF_* */
uint32_t flags;
struct xen_pci_op op;
+ struct xen_pcie_aer_op aer_op;
};
#endif /* __XEN_PCI_COMMON_H__ */
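[A hedged backend-side sketch of the new AER plumbing: pciback fills aer_op and raises the XEN_PCIB_active flag for pcifront to service. The helper, the barrier placement, and the notification step are assumptions, not part of this header:]

static void pciback_post_aer_detected(struct xen_pci_sharedinfo *sh,
                                      uint32_t dom, uint32_t bus,
                                      uint32_t devfn, int32_t err_state)
{
    sh->aer_op.cmd    = XEN_PCI_OP_aer_detected;
    sh->aer_op.err    = err_state;   /* carries error_detected state in */
    sh->aer_op.domain = dom;
    sh->aer_op.bus    = bus;
    sh->aer_op.devfn  = devfn;
    wmb();                           /* publish the op before the flag */
    sh->flags |= XEN_PCIB_active;    /* real code sets this atomically and
                                      * then notifies the frontend */
}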
diff --git a/xen/include/public/kexec.h b/xen/include/public/kexec.h
index fc19f2fe50..04252226a1 100644
--- a/xen/include/public/kexec.h
+++ b/xen/include/public/kexec.h
@@ -155,27 +155,6 @@ typedef struct xen_kexec_range {
unsigned long start;
} xen_kexec_range_t;
-/* vmcoreinfo stuff */
-#define VMCOREINFO_BYTES (4096)
-#define VMCOREINFO_NOTE_NAME "VMCOREINFO_XEN"
-void arch_crash_save_vmcoreinfo(void);
-void vmcoreinfo_append_str(const char *fmt, ...)
- __attribute__ ((format (printf, 1, 2)));
-#define VMCOREINFO_PAGESIZE(value) \
- vmcoreinfo_append_str("PAGESIZE=%ld\n", value)
-#define VMCOREINFO_SYMBOL(name) \
- vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name)
-#define VMCOREINFO_SYMBOL_ALIAS(alias, name) \
- vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #alias, (unsigned long)&name)
-#define VMCOREINFO_STRUCT_SIZE(name) \
- vmcoreinfo_append_str("SIZE(%s)=%zu\n", #name, sizeof(struct name))
-#define VMCOREINFO_OFFSET(name, field) \
- vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
- (unsigned long)offsetof(struct name, field))
-#define VMCOREINFO_OFFSET_ALIAS(name, field, alias) \
- vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #alias, \
- (unsigned long)offsetof(struct name, field))
-
#endif /* _XEN_PUBLIC_KEXEC_H */
/*
diff --git a/xen/include/xen/hvm/irq.h b/xen/include/xen/hvm/irq.h
index e77239f290..a89e2e9f88 100644
--- a/xen/include/xen/hvm/irq.h
+++ b/xen/include/xen/hvm/irq.h
@@ -63,7 +63,7 @@ struct hvm_girq_dpci_mapping {
/* Protected by domain's event_lock */
struct hvm_irq_dpci {
/* Machine IRQ to guest device/intx mapping. */
- DECLARE_BITMAP(mapping, NR_PIRQS);
+ DECLARE_BITMAP(mapping, NR_IRQS);
struct hvm_mirq_dpci_mapping mirq[NR_IRQS];
/* Guest IRQ to guest device/intx mapping. */
struct hvm_girq_dpci_mapping girq[NR_IRQS];
diff --git a/xen/include/xen/hypercall.h b/xen/include/xen/hypercall.h
index 7d58109ec2..3997b2f96a 100644
--- a/xen/include/xen/hypercall.h
+++ b/xen/include/xen/hypercall.h
@@ -124,6 +124,12 @@ compat_memory_op(
unsigned int cmd,
XEN_GUEST_HANDLE(void) arg);
+extern int
+compat_vcpu_op(
+ int cmd,
+ int vcpuid,
+ XEN_GUEST_HANDLE(void) arg);
+
#endif
#endif /* __XEN_HYPERCALL_H__ */
diff --git a/xen/include/xen/iommu.h b/xen/include/xen/iommu.h
index f230df7b8e..d68b41200c 100644
--- a/xen/include/xen/iommu.h
+++ b/xen/include/xen/iommu.h
@@ -110,7 +110,7 @@ struct iommu_ops {
void iommu_update_ire_from_apic(unsigned int apic, unsigned int reg, unsigned int value);
void iommu_update_ire_from_msi(struct msi_desc *msi_desc, struct msi_msg *msg);
-int iommu_suspend(void);
-int iommu_resume(void);
+void iommu_suspend(void);
+void iommu_resume(void);
#endif /* _IOMMU_H_ */
diff --git a/xen/include/xen/irq.h b/xen/include/xen/irq.h
index a4dd3f6333..5b88079ddb 100644
--- a/xen/include/xen/irq.h
+++ b/xen/include/xen/irq.h
@@ -61,7 +61,7 @@ typedef struct {
cpumask_t affinity;
} __cacheline_aligned irq_desc_t;
-extern irq_desc_t irq_desc[NR_IRQS];
+extern irq_desc_t irq_desc[NR_VECTORS];
extern int setup_irq(unsigned int, struct irqaction *);
extern void free_irq(unsigned int);
@@ -81,13 +81,16 @@ extern void pirq_guest_unbind(struct domain *d, int irq);
extern irq_desc_t *domain_spin_lock_irq_desc(
struct domain *d, int irq, unsigned long *pflags);
-static inline void set_native_irq_info(int irq, cpumask_t mask)
+static inline void set_native_irq_info(unsigned int vector, cpumask_t mask)
{
- irq_desc[irq].affinity = mask;
+ irq_desc[vector].affinity = mask;
}
+#ifdef irq_to_vector
static inline void set_irq_info(int irq, cpumask_t mask)
{
- set_native_irq_info(irq, mask);
+ set_native_irq_info(irq_to_vector(irq), mask);
}
+#endif
+
#endif /* __XEN_IRQ_H__ */
diff --git a/xen/include/xen/kexec.h b/xen/include/xen/kexec.h
index 9dc3dacac0..d78510e639 100644
--- a/xen/include/xen/kexec.h
+++ b/xen/include/xen/kexec.h
@@ -33,6 +33,27 @@ crash_xen_info_t *kexec_crash_save_info(void);
void machine_crash_shutdown(void);
int machine_kexec_get(xen_kexec_range_t *range);
+/* vmcoreinfo stuff */
+#define VMCOREINFO_BYTES (4096)
+#define VMCOREINFO_NOTE_NAME "VMCOREINFO_XEN"
+void arch_crash_save_vmcoreinfo(void);
+void vmcoreinfo_append_str(const char *fmt, ...)
+ __attribute__ ((format (printf, 1, 2)));
+#define VMCOREINFO_PAGESIZE(value) \
+ vmcoreinfo_append_str("PAGESIZE=%ld\n", value)
+#define VMCOREINFO_SYMBOL(name) \
+ vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name)
+#define VMCOREINFO_SYMBOL_ALIAS(alias, name) \
+ vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #alias, (unsigned long)&name)
+#define VMCOREINFO_STRUCT_SIZE(name) \
+ vmcoreinfo_append_str("SIZE(%s)=%zu\n", #name, sizeof(struct name))
+#define VMCOREINFO_OFFSET(name, field) \
+ vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
+ (unsigned long)offsetof(struct name, field))
+#define VMCOREINFO_OFFSET_ALIAS(name, field, alias) \
+ vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #alias, \
+ (unsigned long)offsetof(struct name, field))
+
#endif /* __XEN_KEXEC_H__ */
/*