diff options
141 files changed, 2368 insertions, 1477 deletions
diff --git a/extras/mini-os/include/posix/net/if.h b/extras/mini-os/include/posix/net/if.h new file mode 100644 index 0000000000..5be77d4f49 --- /dev/null +++ b/extras/mini-os/include/posix/net/if.h @@ -0,0 +1,85 @@ +/* + * This code is mostly taken from NetBSD net/if.h + * Changes: Stefano Stabellini <stefano.stabellini@eu.citrix.com> + * + ****************************************************************************** + * + * Copyright (c) 1999, 2000, 2001 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by William Studenmund and Jason R. Thorpe. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#ifndef _NET_IF_H_ +#define _NET_IF_H_ + +/* + * Length of interface external name, including terminating '\0'. + * Note: this is the same size as a generic device's external name. + */ +#define IF_NAMESIZE 16 + +struct if_nameindex { + unsigned int if_index; /* 1, 2, ... */ + char *if_name; /* null terminated name: "le0", ... 
*/ +}; + +unsigned int if_nametoindex(const char *); +char * if_indextoname(unsigned int, char *); +struct if_nameindex * if_nameindex(void); +void if_freenameindex(struct if_nameindex *); + +#endif /* !_NET_IF_H_ */ + diff --git a/extras/mini-os/lib/sys.c b/extras/mini-os/lib/sys.c index 34e4fb6666..a07692b883 100644 --- a/extras/mini-os/lib/sys.c +++ b/extras/mini-os/lib/sys.c @@ -34,6 +34,7 @@ #include <sys/unistd.h> #include <sys/stat.h> #include <sys/mman.h> +#include <net/if.h> #include <time.h> #include <errno.h> #include <fcntl.h> @@ -1324,6 +1325,12 @@ unsupported_function(int, tcsetattr, -1); unsupported_function(int, tcgetattr, 0); unsupported_function(int, poll, -1); +/* net/if.h */ +unsupported_function_log(unsigned int, if_nametoindex, -1); +unsupported_function_log(char *, if_indextoname, (char *) NULL); +unsupported_function_log(struct if_nameindex *, if_nameindex, (struct if_nameindex *) NULL); +unsupported_function_crash(if_freenameindex); + /* Linuxish abi for the Caml runtime, don't support */ unsupported_function_log(struct dirent *, readdir64, NULL); unsupported_function_log(int, getrusage, -1); diff --git a/tools/Makefile b/tools/Makefile index 09964323e9..0d872b6fa4 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -9,6 +9,7 @@ SUBDIRS-y += flask SUBDIRS-y += xenstore SUBDIRS-y += misc SUBDIRS-y += examples +SUBDIRS-y += hotplug SUBDIRS-y += xentrace SUBDIRS-$(CONFIG_XCUTILS) += xcutils SUBDIRS-$(CONFIG_X86) += firmware diff --git a/tools/examples/Makefile b/tools/examples/Makefile index 39310394f5..6c8349b154 100644 --- a/tools/examples/Makefile +++ b/tools/examples/Makefile @@ -24,41 +24,6 @@ XEN_CONFIGS += xmexample.vti XEN_CONFIGS += xend-pci-quirks.sxp XEN_CONFIGS += xend-pci-permissive.sxp -# Xen script dir and scripts to go there. 
-XEN_SCRIPT_DIR = /etc/xen/scripts -XEN_SCRIPTS = network-bridge vif-bridge -XEN_SCRIPTS += network-route vif-route -XEN_SCRIPTS += network-nat vif-nat -XEN_SCRIPTS += block -XEN_SCRIPTS += block-enbd block-nbd -XEN_SCRIPTS += blktap -XEN_SCRIPTS += vtpm vtpm-delete -XEN_SCRIPTS += xen-hotplug-cleanup -XEN_SCRIPTS += external-device-migrate -XEN_SCRIPTS += vscsi -XEN_SCRIPT_DATA = xen-script-common.sh locking.sh logging.sh -XEN_SCRIPT_DATA += xen-hotplug-common.sh xen-network-common.sh vif-common.sh -XEN_SCRIPT_DATA += block-common.sh vtpm-common.sh vtpm-hotplug-common.sh -XEN_SCRIPT_DATA += vtpm-migration.sh vtpm-impl - -XEN_HOTPLUG_DIR = /etc/hotplug -XEN_HOTPLUG_SCRIPTS = xen-backend.agent - -UDEV_RULES_DIR = /etc/udev -UDEV_RULES = xen-backend.rules - -DI = $(if $(DISTDIR),$(shell readlink -f $(DISTDIR)),) -DE = $(if $(DESTDIR),$(shell readlink -f $(DESTDIR)),) -ifeq ($(findstring $(DI),$(DE)),$(DI)) -HOTPLUGS=install-hotplug install-udev -else -ifeq ($(shell [ -x /usr/bin/udevinfo ] && [ `/usr/bin/udevinfo -V | sed -e 's/^[^0-9]* \([0-9]\{1,\}\)[^0-9]\{0,\}/\1/'` -ge 059 ] && echo 1),1) -HOTPLUGS=install-udev -else -HOTPLUGS=install-hotplug -endif -endif - .PHONY: all all: @@ -66,7 +31,7 @@ all: build: .PHONY: install -install: all install-readmes install-initd install-configs install-scripts $(HOTPLUGS) +install: all install-readmes install-configs $(HOTPLUGS) .PHONY: install-readmes install-readmes: @@ -77,14 +42,6 @@ install-readmes: $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_CONFIG_DIR); \ done -.PHONY: install-initd -install-initd: - [ -d $(DESTDIR)/etc/init.d ] || $(INSTALL_DIR) $(DESTDIR)/etc/init.d - [ -d $(DESTDIR)/etc/sysconfig ] || $(INSTALL_DIR) $(DESTDIR)/etc/sysconfig - $(INSTALL_PROG) $(XEND_INITD) $(DESTDIR)/etc/init.d - $(INSTALL_PROG) $(XENDOMAINS_INITD) $(DESTDIR)/etc/init.d - $(INSTALL_PROG) $(XENDOMAINS_SYSCONFIG) $(DESTDIR)/etc/sysconfig/xendomains - .PHONY: install-configs install-configs: $(XEN_CONFIGS) [ -d $(DESTDIR)$(XEN_CONFIG_DIR) ] || \ 
@@ -96,19 +53,6 @@ install-configs: $(XEN_CONFIGS) $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_CONFIG_DIR); \ done -.PHONY: install-scripts -install-scripts: - [ -d $(DESTDIR)$(XEN_SCRIPT_DIR) ] || \ - $(INSTALL_DIR) $(DESTDIR)$(XEN_SCRIPT_DIR) - set -e; for i in $(XEN_SCRIPTS); \ - do \ - $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \ - done - set -e; for i in $(XEN_SCRIPT_DATA); \ - do \ - $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \ - done - .PHONY: install-hotplug install-hotplug: [ -d $(DESTDIR)$(XEN_HOTPLUG_DIR) ] || \ diff --git a/tools/firmware/hvmloader/config.h b/tools/firmware/hvmloader/config.h index 32011cd5a1..ea0c435ae4 100644 --- a/tools/firmware/hvmloader/config.h +++ b/tools/firmware/hvmloader/config.h @@ -23,7 +23,6 @@ /* Memory map. */ #define HYPERCALL_PHYSICAL_ADDRESS 0x00080000 #define VGABIOS_PHYSICAL_ADDRESS 0x000C0000 -#define ETHERBOOT_PHYSICAL_ADDRESS 0x000D0000 #define SMBIOS_PHYSICAL_ADDRESS 0x000E9000 #define SMBIOS_MAXIMUM_SIZE 0x00001000 #define ACPI_PHYSICAL_ADDRESS 0x000EA000 diff --git a/tools/firmware/hvmloader/hvmloader.c b/tools/firmware/hvmloader/hvmloader.c index 9dff7cc08d..ec066cbf68 100644 --- a/tools/firmware/hvmloader/hvmloader.c +++ b/tools/firmware/hvmloader/hvmloader.c @@ -322,60 +322,56 @@ static void pci_setup(void) } /* - * Scan the PCI bus for the first NIC supported by etherboot, and copy - * the corresponding rom data to *copy_rom_dest. Returns the length of the - * selected rom, or 0 if no NIC found. + * Scan the list of Option ROMs at @roms for one which supports + * PCI (@vendor_id, @device_id) found at slot @devfn. If one is found, + * copy it to @dest and return its size rounded up to a multiple 2kB. This + * function will not copy ROMs beyond address 0xE0000. 
*/ -static int scan_etherboot_nic(void *copy_rom_dest) +#define round_option_rom(x) (((x) + 2047) & ~2047) +static int scan_option_rom( + uint8_t devfn, uint16_t vendor_id, uint16_t device_id, + void *roms, uint32_t dest) { struct option_rom_header *rom; struct option_rom_pnp_header *pnph; struct option_rom_pci_header *pcih; - uint32_t devfn; - uint16_t class, vendor_id, device_id; uint8_t csum; int i; - for ( devfn = 0; devfn < 128; devfn++ ) - { - class = pci_readw(devfn, PCI_CLASS_DEVICE); - vendor_id = pci_readw(devfn, PCI_VENDOR_ID); - device_id = pci_readw(devfn, PCI_DEVICE_ID); + static uint32_t orom_ids[64]; + static int nr_roms; - if ( (vendor_id == 0xffff) && (device_id == 0xffff) ) - continue; + /* Avoid duplicate ROMs. */ + for ( i = 0; i < nr_roms; i++ ) + if ( orom_ids[i] == (vendor_id | ((uint32_t)device_id << 16)) ) + return 0; - /* We're only interested in NICs. */ - if ( class != 0x0200 ) - continue; + rom = roms; + for ( ; ; ) + { + /* Invalid signature means we're out of option ROMs. */ + if ( strncmp((char *)rom->signature, "\x55\xaa", 2) || + (rom->rom_size == 0) ) + break; - rom = (struct option_rom_header *)etherboot; - for ( ; ; ) - { - /* Invalid signature means we're out of option ROMs. */ - if ( strncmp((char *)rom->signature, "\x55\xaa", 2) || - (rom->rom_size == 0) ) - break; - - /* Invalid checksum means we're out of option ROMs. */ - csum = 0; - for ( i = 0; i < (rom->rom_size * 512); i++ ) - csum += ((uint8_t *)rom)[i]; - if ( csum != 0 ) - break; - - /* Check the PCI PnP header (if any) for a match. */ - pcih = (struct option_rom_pci_header *) - ((char *)rom + rom->pci_header_offset); - if ( (rom->pci_header_offset != 0) && - !strncmp((char *)pcih->signature, "PCIR", 4) && - (pcih->vendor_id == vendor_id) && - (pcih->device_id == device_id) ) - goto found; - - rom = (struct option_rom_header *) - ((char *)rom + rom->rom_size * 512); - } + /* Invalid checksum means we're out of option ROMs. 
*/ + csum = 0; + for ( i = 0; i < (rom->rom_size * 512); i++ ) + csum += ((uint8_t *)rom)[i]; + if ( csum != 0 ) + break; + + /* Check the PCI PnP header (if any) for a match. */ + pcih = (struct option_rom_pci_header *) + ((char *)rom + rom->pci_header_offset); + if ( (rom->pci_header_offset != 0) && + !strncmp((char *)pcih->signature, "PCIR", 4) && + (pcih->vendor_id == vendor_id) && + (pcih->device_id == device_id) ) + goto found; + + rom = (struct option_rom_header *) + ((char *)rom + rom->rom_size * 512); } return 0; @@ -392,15 +388,96 @@ static int scan_etherboot_nic(void *copy_rom_dest) ((char *)rom + pnph->next_header_offset)) : ((struct option_rom_pnp_header *)NULL)); - printf("Loading PXE ROM ...\n"); + printf("Loading PCI Option ROM ...\n"); if ( (pnph != NULL) && (pnph->manufacturer_name_offset != 0) ) printf(" - Manufacturer: %s\n", (char *)rom + pnph->manufacturer_name_offset); if ( (pnph != NULL) && (pnph->product_name_offset != 0) ) printf(" - Product name: %s\n", (char *)rom + pnph->product_name_offset); - memcpy(copy_rom_dest, rom, rom->rom_size * 512); - return rom->rom_size * 512; + + if ( (dest + rom->rom_size * 512 + 1) > 0xe0000u ) + { + printf("Option ROM size %x exceeds available space\n", + rom->rom_size * 512); + return 0; + } + + orom_ids[nr_roms++] = vendor_id | ((uint32_t)device_id << 16); + memcpy((void *)dest, rom, rom->rom_size * 512); + *(uint8_t *)(dest + rom->rom_size * 512) = devfn; + return round_option_rom(rom->rom_size * 512 + 1); +} + +/* + * Scan the PCI bus for the first NIC supported by etherboot, and copy + * the corresponding rom data to *copy_rom_dest. Returns the length of the + * selected rom, or 0 if no NIC found. 
+ */ +static int scan_etherboot_nic(uint32_t copy_rom_dest) +{ + uint8_t devfn; + uint16_t class, vendor_id, device_id; + + for ( devfn = 0; devfn < 128; devfn++ ) + { + class = pci_readw(devfn, PCI_CLASS_DEVICE); + vendor_id = pci_readw(devfn, PCI_VENDOR_ID); + device_id = pci_readw(devfn, PCI_DEVICE_ID); + + /* We're only interested in NICs. */ + if ( (vendor_id != 0xffff) && + (device_id != 0xffff) && + (class == 0x0200) ) + return scan_option_rom( + devfn, vendor_id, device_id, etherboot, copy_rom_dest); + } + + return 0; +} + +/* + * Scan the PCI bus for the devices that have an option ROM, and copy + * the corresponding rom data to rom_phys_addr. + */ +static int pci_load_option_roms(uint32_t rom_base_addr) +{ + uint32_t option_rom_addr, rom_phys_addr = rom_base_addr; + uint16_t vendor_id, device_id; + uint8_t devfn, class; + + for ( devfn = 0; devfn < 128; devfn++ ) + { + class = pci_readb(devfn, PCI_CLASS_DEVICE + 1); + vendor_id = pci_readw(devfn, PCI_VENDOR_ID); + device_id = pci_readw(devfn, PCI_DEVICE_ID); + + if ( (vendor_id == 0xffff) && (device_id == 0xffff) ) + continue; + + /* + * Currently only scan options from mass storage devices and serial + * bus controller (Fibre Channel included). + */ + if ( (class != 0x1) && (class != 0xc) ) + continue; + + option_rom_addr = pci_readl(devfn, PCI_ROM_ADDRESS); + if ( !option_rom_addr ) + continue; + + /* Ensure Expansion Bar is enabled before copying */ + pci_writel(devfn, PCI_ROM_ADDRESS, option_rom_addr | 0x1); + + rom_phys_addr += scan_option_rom( + devfn, vendor_id, device_id, + (void *)(option_rom_addr & ~2047), rom_phys_addr); + + /* Restore the default original value of Expansion Bar */ + pci_writel(devfn, PCI_ROM_ADDRESS, option_rom_addr); + } + + return rom_phys_addr - rom_base_addr; } /* Replace possibly erroneous memory-size CMOS fields with correct values. 
*/ @@ -461,8 +538,9 @@ static uint16_t init_xen_platform_io_base(void) int main(void) { - int vgabios_sz = 0, etherboot_sz = 0, rombios_sz, smbios_sz; - uint32_t vga_ram = 0; + int option_rom_sz = 0, vgabios_sz = 0, etherboot_sz = 0; + int rombios_sz, smbios_sz; + uint32_t etherboot_phys_addr, option_rom_phys_addr, vga_ram = 0; uint16_t xen_pfiob; printf("HVM Loader\n"); @@ -497,13 +575,13 @@ int main(void) printf("Loading Cirrus VGABIOS ...\n"); memcpy((void *)VGABIOS_PHYSICAL_ADDRESS, vgabios_cirrusvga, sizeof(vgabios_cirrusvga)); - vgabios_sz = sizeof(vgabios_cirrusvga); + vgabios_sz = round_option_rom(sizeof(vgabios_cirrusvga)); break; case VGA_std: printf("Loading Standard VGABIOS ...\n"); memcpy((void *)VGABIOS_PHYSICAL_ADDRESS, vgabios_stdvga, sizeof(vgabios_stdvga)); - vgabios_sz = sizeof(vgabios_stdvga); + vgabios_sz = round_option_rom(sizeof(vgabios_stdvga)); break; default: printf("No emulated VGA adaptor ...\n"); @@ -516,7 +594,11 @@ int main(void) printf("VGA RAM at %08x\n", vga_ram); } - etherboot_sz = scan_etherboot_nic((void*)ETHERBOOT_PHYSICAL_ADDRESS); + etherboot_phys_addr = VGABIOS_PHYSICAL_ADDRESS + vgabios_sz; + etherboot_sz = scan_etherboot_nic(etherboot_phys_addr); + + option_rom_phys_addr = etherboot_phys_addr + etherboot_sz; + option_rom_sz = pci_load_option_roms(option_rom_phys_addr); if ( get_acpi_enabled() ) { @@ -533,8 +615,12 @@ int main(void) VGABIOS_PHYSICAL_ADDRESS + vgabios_sz - 1); if ( etherboot_sz ) printf(" %05x-%05x: Etherboot ROM\n", - ETHERBOOT_PHYSICAL_ADDRESS, - ETHERBOOT_PHYSICAL_ADDRESS + etherboot_sz - 1); + etherboot_phys_addr, + etherboot_phys_addr + etherboot_sz - 1); + if ( option_rom_sz ) + printf(" %05x-%05x: PCI Option ROMs\n", + option_rom_phys_addr, + option_rom_phys_addr + option_rom_sz - 1); if ( smbios_sz ) printf(" %05x-%05x: SMBIOS tables\n", SMBIOS_PHYSICAL_ADDRESS, diff --git a/tools/firmware/rombios/rombios.c b/tools/firmware/rombios/rombios.c index 0edd371765..05ee9875c6 100644 --- 
a/tools/firmware/rombios/rombios.c +++ b/tools/firmware/rombios/rombios.c @@ -9677,20 +9677,35 @@ block_count_rounded: pop ds pop ax #endif - xor bx, bx ;; Restore DS back to 0000: - mov ds, bx push ax ;; Save AX push di ;; Save DI ;; Push addr of ROM entry point push cx ;; Push seg push #0x0003 ;; Push offset + ;; Get the BDF into ax before invoking the option ROM + mov bl, [2] + mov al, bl + shr al, #7 + cmp al, #1 + jne fetch_bdf + mov ax, ds ;; Increment the DS since rom size larger than an segment + add ax, #0x1000 + mov ds, ax +fetch_bdf: + shl bx, #9 + xor ax, ax + mov al, [bx] + ;; Point ES:DI at "$PnP", which tells the ROM that we are a PnP BIOS. ;; That should stop it grabbing INT 19h; we will use its BEV instead. - mov ax, #0xf000 - mov es, ax + mov bx, #0xf000 + mov es, bx lea di, pnp_string + xor bx, bx ;; Restore DS back to 0000: + mov ds, bx + mov bp, sp ;; Call ROM init routine using seg:off on stack db 0xff ;; call_far ss:[bp+0] db 0x5e diff --git a/tools/hotplug/Linux/Makefile b/tools/hotplug/Linux/Makefile new file mode 100644 index 0000000000..19ba78ffdb --- /dev/null +++ b/tools/hotplug/Linux/Makefile @@ -0,0 +1,97 @@ +XEN_ROOT = ../../../ +include $(XEN_ROOT)/tools/Rules.mk + +# Init scripts. +XEND_INITD = init.d/xend +XENDOMAINS_INITD = init.d/xendomains +XENDOMAINS_SYSCONFIG = init.d/sysconfig.xendomains + +# Xen configuration dir and configs to go there. +XEN_CONFIG_DIR = /etc/xen + +# Xen script dir and scripts to go there. 
+XEN_SCRIPT_DIR = /etc/xen/scripts +XEN_SCRIPTS = network-bridge vif-bridge +XEN_SCRIPTS += network-route vif-route +XEN_SCRIPTS += network-nat vif-nat +XEN_SCRIPTS += block +XEN_SCRIPTS += block-enbd block-nbd +XEN_SCRIPTS += blktap +XEN_SCRIPTS += vtpm vtpm-delete +XEN_SCRIPTS += xen-hotplug-cleanup +XEN_SCRIPTS += external-device-migrate +XEN_SCRIPTS += vscsi +XEN_SCRIPT_DATA = xen-script-common.sh locking.sh logging.sh +XEN_SCRIPT_DATA += xen-hotplug-common.sh xen-network-common.sh vif-common.sh +XEN_SCRIPT_DATA += block-common.sh vtpm-common.sh vtpm-hotplug-common.sh +XEN_SCRIPT_DATA += vtpm-migration.sh vtpm-impl + +XEN_HOTPLUG_DIR = /etc/hotplug +XEN_HOTPLUG_SCRIPTS = xen-backend.agent + +UDEV_RULES_DIR = /etc/udev +UDEV_RULES = xen-backend.rules + +DI = $(if $(DISTDIR),$(shell readlink -f $(DISTDIR)),) +DE = $(if $(DESTDIR),$(shell readlink -f $(DESTDIR)),) +ifeq ($(findstring $(DI),$(DE)),$(DI)) +HOTPLUGS=install-hotplug install-udev +else +ifeq ($(shell [ -x /usr/bin/udevinfo ] && [ `/usr/bin/udevinfo -V | sed -e 's/^[^0-9]* \([0-9]\{1,\}\)[^0-9]\{0,\}/\1/'` -ge 059 ] && echo 1),1) +HOTPLUGS=install-udev +else +HOTPLUGS=install-hotplug +endif +endif + +.PHONY: all +all: + +.PHONY: build +build: + +.PHONY: install +install: all install-initd install-scripts $(HOTPLUGS) + +.PHONY: install-initd +install-initd: + [ -d $(DESTDIR)/etc/init.d ] || $(INSTALL_DIR) $(DESTDIR)/etc/init.d + [ -d $(DESTDIR)/etc/sysconfig ] || $(INSTALL_DIR) $(DESTDIR)/etc/sysconfig + $(INSTALL_PROG) $(XEND_INITD) $(DESTDIR)/etc/init.d + $(INSTALL_PROG) $(XENDOMAINS_INITD) $(DESTDIR)/etc/init.d + $(INSTALL_PROG) $(XENDOMAINS_SYSCONFIG) $(DESTDIR)/etc/sysconfig/xendomains + +.PHONY: install-scripts +install-scripts: + [ -d $(DESTDIR)$(XEN_SCRIPT_DIR) ] || \ + $(INSTALL_DIR) $(DESTDIR)$(XEN_SCRIPT_DIR) + set -e; for i in $(XEN_SCRIPTS); \ + do \ + $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \ + done + set -e; for i in $(XEN_SCRIPT_DATA); \ + do \ + $(INSTALL_DATA) $$i 
$(DESTDIR)$(XEN_SCRIPT_DIR); \ + done + +.PHONY: install-hotplug +install-hotplug: + [ -d $(DESTDIR)$(XEN_HOTPLUG_DIR) ] || \ + $(INSTALL_DIR) $(DESTDIR)$(XEN_HOTPLUG_DIR) + set -e; for i in $(XEN_HOTPLUG_SCRIPTS); \ + do \ + $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_HOTPLUG_DIR); \ + done + +.PHONY: install-udev +install-udev: + [ -d $(DESTDIR)$(UDEV_RULES_DIR) ] || \ + $(INSTALL_DIR) $(DESTDIR)$(UDEV_RULES_DIR)/rules.d + set -e; for i in $(UDEV_RULES); \ + do \ + $(INSTALL_DATA) $$i $(DESTDIR)$(UDEV_RULES_DIR); \ + ln -sf ../$$i $(DESTDIR)$(UDEV_RULES_DIR)/rules.d; \ + done + +.PHONY: clean +clean: diff --git a/tools/examples/blktap b/tools/hotplug/Linux/blktap index 01a0f6c6da..01a0f6c6da 100644 --- a/tools/examples/blktap +++ b/tools/hotplug/Linux/blktap diff --git a/tools/examples/block b/tools/hotplug/Linux/block index 8c61744c83..8c61744c83 100644 --- a/tools/examples/block +++ b/tools/hotplug/Linux/block diff --git a/tools/examples/block-common.sh b/tools/hotplug/Linux/block-common.sh index a0ebc9b12a..a0ebc9b12a 100644 --- a/tools/examples/block-common.sh +++ b/tools/hotplug/Linux/block-common.sh diff --git a/tools/examples/block-enbd b/tools/hotplug/Linux/block-enbd index 67faa84268..67faa84268 100755..100644 --- a/tools/examples/block-enbd +++ b/tools/hotplug/Linux/block-enbd diff --git a/tools/examples/block-nbd b/tools/hotplug/Linux/block-nbd index b29b31564a..b29b31564a 100644 --- a/tools/examples/block-nbd +++ b/tools/hotplug/Linux/block-nbd diff --git a/tools/examples/external-device-migrate b/tools/hotplug/Linux/external-device-migrate index a4113483a8..a4113483a8 100644 --- a/tools/examples/external-device-migrate +++ b/tools/hotplug/Linux/external-device-migrate diff --git a/tools/examples/init.d/sysconfig.xendomains b/tools/hotplug/Linux/init.d/sysconfig.xendomains index e93b1a40b9..e93b1a40b9 100644 --- a/tools/examples/init.d/sysconfig.xendomains +++ b/tools/hotplug/Linux/init.d/sysconfig.xendomains diff --git a/tools/examples/init.d/xend 
b/tools/hotplug/Linux/init.d/xend index 4bfc799465..4bfc799465 100755 --- a/tools/examples/init.d/xend +++ b/tools/hotplug/Linux/init.d/xend diff --git a/tools/examples/init.d/xendomains b/tools/hotplug/Linux/init.d/xendomains index 5c2e492f03..5c2e492f03 100644 --- a/tools/examples/init.d/xendomains +++ b/tools/hotplug/Linux/init.d/xendomains diff --git a/tools/examples/locking.sh b/tools/hotplug/Linux/locking.sh index 6ff58e7e6c..6ff58e7e6c 100644 --- a/tools/examples/locking.sh +++ b/tools/hotplug/Linux/locking.sh diff --git a/tools/examples/logging.sh b/tools/hotplug/Linux/logging.sh index c1bc699c7b..c1bc699c7b 100644 --- a/tools/examples/logging.sh +++ b/tools/hotplug/Linux/logging.sh diff --git a/tools/examples/network-bridge b/tools/hotplug/Linux/network-bridge index 9d7be4e2e5..9d7be4e2e5 100755..100644 --- a/tools/examples/network-bridge +++ b/tools/hotplug/Linux/network-bridge diff --git a/tools/examples/network-nat b/tools/hotplug/Linux/network-nat index d9c62c6160..d9c62c6160 100644 --- a/tools/examples/network-nat +++ b/tools/hotplug/Linux/network-nat diff --git a/tools/examples/network-route b/tools/hotplug/Linux/network-route index 574441e334..574441e334 100755..100644 --- a/tools/examples/network-route +++ b/tools/hotplug/Linux/network-route diff --git a/tools/examples/vif-bridge b/tools/hotplug/Linux/vif-bridge index 1b698d703b..1b698d703b 100755..100644 --- a/tools/examples/vif-bridge +++ b/tools/hotplug/Linux/vif-bridge diff --git a/tools/examples/vif-common.sh b/tools/hotplug/Linux/vif-common.sh index ee67ee2aaa..ee67ee2aaa 100644 --- a/tools/examples/vif-common.sh +++ b/tools/hotplug/Linux/vif-common.sh diff --git a/tools/examples/vif-nat b/tools/hotplug/Linux/vif-nat index 75bdf5c444..75bdf5c444 100644 --- a/tools/examples/vif-nat +++ b/tools/hotplug/Linux/vif-nat diff --git a/tools/examples/vif-route b/tools/hotplug/Linux/vif-route index f5fd88ed5a..f5fd88ed5a 100755..100644 --- a/tools/examples/vif-route +++ b/tools/hotplug/Linux/vif-route 
diff --git a/tools/examples/vscsi b/tools/hotplug/Linux/vscsi index 5ac26147ec..5ac26147ec 100644 --- a/tools/examples/vscsi +++ b/tools/hotplug/Linux/vscsi diff --git a/tools/examples/vtpm b/tools/hotplug/Linux/vtpm index 38a4532fc2..38a4532fc2 100644 --- a/tools/examples/vtpm +++ b/tools/hotplug/Linux/vtpm diff --git a/tools/examples/vtpm-common.sh b/tools/hotplug/Linux/vtpm-common.sh index a45868eefd..a45868eefd 100644 --- a/tools/examples/vtpm-common.sh +++ b/tools/hotplug/Linux/vtpm-common.sh diff --git a/tools/examples/vtpm-delete b/tools/hotplug/Linux/vtpm-delete index b75b95bf0a..b75b95bf0a 100644 --- a/tools/examples/vtpm-delete +++ b/tools/hotplug/Linux/vtpm-delete diff --git a/tools/examples/vtpm-hotplug-common.sh b/tools/hotplug/Linux/vtpm-hotplug-common.sh index 9fd35e7402..9fd35e7402 100644 --- a/tools/examples/vtpm-hotplug-common.sh +++ b/tools/hotplug/Linux/vtpm-hotplug-common.sh diff --git a/tools/examples/vtpm-impl b/tools/hotplug/Linux/vtpm-impl index 4f9a1fd85e..4f9a1fd85e 100644 --- a/tools/examples/vtpm-impl +++ b/tools/hotplug/Linux/vtpm-impl diff --git a/tools/examples/vtpm-migration.sh b/tools/hotplug/Linux/vtpm-migration.sh index 7e38ae26f0..7e38ae26f0 100644 --- a/tools/examples/vtpm-migration.sh +++ b/tools/hotplug/Linux/vtpm-migration.sh diff --git a/tools/examples/xen-backend.agent b/tools/hotplug/Linux/xen-backend.agent index 5cb536a6a9..5cb536a6a9 100755..100644 --- a/tools/examples/xen-backend.agent +++ b/tools/hotplug/Linux/xen-backend.agent diff --git a/tools/examples/xen-backend.rules b/tools/hotplug/Linux/xen-backend.rules index fe21fc1357..fe21fc1357 100644 --- a/tools/examples/xen-backend.rules +++ b/tools/hotplug/Linux/xen-backend.rules diff --git a/tools/examples/xen-hotplug-cleanup b/tools/hotplug/Linux/xen-hotplug-cleanup index f7337e45bf..f7337e45bf 100644 --- a/tools/examples/xen-hotplug-cleanup +++ b/tools/hotplug/Linux/xen-hotplug-cleanup diff --git a/tools/examples/xen-hotplug-common.sh 
b/tools/hotplug/Linux/xen-hotplug-common.sh index 980a62704e..980a62704e 100644 --- a/tools/examples/xen-hotplug-common.sh +++ b/tools/hotplug/Linux/xen-hotplug-common.sh diff --git a/tools/examples/xen-network-common.sh b/tools/hotplug/Linux/xen-network-common.sh index 7014333df0..7014333df0 100644 --- a/tools/examples/xen-network-common.sh +++ b/tools/hotplug/Linux/xen-network-common.sh diff --git a/tools/examples/xen-script-common.sh b/tools/hotplug/Linux/xen-script-common.sh index f6841acffa..f6841acffa 100644 --- a/tools/examples/xen-script-common.sh +++ b/tools/hotplug/Linux/xen-script-common.sh diff --git a/tools/hotplug/Makefile b/tools/hotplug/Makefile new file mode 100644 index 0000000000..979e916d7f --- /dev/null +++ b/tools/hotplug/Makefile @@ -0,0 +1,9 @@ +XEN_ROOT = ../../ +include $(XEN_ROOT)/tools/Rules.mk + +SUBDIRS-y := common +SUBDIRS-$(CONFIG_NetBSD) += NetBSD +SUBDIRS-$(CONFIG_Linux) += Linux + +.PHONY: all clean install +all clean install: %: subdirs-% diff --git a/tools/hotplug/NetBSD/Makefile b/tools/hotplug/NetBSD/Makefile new file mode 100644 index 0000000000..1d369eaf9a --- /dev/null +++ b/tools/hotplug/NetBSD/Makefile @@ -0,0 +1,41 @@ +XEN_ROOT = ../../../ +include $(XEN_ROOT)/tools/Rules.mk + +# Xen configuration dir and configs to go there. +XEN_CONFIG_DIR = $(PREFIX)/etc/xen + +# Xen script dir and scripts to go there. 
+XEN_SCRIPT_DIR = $(PREFIX)/etc/xen/scripts +XEN_SCRIPTS = +XEN_SCRIPTS += block-nbsd +XEN_SCRIPTS += hvm-nbsd +XEN_SCRIPTS += netbsd1-nbsd +XEN_SCRIPTS += qemu-ifup-nbsd +XEN_SCRIPTS += vif-bridge-nbsd +XEN_SCRIPTS += vif-ip-nbsd + +XEN_SCRIPT_DATA = + +.PHONY: all +all: + +.PHONY: build +build: + +.PHONY: install +install: all install-scripts + +.PHONY: install-scripts +install-scripts: + $(INSTALL_DATA_DIR) $(DESTDIR)$(XEN_SCRIPT_DIR) + set -e; for i in $(XEN_SCRIPTS); \ + do \ + $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \ + done + set -e; for i in $(XEN_SCRIPT_DATA); \ + do \ + $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \ + done + +.PHONY: clean +clean: diff --git a/tools/hotplug/NetBSD/block-nbsd b/tools/hotplug/NetBSD/block-nbsd new file mode 100644 index 0000000000..915ddb755a --- /dev/null +++ b/tools/hotplug/NetBSD/block-nbsd @@ -0,0 +1,88 @@ +#!/bin/sh -e + +# $NetBSD: block-nbsd,v 1.1.1.1 2008/08/07 20:26:57 cegger Exp $ +# Called by xenbackendd +# Usage: block xsdir_backend_path state + +PATH=/bin:/usr/bin:/sbin:/usr/sbin +export PATH + +error() { + echo "$@" >&2 + xenstore_write $xpath/hotplug-status error + exit 1 +} + + +xpath=$1 +xstatus=$2 +xtype=$(xenstore-read "$xpath/type") +xparams=$(xenstore-read "$xpath/params") + +case $xstatus in +6) + # device removed + case $xtype in + file) + vnd=$(xenstore-read "$xpath/vnd" || echo none) + if [ $vnd != none ]; then + vnconfig -u $vnd + fi + ;; + phy) + ;; + *) + echo "unknown type $xtype" >&2 + ;; + esac + xenstore-rm $xpath + exit 0 + ;; +2) + case $xtype in + file) + # Store the list of available vnd(4) devices in + #``available_disks'', and mark them as ``free''. + list=`ls -1 /dev/vnd[0-9]*d | sed "s,/dev/vnd,,;s,d,," | sort -n` + for i in $list; do + disk="vnd$i" + available_disks="$available_disks $disk" + eval $disk=free + done + # Mark the used vnd(4) devices as ``used''. 
+ for disk in `sysctl hw.disknames`; do + case $disk in + vnd[0-9]*) eval $disk=used ;; + esac + done + # Configure the first free vnd(4) device. + for disk in $available_disks; do + eval status=\$$disk + if [ "$status" = "free" ] && \ + vnconfig /dev/${disk}d $xparams >/dev/null; then + device=/dev/${disk}d + echo vnconfig /dev/${disk}d $xparams + break + fi + done + if [ x$device = x ] ; then + error "no available vnd device" + fi + echo xenstore-write $xpath/vnd $device + xenstore-write $xpath/vnd $device + ;; + phy) + device=$xparams + ;; + esac + physical_device=$(stat -f '%r' "$device") + echo xenstore-write $xpath/physical-device $physical_device + xenstore-write $xpath/physical-device $physical_device + echo xenstore-write $xpath/hotplug-status connected + xenstore-write $xpath/hotplug-status connected + exit 0 + ;; +*) + exit 0 + ;; +esac diff --git a/tools/hotplug/NetBSD/qemu-ifup-nbsd b/tools/hotplug/NetBSD/qemu-ifup-nbsd new file mode 100644 index 0000000000..eee78765d6 --- /dev/null +++ b/tools/hotplug/NetBSD/qemu-ifup-nbsd @@ -0,0 +1,3 @@ +#!/bin/sh +ifconfig $1 up +exec /sbin/brconfig $2 add $1 diff --git a/tools/hotplug/NetBSD/vif-bridge-nbsd b/tools/hotplug/NetBSD/vif-bridge-nbsd new file mode 100644 index 0000000000..bedb387953 --- /dev/null +++ b/tools/hotplug/NetBSD/vif-bridge-nbsd @@ -0,0 +1,35 @@ +#!/bin/sh -e + +# $NetBSD: vif-bridge-nbsd,v 1.1.1.1 2008/08/07 20:26:57 cegger Exp $ +# Called by xenbackendd +# Usage: vif-bridge xsdir_backend_path state + +PATH=/bin:/usr/bin:/sbin:/usr/sbin +export PATH + +xpath=$1 +xstatus=$2 + +case $xstatus in +6) + # device removed + xenstore-rm $xpath + exit 0 + ;; +2) + xbridge=$(xenstore-read "$xpath/bridge") + xfid=$(xenstore-read "$xpath/frontend-id") + xhandle=$(xenstore-read "$xpath/handle") + iface=xvif$xfid.$xhandle + echo ifconfig $iface up + ifconfig $iface up + brconfig $xbridge add $iface + echo brconfig $xbridge add $iface + xenstore-write $xpath/hotplug-status connected + echo xenstore-write 
$xpath/hotplug-status connected + exit 0 + ;; +*) + exit 0 + ;; +esac diff --git a/tools/hotplug/NetBSD/vif-ip-nbsd b/tools/hotplug/NetBSD/vif-ip-nbsd new file mode 100644 index 0000000000..d8b5bb9759 --- /dev/null +++ b/tools/hotplug/NetBSD/vif-ip-nbsd @@ -0,0 +1,33 @@ +#!/bin/sh -e + +# $NetBSD: vif-ip-nbsd,v 1.1.1.1 2008/08/07 20:26:57 cegger Exp $ +# Called by xenbackendd +# Usage: vif-ip xsdir_backend_path state + +PATH=/bin:/usr/bin:/sbin:/usr/sbin +export PATH + +xpath=$1 +xstatus=$2 + +case $xstatus in +6) + # device removed + xenstore-rm $xpath + exit 0 + ;; +2) + xip=$(xenstore-read "$xpath/ip") + xfid=$(xenstore-read "$xpath/frontend-id") + xhandle=$(xenstore-read "$xpath/handle") + iface=xvif$xfid.$xhandle + echo ifconfig $iface $xip up + ifconfig $iface $xip up + xenstore-write $xpath/hotplug-status connected + echo xenstore-write $xpath/hotplug-status connected + exit 0 + ;; +*) + exit 0 + ;; +esac diff --git a/tools/hotplug/common/Makefile b/tools/hotplug/common/Makefile new file mode 100644 index 0000000000..b69b9991af --- /dev/null +++ b/tools/hotplug/common/Makefile @@ -0,0 +1,37 @@ +XEN_ROOT = ../../../ +include $(XEN_ROOT)/tools/Rules.mk + +# OS-independent hotplug scripts go in this directory + +# Xen configuration dir and configs to go there. +XEN_CONFIG_DIR = /etc/xen + +# Xen script dir and scripts to go there. 
+XEN_SCRIPT_DIR = /etc/xen/scripts +XEN_SCRIPTS = +XEN_SCRIPT_DATA = + +.PHONY: all +all: + +.PHONY: build +build: + +.PHONY: install +install: all install-scripts + +.PHONY: install-scripts +install-scripts: + [ -d $(DESTDIR)$(XEN_SCRIPT_DIR) ] || \ + $(INSTALL_DIR) $(DESTDIR)$(XEN_SCRIPT_DIR) + set -e; for i in $(XEN_SCRIPTS); \ + do \ + $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \ + done + set -e; for i in $(XEN_SCRIPT_DATA); \ + do \ + $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \ + done + +.PHONY: clean +clean: diff --git a/tools/libxc/xc_cpufeature.h b/tools/libxc/xc_cpufeature.h index 6cd442cfe6..047a6c9fc7 100644 --- a/tools/libxc/xc_cpufeature.h +++ b/tools/libxc/xc_cpufeature.h @@ -83,6 +83,7 @@ #define X86_FEATURE_SSE4_1 (4*32+19) /* Streaming SIMD Extensions 4.1 */ #define X86_FEATURE_SSE4_2 (4*32+20) /* Streaming SIMD Extensions 4.2 */ #define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running under some hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ #define X86_FEATURE_XSTORE (5*32+ 2) /* on-CPU RNG present (xstore insn) */ diff --git a/tools/libxc/xc_cpuid_x86.c b/tools/libxc/xc_cpuid_x86.c index 6a8e7594c8..d75fe2b013 100644 --- a/tools/libxc/xc_cpuid_x86.c +++ b/tools/libxc/xc_cpuid_x86.c @@ -194,6 +194,8 @@ static void xc_cpuid_hvm_policy( bitmaskof(X86_FEATURE_SSE4_2) | bitmaskof(X86_FEATURE_POPCNT)); + regs[2] |= bitmaskof(X86_FEATURE_HYPERVISOR); + regs[3] &= (bitmaskof(X86_FEATURE_FPU) | bitmaskof(X86_FEATURE_VME) | bitmaskof(X86_FEATURE_DE) | @@ -309,6 +311,7 @@ static void xc_cpuid_pv_policy( clear_bit(X86_FEATURE_XTPR, regs[2]); clear_bit(X86_FEATURE_PDCM, regs[2]); clear_bit(X86_FEATURE_DCA, regs[2]); + set_bit(X86_FEATURE_HYPERVISOR, regs[2]); break; case 0x80000001: if ( !guest_64bit ) diff --git a/tools/misc/xenpm.c b/tools/misc/xenpm.c index 618aa27a84..ace72b4216 100644 --- a/tools/misc/xenpm.c +++ b/tools/misc/xenpm.c @@ 
-170,7 +170,7 @@ int main(int argc, char **argv) if ( !pxstat->pt ) { fprintf(stderr, "failed to malloc for P-states table\n"); - free(pxstat->pt); + free(pxstat->trans_pt); break; } diff --git a/tools/python/xen/util/rwlock.py b/tools/python/xen/util/rwlock.py new file mode 100644 index 0000000000..e79a82f8e8 --- /dev/null +++ b/tools/python/xen/util/rwlock.py @@ -0,0 +1,137 @@ +""" Reader-writer lock implementation based on a condition variable """ + +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2008 International Business Machines Corp. 
+# Author: Stefan Berger <stefanb@us.ibm.com> +#============================================================================ + +from threading import Condition + +class RWLock: + + RWLOCK_STATE_WRITER = -1 + RWLOCK_STATE_UNUSED = 0 + + def __init__(self): + self.__condition = Condition() + self.__state = RWLock.RWLOCK_STATE_UNUSED + self.__blocked_writers = 0 + + def acquire_reader(self): + self.__condition.acquire() + while True: + if self.__state == RWLock.RWLOCK_STATE_WRITER: + self.__condition.wait() + else: + break + self.__state += 1 + self.__condition.release() + + def acquire_writer(self): + self.__condition.acquire() + self.__acquire_writer(RWLock.RWLOCK_STATE_UNUSED) + self.__condition.release() + + def __acquire_writer(self, wait_for_state): + while True: + if self.__state == wait_for_state: + self.__state = RWLock.RWLOCK_STATE_WRITER + break + else: + self.__blocked_writers += 1 + self.__condition.wait() + self.__blocked_writers -= 1 + + def release(self): + self.__condition.acquire() + if self.__state == RWLock.RWLOCK_STATE_WRITER: + self.__state = RWLock.RWLOCK_STATE_UNUSED + elif self.__state == RWLock.RWLOCK_STATE_UNUSED: + assert False, 'Lock not in use.' 
+ else: + self.__state -= 1 + self.__condition.notifyAll() + self.__condition.release() + + +if __name__ == '__main__': + from threading import Thread + from time import sleep + + rwlock = RWLock() + + class Base(Thread): + def __init__(self, name, timeout): + self.name = name + self.timeout = timeout + Thread.__init__(self) + + class Reader(Base): + def __init__(self, name = 'Reader', timeout = 10): + Base.__init__(self, name, timeout) + + def run(self): + print '%s begin' % self.name + rwlock.acquire_reader() + print '%s acquired' % self.name + sleep(self.timeout) + rwlock.release() + print '%s end' % self.name + + class ReaderTwice(Base): + def __init__(self, name = 'Reader', timeout = 10): + Base.__init__(self, name, timeout) + + def run(self): + print '%s begin' % self.name + rwlock.acquire_reader() + print '%s acquired once' % self.name + sleep(self.timeout) + rwlock.acquire_reader() + print '%s acquired twice' % self.name + sleep(self.timeout) + rwlock.release() + rwlock.release() + print '%s end' % self.name + + class Writer(Base): + def __init__(self, name = 'Writer', timeout = 10): + Base.__init__(self, name, timeout) + + def run(self): + print '%s begin' % self.name + rwlock.acquire_writer() + print '%s acquired' % self.name + sleep(self.timeout) + rwlock.release() + print '%s end' % self.name + + def run_test(threadlist, msg): + print msg + for t in threadlist: + t.start() + sleep(1) + for t in threads: + t.join() + print 'Done\n\n' + + threads = [] + threads.append( Reader('R1', 4) ) + threads.append( Reader('R2', 4) ) + threads.append( Writer('W1', 4) ) + threads.append( Reader('R3', 4) ) + run_test(threads, + 'Test: readers may bypass blocked writers') diff --git a/tools/python/xen/xend/XendAPI.py b/tools/python/xen/xend/XendAPI.py index 42e131be37..71aac01f90 100644 --- a/tools/python/xen/xend/XendAPI.py +++ b/tools/python/xen/xend/XendAPI.py @@ -431,7 +431,7 @@ def valid_object(class_name): lambda *args, **kwargs: \ _check_ref(lambda r: \ 
XendAPIStore.get(r, class_name) is not None, - 'PIF', func, *args, **kwargs) + class_name, func, *args, **kwargs) # ----------------------------- # Bridge to Legacy XM API calls diff --git a/tools/python/xen/xend/XendConfig.py b/tools/python/xen/xend/XendConfig.py index eb868e7b29..0773fa5e57 100644 --- a/tools/python/xen/xend/XendConfig.py +++ b/tools/python/xen/xend/XendConfig.py @@ -1032,8 +1032,6 @@ class XendConfig(dict): sxpr.append([name, s]) for xenapi, legacy in XENAPI_CFG_TO_LEGACY_CFG.items(): - if legacy in ('cpus'): # skip this - continue if self.has_key(xenapi) and self[xenapi] not in (None, []): if type(self[xenapi]) == bool: # convert booleans to ints before making an sxp item diff --git a/tools/python/xen/xend/XendDomain.py b/tools/python/xen/xend/XendDomain.py index 9faebe95aa..d5195c421e 100644 --- a/tools/python/xen/xend/XendDomain.py +++ b/tools/python/xen/xend/XendDomain.py @@ -50,7 +50,7 @@ from xen.xend.XendAPIConstants import * from xen.xend.xenstore.xstransact import xstransact from xen.xend.xenstore.xswatch import xswatch -from xen.util import mkdir +from xen.util import mkdir, rwlock from xen.xend import uuid xc = xen.lowlevel.xc.xc() @@ -93,6 +93,8 @@ class XendDomain: self.managed_domains = {} self.domains_lock = threading.RLock() + self.policy_lock = rwlock.RWLock() + # xen api instance vars # TODO: nothing uses this at the moment self._allow_new_domains = True @@ -1139,16 +1141,21 @@ class XendDomain: """ try: - return XendCheckpoint.restore(self, fd, paused=paused, relocating=relocating) - except XendError, e: - log.exception("Restore failed") - raise - except: - # I don't really want to log this exception here, but the error - # handling in the relocation-socket handling code (relocate.py) is - # poor, so we need to log this for debugging. 
- log.exception("Restore failed") - raise XendError("Restore failed") + self.policy_lock.acquire_reader() + + try: + return XendCheckpoint.restore(self, fd, paused=paused, relocating=relocating) + except XendError, e: + log.exception("Restore failed") + raise + except: + # I don't really want to log this exception here, but the error + # handling in the relocation-socket handling code (relocate.py) is + # poor, so we need to log this for debugging. + log.exception("Restore failed") + raise XendError("Restore failed") + finally: + self.policy_lock.release() def domain_unpause(self, domid): """Unpause domain execution. diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py index d0ade8c858..bab9d3aef2 100644 --- a/tools/python/xen/xend/XendDomainInfo.py +++ b/tools/python/xen/xend/XendDomainInfo.py @@ -479,6 +479,14 @@ class XendDomainInfo: if state in (DOM_STATE_SUSPENDED, DOM_STATE_HALTED): try: self._constructDomain() + + try: + self._setCPUAffinity() + except: + # usually a CPU we want to set affinity to does not exist + # we just ignore it so that the domain can still be restored + log.warn("Cannot restore CPU affinity") + self._storeVmDetails() self._createChannels() self._createDevices() @@ -2166,6 +2174,64 @@ class XendDomainInfo: raise XendError(str(exn)) + def _setCPUAffinity(self): + """ Repin domain vcpus if a restricted cpus list is provided + """ + + def has_cpus(): + if self.info['cpus'] is not None: + for c in self.info['cpus']: + if c: + return True + return False + + if has_cpus(): + for v in range(0, self.info['VCPUs_max']): + if self.info['cpus'][v]: + xc.vcpu_setaffinity(self.domid, v, self.info['cpus'][v]) + else: + def find_relaxed_node(node_list): + import sys + nr_nodes = info['nr_nodes'] + if node_list is None: + node_list = range(0, nr_nodes) + nodeload = [0] + nodeload = nodeload * nr_nodes + from xen.xend import XendDomain + doms = XendDomain.instance().list('all') + for dom in filter (lambda d: 
d.domid != self.domid, doms): + cpuinfo = dom.getVCPUInfo() + for vcpu in sxp.children(cpuinfo, 'vcpu'): + if sxp.child_value(vcpu, 'online') == 0: continue + cpumap = list(sxp.child_value(vcpu,'cpumap')) + for i in range(0, nr_nodes): + node_cpumask = info['node_to_cpu'][i] + for j in node_cpumask: + if j in cpumap: + nodeload[i] += 1 + break + for i in range(0, nr_nodes): + if len(info['node_to_cpu'][i]) > 0 and i in node_list: + nodeload[i] = int(nodeload[i] * 16 / len(info['node_to_cpu'][i])) + else: + nodeload[i] = sys.maxint + index = nodeload.index( min(nodeload) ) + return index + + info = xc.physinfo() + if info['nr_nodes'] > 1: + node_memory_list = info['node_to_memory'] + needmem = self.image.getRequiredAvailableMemory(self.info['memory_dynamic_max']) / 1024 + candidate_node_list = [] + for i in range(0, info['nr_nodes']): + if node_memory_list[i] >= needmem and len(info['node_to_cpu'][i]) > 0: + candidate_node_list.append(i) + index = find_relaxed_node(candidate_node_list) + cpumask = info['node_to_cpu'][index] + for v in range(0, self.info['VCPUs_max']): + xc.vcpu_setaffinity(self.domid, v, cpumask) + + def _initDomain(self): log.debug('XendDomainInfo.initDomain: %s %s', self.domid, @@ -2185,58 +2251,7 @@ class XendDomainInfo: # repin domain vcpus if a restricted cpus list is provided # this is done prior to memory allocation to aide in memory # distribution for NUMA systems. 
- def has_cpus(): - if self.info['cpus'] is not None: - for c in self.info['cpus']: - if c: - return True - return False - - if has_cpus(): - for v in range(0, self.info['VCPUs_max']): - if self.info['cpus'][v]: - xc.vcpu_setaffinity(self.domid, v, self.info['cpus'][v]) - else: - def find_relaxed_node(node_list): - import sys - nr_nodes = info['nr_nodes'] - if node_list is None: - node_list = range(0, nr_nodes) - nodeload = [0] - nodeload = nodeload * nr_nodes - from xen.xend import XendDomain - doms = XendDomain.instance().list('all') - for dom in filter (lambda d: d.domid != self.domid, doms): - cpuinfo = dom.getVCPUInfo() - for vcpu in sxp.children(cpuinfo, 'vcpu'): - if sxp.child_value(vcpu, 'online') == 0: continue - cpumap = list(sxp.child_value(vcpu,'cpumap')) - for i in range(0, nr_nodes): - node_cpumask = info['node_to_cpu'][i] - for j in node_cpumask: - if j in cpumap: - nodeload[i] += 1 - break - for i in range(0, nr_nodes): - if len(info['node_to_cpu'][i]) > 0 and i in node_list: - nodeload[i] = int(nodeload[i] * 16 / len(info['node_to_cpu'][i])) - else: - nodeload[i] = sys.maxint - index = nodeload.index( min(nodeload) ) - return index - - info = xc.physinfo() - if info['nr_nodes'] > 1: - node_memory_list = info['node_to_memory'] - needmem = self.image.getRequiredAvailableMemory(self.info['memory_dynamic_max']) / 1024 - candidate_node_list = [] - for i in range(0, info['nr_nodes']): - if node_memory_list[i] >= needmem and len(info['node_to_cpu'][i]) > 0: - candidate_node_list.append(i) - index = find_relaxed_node(candidate_node_list) - cpumask = info['node_to_cpu'][index] - for v in range(0, self.info['VCPUs_max']): - xc.vcpu_setaffinity(self.domid, v, cpumask) + self._setCPUAffinity() # Use architecture- and image-specific calculations to determine # the various headrooms necessary, given the raw configured @@ -3011,64 +3026,69 @@ class XendDomainInfo: if not xspol: xspol = poladmin.get_policy_by_name(policy) - if state in [ DOM_STATE_RUNNING, 
DOM_STATE_PAUSED ]: - #if domain is running or paused try to relabel in hypervisor - if not xspol: - return (-xsconstants.XSERR_POLICY_NOT_LOADED, "", "", 0) - - if typ != xspol.get_type_name() or \ - policy != xspol.get_name(): - return (-xsconstants.XSERR_BAD_LABEL, "", "", 0) + try: + xen.xend.XendDomain.instance().policy_lock.acquire_writer() - if typ == xsconstants.ACM_POLICY_ID: - new_ssidref = xspol.vmlabel_to_ssidref(label) - if new_ssidref == xsconstants.INVALID_SSIDREF: - return (-xsconstants.XSERR_BAD_LABEL, "", "", 0) + if state in [ DOM_STATE_RUNNING, DOM_STATE_PAUSED ]: + #if domain is running or paused try to relabel in hypervisor + if not xspol: + return (-xsconstants.XSERR_POLICY_NOT_LOADED, "", "", 0) - # Check that all used resources are accessible under the - # new label - if not is_policy_update and \ - not security.resources_compatible_with_vmlabel(xspol, - self, label): + if typ != xspol.get_type_name() or \ + policy != xspol.get_name(): return (-xsconstants.XSERR_BAD_LABEL, "", "", 0) - #Check label against expected one. 
Can only do this - # if the policy hasn't changed underneath in the meantime - if xspol_old == None: - old_label = self.get_security_label() - if old_label != old_seclab: - log.info("old_label != old_seclab: %s != %s" % - (old_label, old_seclab)) + if typ == xsconstants.ACM_POLICY_ID: + new_ssidref = xspol.vmlabel_to_ssidref(label) + if new_ssidref == xsconstants.INVALID_SSIDREF: return (-xsconstants.XSERR_BAD_LABEL, "", "", 0) - # relabel domain in the hypervisor - rc, errors = security.relabel_domains([[domid, new_ssidref]]) - log.info("rc from relabeling in HV: %d" % rc) - else: - return (-xsconstants.XSERR_POLICY_TYPE_UNSUPPORTED, "", "", 0) + # Check that all used resources are accessible under the + # new label + if not is_policy_update and \ + not security.resources_compatible_with_vmlabel(xspol, + self, label): + return (-xsconstants.XSERR_BAD_LABEL, "", "", 0) - if rc == 0: - # HALTED, RUNNING or PAUSED - if domid == 0: - if xspol: - self.info['security_label'] = seclab - ssidref = poladmin.set_domain0_bootlabel(xspol, label) + #Check label against expected one. 
Can only do this + # if the policy hasn't changed underneath in the meantime + if xspol_old == None: + old_label = self.get_security_label() + if old_label != old_seclab: + log.info("old_label != old_seclab: %s != %s" % + (old_label, old_seclab)) + return (-xsconstants.XSERR_BAD_LABEL, "", "", 0) + + # relabel domain in the hypervisor + rc, errors = security.relabel_domains([[domid, new_ssidref]]) + log.info("rc from relabeling in HV: %d" % rc) else: - return (-xsconstants.XSERR_POLICY_NOT_LOADED, "", "", 0) - else: - if self.info.has_key('security_label'): - old_label = self.info['security_label'] - # Check label against expected one, unless wildcard - if old_label != old_seclab: - return (-xsconstants.XSERR_BAD_LABEL, "", "", 0) + return (-xsconstants.XSERR_POLICY_TYPE_UNSUPPORTED, "", "", 0) + + if rc == 0: + # HALTED, RUNNING or PAUSED + if domid == 0: + if xspol: + self.info['security_label'] = seclab + ssidref = poladmin.set_domain0_bootlabel(xspol, label) + else: + return (-xsconstants.XSERR_POLICY_NOT_LOADED, "", "", 0) + else: + if self.info.has_key('security_label'): + old_label = self.info['security_label'] + # Check label against expected one, unless wildcard + if old_label != old_seclab: + return (-xsconstants.XSERR_BAD_LABEL, "", "", 0) - self.info['security_label'] = seclab + self.info['security_label'] = seclab - try: - xen.xend.XendDomain.instance().managed_config_save(self) - except: - pass - return (rc, errors, old_label, new_ssidref) + try: + xen.xend.XendDomain.instance().managed_config_save(self) + except: + pass + return (rc, errors, old_label, new_ssidref) + finally: + xen.xend.XendDomain.instance().policy_lock.release() def get_on_shutdown(self): after_shutdown = self.info.get('actions_after_shutdown') diff --git a/tools/python/xen/xend/osdep.py b/tools/python/xen/xend/osdep.py index a026c85277..6636797b44 100644 --- a/tools/python/xen/xend/osdep.py +++ b/tools/python/xen/xend/osdep.py @@ -38,7 +38,10 @@ _vif_script = { "SunOS": "vif-vnic" 
} -def _linux_balloon_stat(label): +PROC_XEN_BALLOON = '/proc/xen/balloon' +SYSFS_XEN_MEMORY = '/sys/devices/system/xen_memory/xen_memory0' + +def _linux_balloon_stat_proc(label): """Returns the value for the named label, or None if an error occurs.""" xend2linux_labels = { 'current' : 'Current allocation', @@ -47,7 +50,6 @@ def _linux_balloon_stat(label): 'high-balloon' : 'High-mem balloon', 'limit' : 'Xen hard limit' } - PROC_XEN_BALLOON = '/proc/xen/balloon' f = file(PROC_XEN_BALLOON, 'r') try: for line in f: @@ -62,6 +64,29 @@ def _linux_balloon_stat(label): finally: f.close() +def _linux_balloon_stat_sysfs(label): + sysfiles = { 'target' : 'target_kb', + 'current' : 'info/current_kb', + 'low-balloon' : 'info/low_kb', + 'high-balloon' : 'info/high_kb', + 'limit' : 'info/hard_limit_kb' } + + name = os.path.join(SYSFS_XEN_MEMORY, sysfiles[label]) + f = file(name, 'r') + + val = f.read().strip() + if val.isdigit(): + return int(val) + return None + +def _linux_balloon_stat(label): + if os.access(PROC_XEN_BALLOON, os.F_OK): + return _linux_balloon_stat_proc(label) + elif os.access(SYSFS_XEN_MEMORY, os.F_OK): + return _linux_balloon_stat_sysfs(label) + + return None + def _solaris_balloon_stat(label): """Returns the value for the named label, or None if an error occurs.""" diff --git a/tools/python/xen/xend/server/pciif.py b/tools/python/xen/xend/server/pciif.py index d8df297f80..1051450a08 100644 --- a/tools/python/xen/xend/server/pciif.py +++ b/tools/python/xen/xend/server/pciif.py @@ -35,6 +35,8 @@ import resource import re from xen.xend.server.pciquirk import * +from xen.xend.xenstore.xstransact import xstransact +from xen.xend.xenstore.xswatch import xswatch xc = xen.lowlevel.xc.xc() @@ -58,6 +60,7 @@ def parse_hex(val): class PciController(DevController): def __init__(self, vm): + self.aerStateWatch = None DevController.__init__(self, vm) @@ -333,12 +336,6 @@ class PciController(DevController): if rc<0: raise VmError(('pci: failed to configure I/O memory on 
device '+ '%s - errno=%d')%(dev.name,rc)) - rc = xc.physdev_map_pirq(domid = fe_domid, - index = dev.irq, - pirq = dev.irq) - if rc < 0: - raise VmError(('pci: failed to map irq on device '+ - '%s - errno=%d')%(dev.name,rc)) if dev.msix: for (start, size) in dev.msix_iomem: @@ -353,6 +350,12 @@ class PciController(DevController): if rc<0: raise VmError(('pci: failed to remove msi-x iomem')) + rc = xc.physdev_map_pirq(domid = fe_domid, + index = dev.irq, + pirq = dev.irq) + if rc < 0: + raise VmError(('pci: failed to map irq on device '+ + '%s - errno=%d')%(dev.name,rc)) if dev.irq>0: log.debug('pci: enabling irq %d'%dev.irq) rc = xc.domain_irq_permission(domid = fe_domid, pirq = dev.irq, @@ -431,9 +434,23 @@ class PciController(DevController): for (domain, bus, slot, func) in pci_dev_list: self.setupOneDevice(domain, bus, slot, func) - + wPath = '/local/domain/0/backend/pci/%u/0/aerState' % (self.getDomid()) + self.aerStatePath = xswatch(wPath, self._handleAerStateWatch) + log.debug('pci: register aer watch %s', wPath) return + def _handleAerStateWatch(self, _): + log.debug('XendDomainInfo.handleAerStateWatch') + if self.getDomid() == 0: + raise XendError('Domain 0 cannot be shutdown') + readPath = '/local/domain/0/backend/pci/%u/0/aerState' % (self.getDomid()) + action = xstransact.Read(readPath) + if action and action=='aerfail': + log.debug('shutdown domain because of aer handle error') + self.vm.shutdown('poweroff') + return True + + def cleanupOneDevice(self, domain, bus, slot, func): """ Detach I/O resources for device from frontend domain """ @@ -545,6 +562,22 @@ class PciController(DevController): return new_num_devs + def destroyDevice(self, devid, force): + DevController.destroyDevice(self, devid, True) + log.debug('pci: unregister aer watch') + self.unwatchAerState + + def unwatchAerState(self): + """Remove the watch on the domain's aerState node, if any.""" + try: + try: + if self.aerStateWatch: + self.aerStateWatch.unwatch() + finally: + 
self.aerStateWatch = None + except: + log.exception("Unwatching aerState failed.") + def waitForBackend(self,devid): return (0, "ok - no hotplug") diff --git a/tools/python/xen/xm/create.py b/tools/python/xen/xm/create.py index eb8f3e237c..3620e4968d 100644 --- a/tools/python/xen/xm/create.py +++ b/tools/python/xen/xm/create.py @@ -1202,8 +1202,9 @@ def make_domain(opts, config): except: server.xend.domain.destroy(dom) err("Failed to unpause domain %s" % dom) - opts.info("Started domain %s" % (dom)) - return int(sxp.child_value(dominfo, 'domid')) + domid = int(sxp.child_value(dominfo, 'domid')) + opts.info("Started domain %s (id=%d)" % (dom, domid)) + return domid def get_xauthority(): diff --git a/unmodified_drivers/linux-2.6/balloon/Kbuild b/unmodified_drivers/linux-2.6/balloon/Kbuild index bcc8b05207..316592d83a 100644 --- a/unmodified_drivers/linux-2.6/balloon/Kbuild +++ b/unmodified_drivers/linux-2.6/balloon/Kbuild @@ -4,6 +4,5 @@ obj-m = xen-balloon.o EXTRA_CFLAGS += -I$(M)/platform-pci -xen-balloon-objs = -xen-balloon-objs += balloon.o -xen-balloon-objs += sysfs.o +xen-balloon-y := balloon.o sysfs.o +xen-balloon-$(CONFIG_XEN_SCRUB_PAGES) += scrub.o diff --git a/unmodified_drivers/linux-2.6/mkbuildtree b/unmodified_drivers/linux-2.6/mkbuildtree index 9d0f04907e..3c1c799c85 100755 --- a/unmodified_drivers/linux-2.6/mkbuildtree +++ b/unmodified_drivers/linux-2.6/mkbuildtree @@ -53,6 +53,7 @@ i[34567]86|x86_64) ln -sf ${XL}/include/asm-x86/mach-xen/asm/synch_bitops*.h include/asm ln -sf ${XL}/include/asm-x86/mach-xen/asm/maddr*.h include/asm ln -sf ${XL}/include/asm-x86/mach-xen/asm/gnttab_dma.h include/asm + ln -sf ${XL}/arch/x86/lib/scrub.c balloon else if [ $uname = x86_64 ]; then mkdir -p include/asm-i386 diff --git a/unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c b/unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c index 08bf645ef7..ad667128a2 100644 --- a/unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c +++ 
b/unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c @@ -34,7 +34,11 @@ static void ap_suspend(void *_info) atomic_dec(&info->nr_spinning); } +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27) #define initiate_ap_suspend(i) smp_call_function(ap_suspend, i, 0, 0) +#else +#define initiate_ap_suspend(i) smp_call_function(ap_suspend, i, 0) +#endif #else /* !defined(CONFIG_SMP) */ diff --git a/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c b/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c index 2b35c5c757..e4a766a909 100644 --- a/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c +++ b/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c @@ -14,7 +14,11 @@ EXPORT_SYMBOL(system_state); void ctrl_alt_del(void) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27) kill_proc(1, SIGINT, 1); /* interrupt init */ +#else + kill_cad_pid(SIGINT, 1); +#endif } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8) diff --git a/xen/Rules.mk b/xen/Rules.mk index 36292c92e5..43a4f63249 100644 --- a/xen/Rules.mk +++ b/xen/Rules.mk @@ -69,6 +69,9 @@ CFLAGS-$(frame_pointer) += -fno-omit-frame-pointer -DCONFIG_FRAME_POINTER ifneq ($(max_phys_cpus),) CFLAGS-y += -DMAX_PHYS_CPUS=$(max_phys_cpus) endif +ifneq ($(max_phys_irqs),) +CFLAGS-y += -DMAX_PHYS_IRQS=$(max_phys_irqs) +endif AFLAGS-y += -D__ASSEMBLY__ diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile index eb9dd08f47..60ec345073 100644 --- a/xen/arch/x86/Makefile +++ b/xen/arch/x86/Makefile @@ -11,6 +11,7 @@ subdir-$(x86_64) += x86_64 obj-y += apic.o obj-y += bitops.o obj-y += clear_page.o +obj-y += copy_page.o obj-y += compat.o obj-y += delay.o obj-y += dmi_scan.o diff --git a/xen/arch/x86/acpi/boot.c b/xen/arch/x86/acpi/boot.c index 055e3d7b51..2ae8cc4157 100644 --- a/xen/arch/x86/acpi/boot.c +++ b/xen/arch/x86/acpi/boot.c @@ -601,7 +601,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) count = acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr, - 
NR_IRQ_VECTORS); + MAX_IRQ_SOURCES); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n"); @@ -623,7 +623,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) count = acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src, - NR_IRQ_VECTORS); + MAX_IRQ_SOURCES); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c index 7e96bfc796..ccad2ea5ee 100644 --- a/xen/arch/x86/acpi/power.c +++ b/xen/arch/x86/acpi/power.c @@ -30,6 +30,8 @@ #include <acpi/cpufreq/cpufreq.h> +uint32_t system_reset_counter = 1; + static char opt_acpi_sleep[20]; string_param("acpi_sleep", opt_acpi_sleep); @@ -75,19 +77,47 @@ static void device_power_up(void) static void freeze_domains(void) { struct domain *d; + struct vcpu *v; + rcu_read_lock(&domlist_read_lock); for_each_domain ( d ) - if ( d->domain_id != 0 ) + { + switch ( d->domain_id ) + { + case 0: + for_each_vcpu ( d, v ) + if ( v != current ) + vcpu_pause(v); + break; + default: domain_pause(d); + break; + } + } + rcu_read_unlock(&domlist_read_lock); } static void thaw_domains(void) { struct domain *d; + struct vcpu *v; + rcu_read_lock(&domlist_read_lock); for_each_domain ( d ) - if ( d->domain_id != 0 ) + { + switch ( d->domain_id ) + { + case 0: + for_each_vcpu ( d, v ) + if ( v != current ) + vcpu_unpause(v); + break; + default: domain_unpause(d); + break; + } + } + rcu_read_unlock(&domlist_read_lock); } static void acpi_sleep_prepare(u32 state) @@ -163,6 +193,7 @@ static int enter_state(u32 state) { case ACPI_STATE_S3: do_suspend_lowlevel(); + system_reset_counter++; break; case ACPI_STATE_S5: acpi_enter_sleep_state(ACPI_STATE_S5); diff --git a/xen/arch/x86/copy_page.S b/xen/arch/x86/copy_page.S new file mode 100644 index 0000000000..2fd3e533c6 --- /dev/null +++ b/xen/arch/x86/copy_page.S @@ -0,0 +1,66 @@ +#include <xen/config.h> +#include <asm/page.h> + 
+#ifdef __i386__ +#define src_reg %esi +#define dst_reg %edi +#define WORD_SIZE 4 +#define tmp1_reg %eax +#define tmp2_reg %edx +#define tmp3_reg %ebx +#define tmp4_reg %ebp +#else +#define src_reg %rsi +#define dst_reg %rdi +#define WORD_SIZE 8 +#define tmp1_reg %r8 +#define tmp2_reg %r9 +#define tmp3_reg %r10 +#define tmp4_reg %r11 +#endif + +ENTRY(copy_page_sse2) +#ifdef __i386__ + push %ebx + push %ebp + push %esi + push %edi + mov 6*4(%esp), src_reg + mov 5*4(%esp), dst_reg +#endif + mov $PAGE_SIZE/(4*WORD_SIZE)-3, %ecx + + prefetchnta 2*4*WORD_SIZE(src_reg) + mov (src_reg), tmp1_reg + mov WORD_SIZE(src_reg), tmp2_reg + mov 2*WORD_SIZE(src_reg), tmp3_reg + mov 3*WORD_SIZE(src_reg), tmp4_reg + +0: prefetchnta 3*4*WORD_SIZE(src_reg) +1: add $4*WORD_SIZE, src_reg + movnti tmp1_reg, (dst_reg) + mov (src_reg), tmp1_reg + dec %ecx + movnti tmp2_reg, WORD_SIZE(dst_reg) + mov WORD_SIZE(src_reg), tmp2_reg + movnti tmp3_reg, 2*WORD_SIZE(dst_reg) + mov 2*WORD_SIZE(src_reg), tmp3_reg + movnti tmp4_reg, 3*WORD_SIZE(dst_reg) + lea 4*WORD_SIZE(dst_reg), dst_reg + mov 3*WORD_SIZE(src_reg), tmp4_reg + jg 0b + jpe 1b + + movnti tmp1_reg, (dst_reg) + movnti tmp2_reg, WORD_SIZE(dst_reg) + movnti tmp3_reg, 2*WORD_SIZE(dst_reg) + movnti tmp4_reg, 3*WORD_SIZE(dst_reg) + +#ifdef __i386__ + pop %edi + pop %esi + pop %ebp + pop %ebx +#endif + sfence + ret diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c index 42c6dc1210..2fcdee836a 100644 --- a/xen/arch/x86/cpu/common.c +++ b/xen/arch/x86/cpu/common.c @@ -564,7 +564,10 @@ void __cpuinit cpu_init(void) { int cpu = smp_processor_id(); struct tss_struct *t = &init_tss[cpu]; - char gdt_load[10]; + struct desc_ptr gdt_desc = { + .base = (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY), + .limit = LAST_RESERVED_GDT_BYTE + }; if (cpu_test_and_set(cpu, cpu_initialized)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); @@ -578,9 +581,7 @@ void __cpuinit cpu_init(void) /* Install correct page table. 
*/ write_ptbase(current); - *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE; - *(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(current); - asm volatile ( "lgdt %0" : "=m" (gdt_load) ); + asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); /* No nested task. */ asm volatile ("pushf ; andw $0xbfff,(%"__OP"sp) ; popf" ); diff --git a/xen/arch/x86/cpu/mcheck/p4.c b/xen/arch/x86/cpu/mcheck/p4.c index 0b82e65196..65daca06ef 100644 --- a/xen/arch/x86/cpu/mcheck/p4.c +++ b/xen/arch/x86/cpu/mcheck/p4.c @@ -51,7 +51,7 @@ static void intel_thermal_interrupt(struct cpu_user_regs *regs) ack_APIC_irq(); - if (NOW() > next[cpu]) + if (NOW() < next[cpu]) return; next[cpu] = NOW() + MILLISECS(5000); diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index b24f04d2bb..6e6e9ebc21 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -184,7 +184,8 @@ static int setup_compat_l4(struct vcpu *v) /* This page needs to look like a pagetable so that it can be shadowed */ pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1; - l4tab = copy_page(page_to_virt(pg), idle_pg_table); + l4tab = page_to_virt(pg); + copy_page(l4tab, idle_pg_table); l4tab[0] = l4e_empty(); l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] = l4e_from_page(pg, __PAGE_HYPERVISOR); @@ -310,12 +311,7 @@ int vcpu_initialise(struct vcpu *v) if ( is_idle_domain(d) ) { v->arch.schedule_tail = continue_idle_domain; - if ( v->vcpu_id ) - v->arch.cr3 = d->vcpu[0]->arch.cr3; - else if ( !*idle_vcpu ) - v->arch.cr3 = __pa(idle_pg_table); - else if ( !(v->arch.cr3 = clone_idle_pagetable(v)) ) - return -ENOMEM; + v->arch.cr3 = __pa(idle_pg_table); } v->arch.guest_context.ctrlreg[4] = @@ -1172,14 +1168,18 @@ static void paravirt_ctxt_switch_to(struct vcpu *v) } } +static inline int need_full_gdt(struct vcpu *v) +{ + return (!is_hvm_vcpu(v) && !is_idle_vcpu(v)); +} + static void __context_switch(void) { struct cpu_user_regs *stack_regs = guest_cpu_user_regs(); - unsigned int i, cpu = smp_processor_id(); + 
unsigned int cpu = smp_processor_id(); struct vcpu *p = per_cpu(curr_vcpu, cpu); struct vcpu *n = current; struct desc_struct *gdt; - struct page_info *page; struct desc_ptr gdt_desc; ASSERT(p != n); @@ -1208,16 +1208,19 @@ static void __context_switch(void) gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) : per_cpu(compat_gdt_table, cpu); - page = virt_to_page(gdt); - for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i) + if ( need_full_gdt(n) ) { - l1e_write(n->domain->arch.mm_perdomain_pt + - (n->vcpu_id << GDT_LDT_VCPU_SHIFT) + - FIRST_RESERVED_GDT_PAGE + i, - l1e_from_page(page + i, __PAGE_HYPERVISOR)); + struct page_info *page = virt_to_page(gdt); + unsigned int i; + for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ ) + l1e_write(n->domain->arch.mm_perdomain_pt + + (n->vcpu_id << GDT_LDT_VCPU_SHIFT) + + FIRST_RESERVED_GDT_PAGE + i, + l1e_from_page(page + i, __PAGE_HYPERVISOR)); } - if ( p->vcpu_id != n->vcpu_id ) + if ( need_full_gdt(p) && + ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(n)) ) { gdt_desc.limit = LAST_RESERVED_GDT_BYTE; gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY); @@ -1226,8 +1229,10 @@ static void __context_switch(void) write_ptbase(n); - if ( p->vcpu_id != n->vcpu_id ) + if ( need_full_gdt(n) && + ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(p)) ) { + gdt_desc.limit = LAST_RESERVED_GDT_BYTE; gdt_desc.base = GDT_VIRT_START(n); asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); } diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c index e026662b02..1c140c5799 100644 --- a/xen/arch/x86/domain_build.c +++ b/xen/arch/x86/domain_build.c @@ -194,30 +194,6 @@ static void __init process_dom0_ioports_disable(void) } } -/* We run on dom0's page tables for the final part of the build process. 
*/ -static void dom0_pt_enter(struct vcpu *v) -{ - struct desc_ptr gdt_desc = { - .limit = LAST_RESERVED_GDT_BYTE, - .base = (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY) - }; - - asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); - write_ptbase(v); -} - -/* Return to idle domain's page tables. */ -static void dom0_pt_exit(void) -{ - struct desc_ptr gdt_desc = { - .limit = LAST_RESERVED_GDT_BYTE, - .base = GDT_VIRT_START(current) - }; - - write_ptbase(current); - asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); -} - int __init construct_dom0( struct domain *d, unsigned long _image_start, unsigned long image_len, @@ -479,8 +455,9 @@ int __init construct_dom0( /* WARNING: The new domain must have its 'processor' field filled in! */ l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE; l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE; - memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE); - for (i = 0; i < 4; i++) { + for (i = 0; i < L3_PAGETABLE_ENTRIES; i++) { + copy_page(l2tab + i * L2_PAGETABLE_ENTRIES, + idle_pg_table_l2 + i * L2_PAGETABLE_ENTRIES); l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT); l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] = l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR); @@ -729,7 +706,8 @@ int __init construct_dom0( else update_cr3(v); - dom0_pt_enter(v); + /* We run on dom0's page tables for the final part of the build process. */ + write_ptbase(v); /* Copy the OS image and free temporary buffer. */ elf.dest = (void*)vkern_start; @@ -741,11 +719,11 @@ int __init construct_dom0( (parms.virt_hypercall >= v_end) ) { write_ptbase(current); - local_irq_enable(); printk("Invalid HYPERCALL_PAGE field in ELF notes.\n"); return -1; } - hypercall_page_initialise(d, (void *)(unsigned long)parms.virt_hypercall); + hypercall_page_initialise( + d, (void *)(unsigned long)parms.virt_hypercall); } /* Copy the initial ramdisk. 
*/ @@ -826,7 +804,8 @@ int __init construct_dom0( xlat_start_info(si, XLAT_start_info_console_dom0); #endif - dom0_pt_exit(); + /* Return to idle domain's page tables. */ + write_ptbase(current); #if defined(__i386__) /* Destroy low mappings - they were only for our convenience. */ diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c index a145583137..7531cfddff 100644 --- a/xen/arch/x86/domctl.c +++ b/xen/arch/x86/domctl.c @@ -1074,11 +1074,24 @@ void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c) if ( is_hvm_vcpu(v) ) { + struct segment_register sreg; memset(c.nat->ctrlreg, 0, sizeof(c.nat->ctrlreg)); c.nat->ctrlreg[0] = v->arch.hvm_vcpu.guest_cr[0]; c.nat->ctrlreg[2] = v->arch.hvm_vcpu.guest_cr[2]; c.nat->ctrlreg[3] = v->arch.hvm_vcpu.guest_cr[3]; c.nat->ctrlreg[4] = v->arch.hvm_vcpu.guest_cr[4]; + hvm_get_segment_register(v, x86_seg_cs, &sreg); + c.nat->user_regs.cs = sreg.sel; + hvm_get_segment_register(v, x86_seg_ss, &sreg); + c.nat->user_regs.ss = sreg.sel; + hvm_get_segment_register(v, x86_seg_ds, &sreg); + c.nat->user_regs.ds = sreg.sel; + hvm_get_segment_register(v, x86_seg_es, &sreg); + c.nat->user_regs.es = sreg.sel; + hvm_get_segment_register(v, x86_seg_fs, &sreg); + c.nat->user_regs.fs = sreg.sel; + hvm_get_segment_register(v, x86_seg_gs, &sreg); + c.nat->user_regs.gs = sreg.sel; } else { diff --git a/xen/arch/x86/hpet.c b/xen/arch/x86/hpet.c index 83ab2f1887..d5e84f4ccd 100644 --- a/xen/arch/x86/hpet.c +++ b/xen/arch/x86/hpet.c @@ -265,23 +265,20 @@ int hpet_legacy_irq_tick(void) u64 hpet_setup(void) { static u64 hpet_rate; - static int initialised; + static u32 system_reset_latch; u32 hpet_id, hpet_period, cfg; int i; - if ( initialised ) + if ( system_reset_latch == system_reset_counter ) return hpet_rate; - initialised = 1; - - if ( hpet_address == 0 ) - return 0; + system_reset_latch = system_reset_counter; set_fixmap_nocache(FIX_HPET_BASE, hpet_address); hpet_id = hpet_read32(HPET_ID); - if ( hpet_id == 0 ) + if ( (hpet_id & 
HPET_ID_REV) == 0 ) { - printk("BAD HPET vendor id.\n"); + printk("BAD HPET revision id.\n"); return 0; } @@ -299,9 +296,9 @@ u64 hpet_setup(void) for ( i = 0; i <= ((hpet_id >> 8) & 31); i++ ) { - cfg = hpet_read32(HPET_T0_CFG + i*0x20); + cfg = hpet_read32(HPET_Tn_CFG(i)); cfg &= ~HPET_TN_ENABLE; - hpet_write32(cfg & ~HPET_TN_ENABLE, HPET_T0_CFG); + hpet_write32(cfg, HPET_Tn_CFG(i)); } cfg = hpet_read32(HPET_CFG); diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index b87f953af1..9bda106ec2 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -1884,6 +1884,25 @@ static long hvm_memory_op(int cmd, XEN_GUEST_HANDLE(void) arg) return rc; } +static long hvm_vcpu_op( + int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg) +{ + long rc; + + switch ( cmd ) + { + case VCPUOP_register_runstate_memory_area: + case VCPUOP_get_runstate_info: + rc = do_vcpu_op(cmd, vcpuid, arg); + break; + default: + rc = -ENOSYS; + break; + } + + return rc; +} + typedef unsigned long hvm_hypercall_t( unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); @@ -1895,6 +1914,7 @@ typedef unsigned long hvm_hypercall_t( static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = { [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op, [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op, + [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op, HYPERCALL(xen_version), HYPERCALL(event_channel_op), HYPERCALL(sched_op), @@ -1911,9 +1931,29 @@ static long hvm_memory_op_compat32(int cmd, XEN_GUEST_HANDLE(void) arg) return rc; } +static long hvm_vcpu_op_compat32( + int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg) +{ + long rc; + + switch ( cmd ) + { + case VCPUOP_register_runstate_memory_area: + case VCPUOP_get_runstate_info: + rc = compat_vcpu_op(cmd, vcpuid, arg); + break; + default: + rc = -ENOSYS; + break; + } + + return rc; +} + static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = { [ __HYPERVISOR_memory_op ] = 
(hvm_hypercall_t *)hvm_memory_op, [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op, + [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op, HYPERCALL(xen_version), HYPERCALL(event_channel_op), HYPERCALL(sched_op), @@ -1923,6 +1963,7 @@ static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = { static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = { [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op_compat32, [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op, + [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op_compat32, HYPERCALL(xen_version), HYPERCALL(event_channel_op), HYPERCALL(sched_op), @@ -2081,7 +2122,7 @@ static int hvmop_set_pci_intx_level( void hvm_vcpu_reset_state(struct vcpu *v, uint16_t cs, uint16_t ip) { - struct domain *d = current->domain; + struct domain *d = v->domain; struct vcpu_guest_context *ctxt; struct segment_register reg; diff --git a/xen/arch/x86/hvm/mtrr.c b/xen/arch/x86/hvm/mtrr.c index 77f31a6f7f..d6692d2b3d 100644 --- a/xen/arch/x86/hvm/mtrr.c +++ b/xen/arch/x86/hvm/mtrr.c @@ -392,12 +392,16 @@ uint32_t get_pat_flags(struct vcpu *v, */ if ( pat_entry_value == INVALID_MEM_TYPE ) { - gdprintk(XENLOG_WARNING, - "Conflict occurs for a given guest l1e flags:%x " - "at %"PRIx64" (the effective mm type:%d), " - "because the host mtrr type is:%d\n", - gl1e_flags, (uint64_t)gpaddr, guest_eff_mm_type, - shadow_mtrr_type); + struct domain *d = v->domain; + p2m_type_t p2mt; + gfn_to_mfn(d, paddr_to_pfn(gpaddr), &p2mt); + if (p2m_is_ram(p2mt)) + gdprintk(XENLOG_WARNING, + "Conflict occurs for a given guest l1e flags:%x " + "at %"PRIx64" (the effective mm type:%d), " + "because the host mtrr type is:%d\n", + gl1e_flags, (uint64_t)gpaddr, guest_eff_mm_type, + shadow_mtrr_type); pat_entry_value = PAT_TYPE_UNCACHABLE; } /* 4. 
Get the pte flags */ diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index c635f5204a..499cd619d4 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -739,6 +739,23 @@ static void svm_inject_exception( struct vmcb_struct *vmcb = curr->arch.hvm_svm.vmcb; eventinj_t event = vmcb->eventinj; + switch ( trapnr ) + { + case TRAP_debug: + if ( guest_cpu_user_regs()->eflags & X86_EFLAGS_TF ) + { + __restore_debug_registers(curr); + vmcb->dr6 |= 0x4000; + } + case TRAP_int3: + if ( curr->domain->debugger_attached ) + { + /* Debug/Int3: Trap to debugger. */ + domain_pause_for_debugger(); + return; + } + } + if ( unlikely(event.fields.v) && (event.fields.type == X86_EVENTTYPE_HW_EXCEPTION) ) { @@ -765,13 +782,6 @@ static void svm_inject_exception( { HVMTRACE_2D(INJ_EXC, trapnr, errcode); } - - if ( (trapnr == TRAP_debug) && - (guest_cpu_user_regs()->eflags & X86_EFLAGS_TF) ) - { - __restore_debug_registers(curr); - vmcb->dr6 |= 0x4000; - } } static int svm_event_pending(struct vcpu *v) diff --git a/xen/arch/x86/hvm/vioapic.c b/xen/arch/x86/hvm/vioapic.c index 7250de3a7d..7f63699ab2 100644 --- a/xen/arch/x86/hvm/vioapic.c +++ b/xen/arch/x86/hvm/vioapic.c @@ -344,8 +344,8 @@ static void vioapic_deliver(struct hvm_hw_vioapic *vioapic, int irq) } else #endif - target = apic_round_robin(vioapic_domain(vioapic), - vector, deliver_bitmask); + target = apic_lowest_prio(vioapic_domain(vioapic), + deliver_bitmask); if ( target != NULL ) { ioapic_inj_irq(vioapic, target, vector, trig_mode, delivery_mode); diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c index d201af2848..68e9b27632 100644 --- a/xen/arch/x86/hvm/vlapic.c +++ b/xen/arch/x86/hvm/vlapic.c @@ -377,26 +377,30 @@ static int vlapic_accept_irq(struct vcpu *v, int delivery_mode, } /* This function is used by both ioapic and lapic.The bitmap is for vcpu_id. 
*/ -struct vlapic *apic_round_robin( - struct domain *d, uint8_t vector, uint32_t bitmap) +struct vlapic *apic_lowest_prio(struct domain *d, uint32_t bitmap) { - int next, old; - struct vlapic *target = NULL; + int old = d->arch.hvm_domain.irq.round_robin_prev_vcpu; + uint32_t ppr, target_ppr = UINT_MAX; + struct vlapic *vlapic, *target = NULL; + struct vcpu *v; - old = next = d->arch.hvm_domain.irq.round_robin_prev_vcpu; + if ( unlikely((v = d->vcpu[old]) == NULL) ) + return NULL; do { - if ( ++next == MAX_VIRT_CPUS ) - next = 0; - if ( (d->vcpu[next] == NULL) || !test_bit(next, &bitmap) ) - continue; - target = vcpu_vlapic(d->vcpu[next]); - if ( vlapic_enabled(target) ) - break; - target = NULL; - } while ( next != old ); + v = v->next_in_list ? : d->vcpu[0]; + vlapic = vcpu_vlapic(v); + if ( test_bit(v->vcpu_id, &bitmap) && vlapic_enabled(vlapic) && + ((ppr = vlapic_get_ppr(vlapic)) < target_ppr) ) + { + target = vlapic; + target_ppr = ppr; + } + } while ( v->vcpu_id != old ); - d->arch.hvm_domain.irq.round_robin_prev_vcpu = next; + if ( target != NULL ) + d->arch.hvm_domain.irq.round_robin_prev_vcpu = + vlapic_vcpu(target)->vcpu_id; return target; } @@ -456,7 +460,7 @@ int vlapic_ipi( if ( delivery_mode == APIC_DM_LOWEST ) { - target = apic_round_robin(vlapic_domain(v), vector, lpr_map); + target = apic_lowest_prio(vlapic_domain(v), lpr_map); if ( target != NULL ) rc = vlapic_accept_irq(vlapic_vcpu(target), delivery_mode, vector, level, trig_mode); diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c index 6eefb61bfa..cc9e9adde5 100644 --- a/xen/arch/x86/hvm/vmsi.c +++ b/xen/arch/x86/hvm/vmsi.c @@ -152,7 +152,7 @@ int vmsi_deliver(struct domain *d, int pirq) { case dest_LowestPrio: { - target = apic_round_robin(d, vector, deliver_bitmask); + target = apic_lowest_prio(d, deliver_bitmask); if ( target != NULL ) vmsi_inj_irq(d, target, vector, trig_mode, delivery_mode); else diff --git a/xen/arch/x86/hvm/vmx/intr.c b/xen/arch/x86/hvm/vmx/intr.c index 
e62c8459f9..daf2e7bcf0 100644 --- a/xen/arch/x86/hvm/vmx/intr.c +++ b/xen/arch/x86/hvm/vmx/intr.c @@ -140,12 +140,12 @@ asmlinkage void vmx_intr_assist(void) if ( intack.source == hvm_intsrc_nmi ) { - vmx_inject_nmi(v); + vmx_inject_nmi(); } else { HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0); - vmx_inject_extint(v, intack.vector); + vmx_inject_extint(intack.vector); pt_intr_post(v, intack); } diff --git a/xen/arch/x86/hvm/vmx/realmode.c b/xen/arch/x86/hvm/vmx/realmode.c index 5d13f4e60b..4af2848406 100644 --- a/xen/arch/x86/hvm/vmx/realmode.c +++ b/xen/arch/x86/hvm/vmx/realmode.c @@ -69,7 +69,8 @@ static void realmode_deliver_exception( frame[1] = csr->sel; frame[2] = regs->eflags & ~X86_EFLAGS_RF; - if ( hvmemul_ctxt->ctxt.addr_size == 32 ) + /* We can't test hvmemul_ctxt->ctxt.sp_size: it may not be initialised. */ + if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.db ) { regs->esp -= 6; pstk = regs->esp; @@ -148,17 +149,25 @@ static void realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt) hvmemul_ctxt->exn_insn_len = 0; } - if ( curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE ) + if ( unlikely(curr->domain->debugger_attached) && + ((hvmemul_ctxt->exn_vector == TRAP_debug) || + (hvmemul_ctxt->exn_vector == TRAP_int3)) ) + { + domain_pause_for_debugger(); + } + else if ( curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE ) { gdprintk(XENLOG_ERR, "Exception %02x in protected mode.\n", hvmemul_ctxt->exn_vector); goto fail; } - - realmode_deliver_exception( - hvmemul_ctxt->exn_vector, - hvmemul_ctxt->exn_insn_len, - hvmemul_ctxt); + else + { + realmode_deliver_exception( + hvmemul_ctxt->exn_vector, + hvmemul_ctxt->exn_insn_len, + hvmemul_ctxt); + } } return; diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c index 8fdeb40008..794b9cdeea 100644 --- a/xen/arch/x86/hvm/vmx/vmcs.c +++ b/xen/arch/x86/hvm/vmx/vmcs.c @@ -444,6 +444,8 @@ static void vmx_set_host_env(struct vcpu *v) { unsigned int cpu = smp_processor_id(); + 
__vmwrite(HOST_GDTR_BASE, + (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY)); __vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]); __vmwrite(HOST_TR_SELECTOR, TSS_ENTRY << 3); @@ -541,9 +543,6 @@ static int construct_vmcs(struct vcpu *v) __vmwrite(IO_BITMAP_A, virt_to_maddr((char *)hvm_io_bitmap + 0)); __vmwrite(IO_BITMAP_B, virt_to_maddr((char *)hvm_io_bitmap + PAGE_SIZE)); - /* Host GDTR base. */ - __vmwrite(HOST_GDTR_BASE, GDT_VIRT_START(v)); - /* Host data selectors. */ __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS); __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS); diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index fb12efe46a..4156f02ad1 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -187,7 +187,7 @@ static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs) check_long_mode: if ( !(hvm_long_mode_enabled(v)) ) { - vmx_inject_hw_exception(v, TRAP_gp_fault, 0); + vmx_inject_hw_exception(TRAP_gp_fault, 0); return HNDL_exception_raised; } break; @@ -284,7 +284,7 @@ static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs) uncanonical_address: HVM_DBG_LOG(DBG_LEVEL_0, "Not cano address of msr write %x", ecx); gp_fault: - vmx_inject_hw_exception(v, TRAP_gp_fault, 0); + vmx_inject_hw_exception(TRAP_gp_fault, 0); exception_raised: return HNDL_exception_raised; } @@ -1094,8 +1094,7 @@ void ept_sync_domain(struct domain *d) } } -static void __vmx_inject_exception( - struct vcpu *v, int trap, int type, int error_code) +static void __vmx_inject_exception(int trap, int type, int error_code) { unsigned long intr_fields; @@ -1114,17 +1113,29 @@ static void __vmx_inject_exception( } __vmwrite(VM_ENTRY_INTR_INFO, intr_fields); - - if ( trap == TRAP_page_fault ) - HVMTRACE_LONG_2D(PF_INJECT, error_code, - TRC_PAR_LONG(v->arch.hvm_vcpu.guest_cr[2])); - else - HVMTRACE_2D(INJ_EXC, trap, error_code); } -void vmx_inject_hw_exception(struct vcpu *v, int trap, int error_code) 
+void vmx_inject_hw_exception(int trap, int error_code) { unsigned long intr_info = __vmread(VM_ENTRY_INTR_INFO); + struct vcpu *curr = current; + + switch ( trap ) + { + case TRAP_debug: + if ( guest_cpu_user_regs()->eflags & X86_EFLAGS_TF ) + { + __restore_debug_registers(curr); + write_debugreg(6, read_debugreg(6) | 0x4000); + } + case TRAP_int3: + if ( curr->domain->debugger_attached ) + { + /* Debug/Int3: Trap to debugger. */ + domain_pause_for_debugger(); + return; + } + } if ( unlikely(intr_info & INTR_INFO_VALID_MASK) && (((intr_info >> 8) & 7) == X86_EVENTTYPE_HW_EXCEPTION) ) @@ -1134,37 +1145,34 @@ void vmx_inject_hw_exception(struct vcpu *v, int trap, int error_code) error_code = 0; } - __vmx_inject_exception(v, trap, X86_EVENTTYPE_HW_EXCEPTION, error_code); + __vmx_inject_exception(trap, X86_EVENTTYPE_HW_EXCEPTION, error_code); + + if ( trap == TRAP_page_fault ) + HVMTRACE_LONG_2D(PF_INJECT, error_code, + TRC_PAR_LONG(current->arch.hvm_vcpu.guest_cr[2])); + else + HVMTRACE_2D(INJ_EXC, trap, error_code); } -void vmx_inject_extint(struct vcpu *v, int trap) +void vmx_inject_extint(int trap) { - __vmx_inject_exception(v, trap, X86_EVENTTYPE_EXT_INTR, + __vmx_inject_exception(trap, X86_EVENTTYPE_EXT_INTR, HVM_DELIVER_NO_ERROR_CODE); } -void vmx_inject_nmi(struct vcpu *v) +void vmx_inject_nmi(void) { - __vmx_inject_exception(v, 2, X86_EVENTTYPE_NMI, + __vmx_inject_exception(2, X86_EVENTTYPE_NMI, HVM_DELIVER_NO_ERROR_CODE); } static void vmx_inject_exception( unsigned int trapnr, int errcode, unsigned long cr2) { - struct vcpu *curr = current; - - vmx_inject_hw_exception(curr, trapnr, errcode); - if ( trapnr == TRAP_page_fault ) - curr->arch.hvm_vcpu.guest_cr[2] = cr2; + current->arch.hvm_vcpu.guest_cr[2] = cr2; - if ( (trapnr == TRAP_debug) && - (guest_cpu_user_regs()->eflags & X86_EFLAGS_TF) ) - { - __restore_debug_registers(curr); - write_debugreg(6, read_debugreg(6) | 0x4000); - } + vmx_inject_hw_exception(trapnr, errcode); } static int 
vmx_event_pending(struct vcpu *v) @@ -1315,7 +1323,7 @@ static void __update_guest_eip(unsigned long inst_len) } if ( regs->eflags & X86_EFLAGS_TF ) - vmx_inject_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE, 0); + vmx_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE); } static void vmx_fpu_dirty_intercept(void) @@ -1636,7 +1644,6 @@ static int vmx_msr_read_intercept(struct cpu_user_regs *regs) { u64 msr_content = 0; u32 ecx = regs->ecx, eax, edx; - struct vcpu *v = current; HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx); @@ -1712,7 +1719,7 @@ done: return X86EMUL_OKAY; gp_fault: - vmx_inject_hw_exception(v, TRAP_gp_fault, 0); + vmx_inject_hw_exception(TRAP_gp_fault, 0); return X86EMUL_EXCEPTION; } @@ -1849,7 +1856,7 @@ static int vmx_msr_write_intercept(struct cpu_user_regs *regs) if ( (rc < 0) || (vmx_add_host_load_msr(ecx) < 0) ) - vmx_inject_hw_exception(v, TRAP_machine_check, 0); + vmx_inject_hw_exception(TRAP_machine_check, 0); else { __vmwrite(GUEST_IA32_DEBUGCTL, msr_content); @@ -1889,7 +1896,7 @@ static int vmx_msr_write_intercept(struct cpu_user_regs *regs) return X86EMUL_OKAY; gp_fault: - vmx_inject_hw_exception(v, TRAP_gp_fault, 0); + vmx_inject_hw_exception(TRAP_gp_fault, 0); return X86EMUL_EXCEPTION; } @@ -2197,7 +2204,7 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs) } v->arch.hvm_vcpu.guest_cr[2] = exit_qualification; - vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code); + vmx_inject_hw_exception(TRAP_page_fault, regs->error_code); break; case TRAP_nmi: if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) != @@ -2317,7 +2324,7 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs) case EXIT_REASON_VMWRITE: case EXIT_REASON_VMXOFF: case EXIT_REASON_VMXON: - vmx_inject_hw_exception(v, TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE); + vmx_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE); break; case EXIT_REASON_TPR_BELOW_THRESHOLD: @@ -2326,7 +2333,7 @@ asmlinkage void vmx_vmexit_handler(struct 
cpu_user_regs *regs) case EXIT_REASON_IO_INSTRUCTION: case EXIT_REASON_APIC_ACCESS: if ( !handle_mmio() ) - hvm_inject_exception(TRAP_gp_fault, 0, 0); + vmx_inject_hw_exception(TRAP_gp_fault, 0); break; case EXIT_REASON_INVD: diff --git a/xen/arch/x86/hvm/vmx/vpmu_core2.c b/xen/arch/x86/hvm/vmx/vpmu_core2.c index 9e0822ffdf..ff1783b2c0 100644 --- a/xen/arch/x86/hvm/vmx/vpmu_core2.c +++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c @@ -335,7 +335,7 @@ static int core2_vpmu_do_wrmsr(struct cpu_user_regs *regs) case MSR_CORE_PERF_GLOBAL_STATUS: gdprintk(XENLOG_INFO, "Can not write readonly MSR: " "MSR_PERF_GLOBAL_STATUS(0x38E)!\n"); - vmx_inject_hw_exception(current, TRAP_gp_fault, 0); + vmx_inject_hw_exception(TRAP_gp_fault, 0); return 1; case MSR_IA32_PEBS_ENABLE: if ( msr_content & 1 ) diff --git a/xen/arch/x86/i8259.c b/xen/arch/x86/i8259.c index 555d937c4d..79317085b7 100644 --- a/xen/arch/x86/i8259.c +++ b/xen/arch/x86/i8259.c @@ -390,7 +390,7 @@ void __init init_IRQ(void) init_8259A(0); - for ( i = 0; i < NR_IRQS; i++ ) + for ( i = 0; i < NR_VECTORS; i++ ) { irq_desc[i].status = IRQ_DISABLED; irq_desc[i].handler = &no_irq_type; diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c index aa21f18104..15d2b6b851 100644 --- a/xen/arch/x86/io_apic.c +++ b/xen/arch/x86/io_apic.c @@ -665,7 +665,7 @@ static inline int IO_APIC_irq_trigger(int irq) } /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. 
*/ -u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; +u8 irq_vector[NR_IRQS] __read_mostly; int free_irq_vector(int vector) { @@ -686,7 +686,7 @@ int assign_irq_vector(int irq) static unsigned current_vector = FIRST_DYNAMIC_VECTOR; unsigned vector; - BUG_ON(irq >= NR_IRQ_VECTORS); + BUG_ON(irq >= NR_IRQS); spin_lock(&vector_lock); @@ -1547,20 +1547,10 @@ static struct hw_interrupt_type ioapic_level_type = { .set_affinity = set_ioapic_affinity_vector, }; -static void mask_msi_vector(unsigned int vector) -{ - mask_msi_irq(vector); -} - -static void unmask_msi_vector(unsigned int vector) -{ - unmask_msi_irq(vector); -} - static unsigned int startup_msi_vector(unsigned int vector) { dprintk(XENLOG_INFO, "startup msi vector %x\n", vector); - unmask_msi_irq(vector); + unmask_msi_vector(vector); return 0; } @@ -1576,13 +1566,13 @@ static void end_msi_vector(unsigned int vector) static void shutdown_msi_vector(unsigned int vector) { dprintk(XENLOG_INFO, "shutdown msi vector %x\n", vector); - mask_msi_irq(vector); + mask_msi_vector(vector); } static void set_msi_affinity_vector(unsigned int vector, cpumask_t cpu_mask) { set_native_irq_info(vector, cpu_mask); - set_msi_irq_affinity(vector, cpu_mask); + set_msi_affinity(vector, cpu_mask); } /* @@ -2196,7 +2186,7 @@ int ioapic_guest_write(unsigned long physbase, unsigned int reg, u32 val) if ( new_rte.vector >= FIRST_DYNAMIC_VECTOR ) new_irq = vector_irq[new_rte.vector]; - if ( (old_irq != new_irq) && (old_irq != -1) && IO_APIC_IRQ(old_irq) ) + if ( (old_irq != new_irq) && (old_irq >= 0) && IO_APIC_IRQ(old_irq) ) { if ( irq_desc[IO_APIC_VECTOR(old_irq)].action ) { @@ -2208,7 +2198,7 @@ int ioapic_guest_write(unsigned long physbase, unsigned int reg, u32 val) remove_pin_at_irq(old_irq, apic, pin); } - if ( (new_irq != -1) && IO_APIC_IRQ(new_irq) ) + if ( (new_irq >= 0) && IO_APIC_IRQ(new_irq) ) { if ( irq_desc[IO_APIC_VECTOR(new_irq)].action ) { diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c index efb73ad011..4e3bed2228 
100644 --- a/xen/arch/x86/irq.c +++ b/xen/arch/x86/irq.c @@ -24,7 +24,7 @@ int opt_noirqbalance = 0; boolean_param("noirqbalance", opt_noirqbalance); -irq_desc_t irq_desc[NR_IRQS]; +irq_desc_t irq_desc[NR_VECTORS]; static void __do_IRQ_guest(int vector); @@ -206,7 +206,7 @@ struct pending_eoi { static DEFINE_PER_CPU(struct pending_eoi, pending_eoi[NR_VECTORS]); #define pending_eoi_sp(p) ((p)[NR_VECTORS-1].vector) -static struct timer irq_guest_eoi_timer[NR_IRQS]; +static struct timer irq_guest_eoi_timer[NR_VECTORS]; static void irq_guest_eoi_timer_fn(void *data) { irq_desc_t *desc = data; @@ -463,14 +463,19 @@ int pirq_acktype(struct domain *d, int irq) /* * Edge-triggered IO-APIC and LAPIC interrupts need no final * acknowledgement: we ACK early during interrupt processing. - * MSIs are treated as edge-triggered interrupts. */ if ( !strcmp(desc->handler->typename, "IO-APIC-edge") || - !strcmp(desc->handler->typename, "local-APIC-edge") || - !strcmp(desc->handler->typename, "PCI-MSI") ) + !strcmp(desc->handler->typename, "local-APIC-edge") ) return ACKTYPE_NONE; /* + * MSIs are treated as edge-triggered interrupts, except + * when there is no proper way to mask them. + */ + if ( desc->handler == &pci_msi_type ) + return msi_maskable_irq(desc->msi_desc) ? ACKTYPE_NONE : ACKTYPE_EOI; + + /* * Level-triggered IO-APIC interrupts need to be acknowledged on the CPU * on which they were received. This is because we tickle the LAPIC to EOI. 
*/ @@ -765,15 +770,15 @@ int get_free_pirq(struct domain *d, int type, int index) if ( type == MAP_PIRQ_TYPE_GSI ) { - for ( i = 16; i < NR_PIRQS; i++ ) + for ( i = 16; i < NR_IRQS; i++ ) if ( !d->arch.pirq_vector[i] ) break; - if ( i == NR_PIRQS ) + if ( i == NR_IRQS ) return -ENOSPC; } else { - for ( i = NR_PIRQS - 1; i >= 16; i-- ) + for ( i = NR_IRQS - 1; i >= 16; i-- ) if ( !d->arch.pirq_vector[i] ) break; if ( i == 16 ) @@ -800,7 +805,7 @@ int map_domain_pirq( if ( !IS_PRIV(current->domain) ) return -EPERM; - if ( pirq < 0 || pirq >= NR_PIRQS || vector < 0 || vector >= NR_VECTORS ) + if ( pirq < 0 || pirq >= NR_IRQS || vector < 0 || vector >= NR_VECTORS ) { dprintk(XENLOG_G_ERR, "dom%d: invalid pirq %d or vector %d\n", d->domain_id, pirq, vector); @@ -857,7 +862,7 @@ int unmap_domain_pirq(struct domain *d, int pirq) int vector, ret = 0; bool_t forced_unbind; - if ( (pirq < 0) || (pirq >= NR_PIRQS) ) + if ( (pirq < 0) || (pirq >= NR_IRQS) ) return -EINVAL; if ( !IS_PRIV(current->domain) ) @@ -921,7 +926,7 @@ void free_domain_pirqs(struct domain *d) spin_lock(&d->event_lock); - for ( i = 0; i < NR_PIRQS; i++ ) + for ( i = 0; i < NR_IRQS; i++ ) if ( d->arch.pirq_vector[i] > 0 ) unmap_domain_pirq(d, i); @@ -1001,28 +1006,30 @@ __initcall(setup_dump_irqs); void fixup_irqs(cpumask_t map) { - unsigned int irq, sp; + unsigned int vector, sp; static int warned; irq_guest_action_t *action; struct pending_eoi *peoi; /* Direct all future interrupts away from this CPU. 
*/ - for ( irq = 0; irq < NR_IRQS; irq++ ) + for ( vector = 0; vector < NR_VECTORS; vector++ ) { cpumask_t mask; - if ( irq == 2 ) + if ( vector_to_irq(vector) == 2 ) continue; - cpus_and(mask, irq_desc[irq].affinity, map); + cpus_and(mask, irq_desc[vector].affinity, map); if ( any_online_cpu(mask) == NR_CPUS ) { - printk("Breaking affinity for irq %i\n", irq); + printk("Breaking affinity for vector %u (irq %i)\n", + vector, vector_to_irq(vector)); mask = map; } - if ( irq_desc[irq].handler->set_affinity ) - irq_desc[irq].handler->set_affinity(irq, mask); - else if ( irq_desc[irq].action && !(warned++) ) - printk("Cannot set affinity for irq %i\n", irq); + if ( irq_desc[vector].handler->set_affinity ) + irq_desc[vector].handler->set_affinity(vector, mask); + else if ( irq_desc[vector].action && !(warned++) ) + printk("Cannot set affinity for irq %u (irq %i)\n", + vector, vector_to_irq(vector)); } /* Service any interrupts that beat us in the re-direction race. */ @@ -1031,11 +1038,11 @@ void fixup_irqs(cpumask_t map) local_irq_disable(); /* Clean up cpu_eoi_map of every interrupt to exclude this CPU. 
*/ - for ( irq = 0; irq < NR_IRQS; irq++ ) + for ( vector = 0; vector < NR_VECTORS; vector++ ) { - if ( !(irq_desc[irq].status & IRQ_GUEST) ) + if ( !(irq_desc[vector].status & IRQ_GUEST) ) continue; - action = (irq_guest_action_t *)irq_desc[irq].action; + action = (irq_guest_action_t *)irq_desc[vector].action; cpu_clear(smp_processor_id(), action->cpu_eoi_map); } diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 97359e7e94..2adc1ed2c7 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -160,6 +160,9 @@ unsigned long total_pages; #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT) +int opt_allow_hugepage; +boolean_param("allowhugepage", opt_allow_hugepage); + #define l1_disallow_mask(d) \ ((d != dom_io) && \ (rangeset_is_empty((d)->iomem_caps) && \ @@ -586,6 +589,28 @@ static int get_page_and_type_from_pagenr(unsigned long page_nr, return rc; } +static int get_data_page( + struct page_info *page, struct domain *d, int writeable) +{ + int rc; + + if ( writeable ) + rc = get_page_and_type(page, d, PGT_writable_page); + else + rc = get_page(page, d); + + return rc; +} + +static void put_data_page( + struct page_info *page, int writeable) +{ + if ( writeable ) + put_page_and_type(page); + else + put_page(page); +} + /* * We allow root tables to map each other (a.k.a. linear page tables). It * needs some special care with reference counts and access permissions: @@ -700,10 +725,9 @@ get_page_from_l1e( * contribute to writeable mapping refcounts. (This allows the * qemu-dm helper process in dom0 to map the domain's memory without * messing up the count of "real" writable mappings.) */ - okay = (((l1f & _PAGE_RW) && - !(unlikely(paging_mode_external(d) && (d != curr->domain)))) - ? 
get_page_and_type(page, d, PGT_writable_page) - : get_page(page, d)); + okay = get_data_page( + page, d, + (l1f & _PAGE_RW) && !(paging_mode_external(d) && (d != curr->domain))); if ( !okay ) { MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte @@ -751,6 +775,7 @@ static int get_page_from_l2e( l2_pgentry_t l2e, unsigned long pfn, struct domain *d) { + unsigned long mfn = l2e_get_pfn(l2e); int rc; if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) @@ -762,10 +787,37 @@ get_page_from_l2e( return -EINVAL; } - rc = get_page_and_type_from_pagenr( - l2e_get_pfn(l2e), PGT_l1_page_table, d, 0, 0); - if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) ) - rc = 0; + if ( !(l2e_get_flags(l2e) & _PAGE_PSE) ) + { + rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, 0, 0); + if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) ) + rc = 0; + } + else if ( !opt_allow_hugepage || (mfn & (L1_PAGETABLE_ENTRIES-1)) ) + { + rc = -EINVAL; + } + else + { + unsigned long m = mfn; + int writeable = !!(l2e_get_flags(l2e) & _PAGE_RW); + + do { + rc = get_data_page(mfn_to_page(m), d, writeable); + if ( unlikely(!rc) ) + { + while ( m-- > mfn ) + put_data_page(mfn_to_page(m), writeable); + return -EINVAL; + } + } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) ); + +#ifdef __x86_64__ + map_pages_to_xen( + (unsigned long)mfn_to_virt(mfn), mfn, L1_PAGETABLE_ENTRIES, + PAGE_HYPERVISOR | l2e_get_flags(l2e)); +#endif + } return rc; } @@ -954,13 +1006,24 @@ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d) */ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) { - if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) && - (l2e_get_pfn(l2e) != pfn) ) + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) ) + return 1; + + if ( l2e_get_flags(l2e) & _PAGE_PSE ) + { + unsigned long mfn = l2e_get_pfn(l2e), m = mfn; + int writeable = l2e_get_flags(l2e) & _PAGE_RW; + ASSERT(opt_allow_hugepage && !(mfn & 
(L1_PAGETABLE_ENTRIES-1))); + do { + put_data_page(mfn_to_page(m), writeable); + } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) ); + } + else { put_page_and_type(l2e_get_page(l2e)); - return 0; } - return 1; + + return 0; } static int __put_page_type(struct page_info *, int preemptible); @@ -1541,6 +1604,7 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, struct domain *d = curr->domain; unsigned long mfn; struct page_info *l1pg = mfn_to_page(gl1mfn); + p2m_type_t p2mt; int rc = 1; page_lock(l1pg); @@ -1558,8 +1622,8 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, if ( l1e_get_flags(nl1e) & _PAGE_PRESENT ) { /* Translate foreign guest addresses. */ - mfn = gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e)); - if ( unlikely(mfn == INVALID_MFN) ) + mfn = mfn_x(gfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e), &p2mt)); + if ( !p2m_is_ram(p2mt) || unlikely(mfn == INVALID_MFN) ) return page_unlock(l1pg), 0; ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0); nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e)); @@ -3332,6 +3396,10 @@ int create_grant_host_mapping(uint64_t addr, unsigned long frame, if ( !(flags & GNTMAP_readonly) ) l1e_add_flags(pte,_PAGE_RW); + l1e_add_flags(pte, + ((flags >> _GNTMAP_guest_avail0) * _PAGE_AVAIL0) + & _PAGE_AVAIL); + l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5)); if ( flags & GNTMAP_contains_pte ) @@ -4227,7 +4295,7 @@ int map_pages_to_xen( { if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL ) flush_flags |= FLUSH_TLB_GLOBAL; - if ( (l1f_to_lNf(l3e_get_flags(ol3e)) ^ flags) & + if ( (lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) & PAGE_CACHE_ATTRS ) flush_flags |= FLUSH_CACHE; flush_area(virt, flush_flags); diff --git a/xen/arch/x86/mm/Makefile b/xen/arch/x86/mm/Makefile index 79b25962ac..a399c8ed14 100644 --- a/xen/arch/x86/mm/Makefile +++ b/xen/arch/x86/mm/Makefile @@ -3,3 +3,9 @@ subdir-y += hap obj-y += paging.o obj-y += p2m.o +obj-y += guest_walk_2.o +obj-y += guest_walk_3.o +obj-$(x86_64) += guest_walk_4.o + 
+guest_walk_%.o: guest_walk.c $(HDRS) Makefile + $(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=$* -c $< -o $@ diff --git a/xen/arch/x86/mm/guest_walk.c b/xen/arch/x86/mm/guest_walk.c new file mode 100644 index 0000000000..5d532800aa --- /dev/null +++ b/xen/arch/x86/mm/guest_walk.c @@ -0,0 +1,260 @@ +/****************************************************************************** + * arch/x86/mm/guest_walk.c + * + * Pagetable walker for guest memory accesses. + * + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <xen/types.h> +#include <xen/mm.h> +#include <xen/paging.h> +#include <xen/domain_page.h> +#include <xen/sched.h> +#include <asm/page.h> +#include <asm/guest_pt.h> + + +/* Flags that are needed in a pagetable entry, with the sense of NX inverted */ +static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec) +{ + static uint32_t flags[] = { + /* I/F - Usr Wr */ + /* 0 0 0 0 */ _PAGE_PRESENT, + /* 0 0 0 1 */ _PAGE_PRESENT|_PAGE_RW, + /* 0 0 1 0 */ _PAGE_PRESENT|_PAGE_USER, + /* 0 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER, + /* 0 1 0 0 */ _PAGE_PRESENT, + /* 0 1 0 1 */ _PAGE_PRESENT|_PAGE_RW, + /* 0 1 1 0 */ _PAGE_PRESENT|_PAGE_USER, + /* 0 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER, + /* 1 0 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT, + /* 1 0 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT, + /* 1 0 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT, + /* 1 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT, + /* 1 1 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT, + /* 1 1 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT, + /* 1 1 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT, + /* 1 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT, + }; + + /* Don't demand not-NX if the CPU wouldn't enforce it. */ + if ( !guest_supports_nx(v) ) + pfec &= ~PFEC_insn_fetch; + + /* Don't demand R/W if the CPU wouldn't enforce it. */ + if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v)) + && !(pfec & PFEC_user_mode) ) + pfec &= ~PFEC_write_access; + + return flags[(pfec & 0x1f) >> 1]; +} + +/* Modify a guest pagetable entry to set the Accessed and Dirty bits. + * Returns non-zero if it actually writes to guest memory. 
*/ +static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty) +{ + guest_intpte_t old, new; + + old = *(guest_intpte_t *)walk_p; + new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0); + if ( old != new ) + { + /* Write the new entry into the walk, and try to write it back + * into the guest table as well. If the guest table has changed + * under out feet then leave it alone. */ + *(guest_intpte_t *)walk_p = new; + if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) + return 1; + } + return 0; +} + + +/* Walk the guest pagetables, after the manner of a hardware walker. */ +uint32_t +guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, + uint32_t pfec, mfn_t top_mfn, void *top_map) +{ + struct domain *d = v->domain; + p2m_type_t p2mt; + guest_l1e_t *l1p = NULL; + guest_l2e_t *l2p = NULL; +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ + guest_l3e_t *l3p = NULL; + guest_l4e_t *l4p; +#endif + uint32_t gflags, mflags, rc = 0; + int pse; + + perfc_incr(guest_walk); + memset(gw, 0, sizeof(*gw)); + gw->va = va; + + /* Mandatory bits that must be set in every entry. We invert NX, to + * calculate as if there were an "X" bit that allowed access. + * We will accumulate, in rc, the set of flags that are missing. */ + mflags = mandatory_flags(v, pfec); + +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... 
*/ + + /* Get the l4e from the top level table and check its flags*/ + gw->l4mfn = top_mfn; + l4p = (guest_l4e_t *) top_map; + gw->l4e = l4p[guest_l4_table_offset(va)]; + gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT; + rc |= ((gflags & mflags) ^ mflags); + if ( rc & _PAGE_PRESENT ) goto out; + + /* Map the l3 table */ + gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt); + if ( !p2m_is_ram(p2mt) ) + { + rc |= _PAGE_PRESENT; + goto out; + } + ASSERT(mfn_valid(mfn_x(gw->l3mfn))); + + /* Get the l3e and check its flags*/ + l3p = map_domain_page(mfn_x(gw->l3mfn)); + gw->l3e = l3p[guest_l3_table_offset(va)]; + gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT; + rc |= ((gflags & mflags) ^ mflags); + if ( rc & _PAGE_PRESENT ) + goto out; + +#else /* PAE only... */ + + /* Get the l3e and check its flag */ + gw->l3e = ((guest_l3e_t *) top_map)[guest_l3_table_offset(va)]; + if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) + { + rc |= _PAGE_PRESENT; + goto out; + } + +#endif /* PAE or 64... */ + + /* Map the l2 table */ + gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt); + if ( !p2m_is_ram(p2mt) ) + { + rc |= _PAGE_PRESENT; + goto out; + } + ASSERT(mfn_valid(mfn_x(gw->l2mfn))); + + /* Get the l2e */ + l2p = map_domain_page(mfn_x(gw->l2mfn)); + gw->l2e = l2p[guest_l2_table_offset(va)]; + +#else /* 32-bit only... */ + + /* Get l2e from the top level table */ + gw->l2mfn = top_mfn; + l2p = (guest_l2e_t *) top_map; + gw->l2e = l2p[guest_l2_table_offset(va)]; + +#endif /* All levels... */ + + gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT; + rc |= ((gflags & mflags) ^ mflags); + if ( rc & _PAGE_PRESENT ) + goto out; + + pse = (guest_supports_superpages(v) && + (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)); + + if ( pse ) + { + /* Special case: this guest VA is in a PSE superpage, so there's + * no guest l1e. We make one up so that the propagation code + * can generate a shadow l1 table. 
Start with the gfn of the + * first 4k-page of the superpage. */ + gfn_t start = guest_l2e_get_gfn(gw->l2e); + /* Grant full access in the l1e, since all the guest entry's + * access controls are enforced in the shadow l2e. */ + int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| + _PAGE_ACCESSED|_PAGE_DIRTY); + /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7 + * of the level 1. */ + if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) ) + flags |= _PAGE_PAT; + /* Copy the cache-control bits to the l1 as well, because we + * can't represent PAT in the (non-PSE) shadow l2e. :( + * This could cause problems if a guest ever maps an area of + * memory with superpages using more than one caching mode. */ + flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD); + /* Increment the pfn by the right number of 4k pages. + * The ~0x1 is to mask out the PAT bit mentioned above. */ + start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va)); + gw->l1e = guest_l1e_from_gfn(start, flags); + gw->l1mfn = _mfn(INVALID_MFN); + } + else + { + /* Not a superpage: carry on and find the l1e. */ + gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt); + if ( !p2m_is_ram(p2mt) ) + { + rc |= _PAGE_PRESENT; + goto out; + } + ASSERT(mfn_valid(mfn_x(gw->l1mfn))); + l1p = map_domain_page(mfn_x(gw->l1mfn)); + gw->l1e = l1p[guest_l1_table_offset(va)]; + gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT; + rc |= ((gflags & mflags) ^ mflags); + } + + /* Go back and set accessed and dirty bits only if the walk was a + * success. Although the PRMs say higher-level _PAGE_ACCESSED bits + * get set whenever a lower-level PT is used, at least some hardware + * walkers behave this way. */ + if ( rc == 0 ) + { +#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... 
*/ + if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) ) + paging_mark_dirty(d, mfn_x(gw->l4mfn)); + if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) ) + paging_mark_dirty(d, mfn_x(gw->l3mfn)); +#endif + if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e, + (pse && (pfec & PFEC_write_access))) ) + paging_mark_dirty(d, mfn_x(gw->l2mfn)); + if ( !pse ) + { + if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, + (pfec & PFEC_write_access)) ) + paging_mark_dirty(d, mfn_x(gw->l1mfn)); + } + } + + out: +#if GUEST_PAGING_LEVELS == 4 + if ( l3p ) unmap_domain_page(l3p); +#endif +#if GUEST_PAGING_LEVELS >= 3 + if ( l2p ) unmap_domain_page(l2p); +#endif + if ( l1p ) unmap_domain_page(l1p); + + return rc; +} diff --git a/xen/arch/x86/mm/hap/guest_walk.c b/xen/arch/x86/mm/hap/guest_walk.c index f1c54983d7..425031508d 100644 --- a/xen/arch/x86/mm/hap/guest_walk.c +++ b/xen/arch/x86/mm/hap/guest_walk.c @@ -19,160 +19,71 @@ * Place - Suite 330, Boston, MA 02111-1307 USA. 
*/ -#include <xen/config.h> -#include <xen/types.h> -#include <xen/mm.h> + #include <xen/domain_page.h> -#include <asm/page.h> -#include <xen/event.h> +#include <xen/paging.h> +#include <xen/config.h> #include <xen/sched.h> -#include <asm/hvm/svm/vmcb.h> -#include <asm/domain.h> -#include <asm/paging.h> -#include <asm/p2m.h> -#include <asm/hap.h> - -#include "private.h" #define _hap_gva_to_gfn(levels) hap_gva_to_gfn_##levels##level #define hap_gva_to_gfn(levels) _hap_gva_to_gfn(levels) -#if GUEST_PAGING_LEVELS > CONFIG_PAGING_LEVELS +#if GUEST_PAGING_LEVELS <= CONFIG_PAGING_LEVELS -unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)( - struct vcpu *v, unsigned long gva, uint32_t *pfec) -{ - gdprintk(XENLOG_ERR, - "Guest paging level is greater than host paging level!\n"); - domain_crash(v->domain); - return INVALID_GFN; -} - -#else - -#if GUEST_PAGING_LEVELS == 2 -#include "../page-guest32.h" -#define l1_pgentry_t l1_pgentry_32_t -#define l2_pgentry_t l2_pgentry_32_t -#undef l2e_get_flags -#define l2e_get_flags(x) l2e_get_flags_32(x) -#undef l1e_get_flags -#define l1e_get_flags(x) l1e_get_flags_32(x) -#endif +#include <asm/guest_pt.h> unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)( struct vcpu *v, unsigned long gva, uint32_t *pfec) { - unsigned long gcr3 = v->arch.hvm_vcpu.guest_cr[3]; - int mode = GUEST_PAGING_LEVELS; - int lev, index; - paddr_t gpa = 0; - unsigned long gpfn, mfn; + unsigned long cr3; + uint32_t missing; + mfn_t top_mfn; + void *top_map; p2m_type_t p2mt; - int success = 1; + walk_t gw; - l1_pgentry_t *l1e; - l2_pgentry_t *l2e; -#if GUEST_PAGING_LEVELS >= 3 - l3_pgentry_t *l3e; -#endif -#if GUEST_PAGING_LEVELS >= 4 - l4_pgentry_t *l4e; -#endif - - gpfn = (gcr3 >> PAGE_SHIFT); - for ( lev = mode; lev >= 1; lev-- ) + /* Get the top-level table's MFN */ + cr3 = v->arch.hvm_vcpu.guest_cr[3]; + top_mfn = gfn_to_mfn(v->domain, _gfn(cr3 >> PAGE_SHIFT), &p2mt); + if ( !p2m_is_ram(p2mt) ) { - mfn = mfn_x(gfn_to_mfn_current(gpfn, &p2mt)); - if ( 
!p2m_is_ram(p2mt) ) - { - HAP_PRINTK("bad pfn=0x%lx from gva=0x%lx at lev%d\n", gpfn, gva, - lev); - success = 0; - break; - } - ASSERT(mfn_valid(mfn)); - - index = (gva >> PT_SHIFT[mode][lev]) & (PT_ENTRIES[mode][lev]-1); - -#if GUEST_PAGING_LEVELS >= 4 - if ( lev == 4 ) - { - l4e = map_domain_page(mfn); - if ( !(l4e_get_flags(l4e[index]) & _PAGE_PRESENT) ) - { - HAP_PRINTK("Level 4 entry not present at index = %d\n", index); - success = 0; - } - gpfn = l4e_get_pfn(l4e[index]); - unmap_domain_page(l4e); - } -#endif + pfec[0] &= ~PFEC_page_present; + return INVALID_GFN; + } -#if GUEST_PAGING_LEVELS >= 3 - if ( lev == 3 ) - { - l3e = map_domain_page(mfn); + /* Map the top-level table and call the tree-walker */ + ASSERT(mfn_valid(mfn_x(top_mfn))); + top_map = map_domain_page(mfn_x(top_mfn)); #if GUEST_PAGING_LEVELS == 3 - index += ((gcr3 >> 5) & 127) * 4; -#endif - if ( !(l3e_get_flags(l3e[index]) & _PAGE_PRESENT) ) - { - HAP_PRINTK("Level 3 entry not present at index = %d\n", index); - success = 0; - } - gpfn = l3e_get_pfn(l3e[index]); - unmap_domain_page(l3e); - } + top_map += (cr3 & ~(PAGE_MASK | 31)); #endif + missing = guest_walk_tables(v, gva, &gw, pfec[0], top_mfn, top_map); + unmap_domain_page(top_map); + + /* Interpret the answer */ + if ( missing == 0 ) + return gfn_x(guest_l1e_get_gfn(gw.l1e)); + + if ( missing & _PAGE_PRESENT ) + pfec[0] &= ~PFEC_page_present; + + return INVALID_GFN; +} - if ( lev == 2 ) - { - l2e = map_domain_page(mfn); - if ( !(l2e_get_flags(l2e[index]) & _PAGE_PRESENT) ) - { - HAP_PRINTK("Level 2 entry not present at index = %d\n", index); - success = 0; - } - - if ( l2e_get_flags(l2e[index]) & _PAGE_PSE ) - { - paddr_t mask = ((paddr_t)1 << PT_SHIFT[mode][2]) - 1; - HAP_PRINTK("guest page table is PSE\n"); - gpa = (l2e_get_intpte(l2e[index]) & ~mask) + (gva & mask); - unmap_domain_page(l2e); - break; /* last level page table, jump out from here */ - } - - gpfn = l2e_get_pfn(l2e[index]); - unmap_domain_page(l2e); - } - - if ( lev == 1 
) - { - l1e = map_domain_page(mfn); - if ( !(l1e_get_flags(l1e[index]) & _PAGE_PRESENT) ) - { - HAP_PRINTK("Level 1 entry not present at index = %d\n", index); - success = 0; - } - gpfn = l1e_get_pfn(l1e[index]); - gpa = (l1e_get_intpte(l1e[index]) & PAGE_MASK) + (gva &~PAGE_MASK); - unmap_domain_page(l1e); - } - - if ( success != 1 ) /* error happened, jump out */ - break; - } - - gpa &= PADDR_MASK; - HAP_PRINTK("success = %d, gva = %lx, gpa = %lx\n", success, gva, gpa); +#else - return (!success ? INVALID_GFN : ((paddr_t)gpa >> PAGE_SHIFT)); +unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)( + struct vcpu *v, unsigned long gva, uint32_t *pfec) +{ + gdprintk(XENLOG_ERR, + "Guest paging level is greater than host paging level!\n"); + domain_crash(v->domain); + return INVALID_GFN; } #endif + /* * Local variables: * mode: C diff --git a/xen/arch/x86/mm/hap/private.h b/xen/arch/x86/mm/hap/private.h index 00bed88db4..7b06e7df63 100644 --- a/xen/arch/x86/mm/hap/private.h +++ b/xen/arch/x86/mm/hap/private.h @@ -20,9 +20,6 @@ #ifndef __HAP_PRIVATE_H__ #define __HAP_PRIVATE_H__ -#include <asm/flushtlb.h> -#include <asm/hvm/support.h> - /********************************************/ /* GUEST TRANSLATION FUNCS */ /********************************************/ @@ -33,36 +30,5 @@ unsigned long hap_gva_to_gfn_3level(struct vcpu *v, unsigned long gva, unsigned long hap_gva_to_gfn_4level(struct vcpu *v, unsigned long gva, uint32_t *pfec); -/********************************************/ -/* MISC DEFINITIONS */ -/********************************************/ - -/* PT_SHIFT describes the amount by which a virtual address is shifted right - * to right justify the portion to be used for indexing into a page - * table, given the guest memory model (i.e. number of levels) and the level - * of the page table being accessed. The idea is from Virtual Iron's code. 
- */ -static const int PT_SHIFT[][5] = - { /* ------ level ------ nr_levels */ - /* 1 2 3 4 */ - { 0, 0, 0, 0, 0}, /* 0 not used */ - { 0, 0, 0, 0, 0}, /* 1 not used */ - { 0, 12, 22, 0, 0}, /* 2 */ - { 0, 12, 21, 30, 0}, /* 3 */ - { 0, 12, 21, 30, 39} /* 4 */ - }; - -/* PT_ENTRIES describes the number of entries in a page table, given the - * memory model (i.e. number of levels) and the level of the page table - * being considered. This idea from Virtual Iron's shadow code*/ -static const int PT_ENTRIES[][5] = - { /* ------ level ------ nr_levels */ - /* 1 2 3 4 */ - { 0, 0, 0, 0, 0}, /* 0 not used */ - { 0, 0, 0, 0, 0}, /* 1 not used */ - { 0, 1024, 1024, 0, 0}, /* 2 */ - { 0, 512, 512, 4, 0}, /* 3 */ - { 0, 512, 512, 512, 512} /* 4 */ - }; #endif /* __SVM_NPT_H__ */ diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c index 93f9489559..d32498092e 100644 --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -748,7 +748,7 @@ static void audit_p2m(struct domain *d) if ( test_linear && (gfn <= d->arch.p2m->max_mapped_pfn) ) { - lp2mfn = mfn_x(gfn_to_mfn_current(gfn, &type)); + lp2mfn = mfn_x(gfn_to_mfn(d, gfn, &type)); if ( lp2mfn != mfn_x(p2mfn) ) { P2M_PRINTK("linear mismatch gfn %#lx -> mfn %#lx " diff --git a/xen/arch/x86/mm/page-guest32.h b/xen/arch/x86/mm/page-guest32.h deleted file mode 100644 index 5d333bd91b..0000000000 --- a/xen/arch/x86/mm/page-guest32.h +++ /dev/null @@ -1,100 +0,0 @@ - -#ifndef __X86_PAGE_GUEST_H__ -#define __X86_PAGE_GUEST_H__ - -#ifndef __ASSEMBLY__ -# include <asm/types.h> -#endif - -#define PAGETABLE_ORDER_32 10 -#define L1_PAGETABLE_ENTRIES_32 (1<<PAGETABLE_ORDER_32) -#define L2_PAGETABLE_ENTRIES_32 (1<<PAGETABLE_ORDER_32) -#define ROOT_PAGETABLE_ENTRIES_32 L2_PAGETABLE_ENTRIES_32 - - -#define L1_PAGETABLE_SHIFT_32 12 -#define L2_PAGETABLE_SHIFT_32 22 - -/* Extract flags into 12-bit integer, or turn 12-bit flags into a pte mask. 
*/ - -#ifndef __ASSEMBLY__ - -typedef u32 intpte_32_t; - -typedef struct { intpte_32_t l1; } l1_pgentry_32_t; -typedef struct { intpte_32_t l2; } l2_pgentry_32_t; -typedef l2_pgentry_t root_pgentry_32_t; -#endif - -#define get_pte_flags_32(x) ((u32)(x) & 0xFFF) -#define put_pte_flags_32(x) ((intpte_32_t)(x)) - -/* Get pte access flags (unsigned int). */ -#define l1e_get_flags_32(x) (get_pte_flags_32((x).l1)) -#define l2e_get_flags_32(x) (get_pte_flags_32((x).l2)) - -#define l1e_get_paddr_32(x) \ - ((paddr_t)(((x).l1 & (PADDR_MASK&PAGE_MASK)))) -#define l2e_get_paddr_32(x) \ - ((paddr_t)(((x).l2 & (PADDR_MASK&PAGE_MASK)))) - -/* Construct an empty pte. */ -#define l1e_empty_32() ((l1_pgentry_32_t) { 0 }) -#define l2e_empty_32() ((l2_pgentry_32_t) { 0 }) - -/* Construct a pte from a pfn and access flags. */ -#define l1e_from_pfn_32(pfn, flags) \ - ((l1_pgentry_32_t) { ((intpte_32_t)(pfn) << PAGE_SHIFT) | put_pte_flags_32(flags) }) -#define l2e_from_pfn_32(pfn, flags) \ - ((l2_pgentry_32_t) { ((intpte_32_t)(pfn) << PAGE_SHIFT) | put_pte_flags_32(flags) }) - -/* Construct a pte from a physical address and access flags. */ -#ifndef __ASSEMBLY__ -static inline l1_pgentry_32_t l1e_from_paddr_32(paddr_t pa, unsigned int flags) -{ - ASSERT((pa & ~(PADDR_MASK & PAGE_MASK)) == 0); - return (l1_pgentry_32_t) { pa | put_pte_flags_32(flags) }; -} -static inline l2_pgentry_32_t l2e_from_paddr_32(paddr_t pa, unsigned int flags) -{ - ASSERT((pa & ~(PADDR_MASK & PAGE_MASK)) == 0); - return (l2_pgentry_32_t) { pa | put_pte_flags_32(flags) }; -} -#endif /* !__ASSEMBLY__ */ - - -/* Construct a pte from a page pointer and access flags. */ -#define l1e_from_page_32(page, flags) (l1e_from_pfn_32(page_to_mfn(page),(flags))) -#define l2e_from_page_32(page, flags) (l2e_from_pfn_32(page_to_mfn(page),(flags))) - -/* Add extra flags to an existing pte. 
*/ -#define l1e_add_flags_32(x, flags) ((x).l1 |= put_pte_flags_32(flags)) -#define l2e_add_flags_32(x, flags) ((x).l2 |= put_pte_flags_32(flags)) - -/* Remove flags from an existing pte. */ -#define l1e_remove_flags_32(x, flags) ((x).l1 &= ~put_pte_flags_32(flags)) -#define l2e_remove_flags_32(x, flags) ((x).l2 &= ~put_pte_flags_32(flags)) - -/* Check if a pte's page mapping or significant access flags have changed. */ -#define l1e_has_changed_32(x,y,flags) \ - ( !!(((x).l1 ^ (y).l1) & ((PADDR_MASK&PAGE_MASK)|put_pte_flags_32(flags))) ) -#define l2e_has_changed_32(x,y,flags) \ - ( !!(((x).l2 ^ (y).l2) & ((PADDR_MASK&PAGE_MASK)|put_pte_flags_32(flags))) ) - -/* Given a virtual address, get an entry offset into a page table. */ -#define l1_table_offset_32(a) \ - (((a) >> L1_PAGETABLE_SHIFT_32) & (L1_PAGETABLE_ENTRIES_32 - 1)) -#define l2_table_offset_32(a) \ - (((a) >> L2_PAGETABLE_SHIFT_32) & (L2_PAGETABLE_ENTRIES_32 - 1)) - -#endif /* __X86_PAGE_GUEST_H__ */ - -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c index f3ac8bfc39..2f2e3bf29d 100644 --- a/xen/arch/x86/mm/shadow/multi.c +++ b/xen/arch/x86/mm/shadow/multi.c @@ -35,6 +35,7 @@ #include <asm/hvm/hvm.h> #include <asm/hvm/cacheattr.h> #include <asm/mtrr.h> +#include <asm/guest_pt.h> #include "private.h" #include "types.h" @@ -156,95 +157,23 @@ delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn) put_page(mfn_to_page(gmfn)); } -/**************************************************************************/ -/* CPU feature support querying */ - -static inline int -guest_supports_superpages(struct vcpu *v) -{ - /* The _PAGE_PSE bit must be honoured in HVM guests, whenever - * CR4.PSE is set or the guest is in PAE or long mode. - * It's also used in the dummy PT for vcpus with CR4.PG cleared. 
*/ - return (is_hvm_vcpu(v) && - (GUEST_PAGING_LEVELS != 2 - || !hvm_paging_enabled(v) - || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE))); -} - -static inline int -guest_supports_nx(struct vcpu *v) -{ - if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx ) - return 0; - if ( !is_hvm_vcpu(v) ) - return cpu_has_nx; - return hvm_nx_enabled(v); -} - /**************************************************************************/ /* Functions for walking the guest page tables */ -/* Flags that are needed in a pagetable entry, with the sense of NX inverted */ -static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec) -{ - static uint32_t flags[] = { - /* I/F - Usr Wr */ - /* 0 0 0 0 */ _PAGE_PRESENT, - /* 0 0 0 1 */ _PAGE_PRESENT|_PAGE_RW, - /* 0 0 1 0 */ _PAGE_PRESENT|_PAGE_USER, - /* 0 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER, - /* 0 1 0 0 */ _PAGE_PRESENT, - /* 0 1 0 1 */ _PAGE_PRESENT|_PAGE_RW, - /* 0 1 1 0 */ _PAGE_PRESENT|_PAGE_USER, - /* 0 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER, - /* 1 0 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT, - /* 1 0 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT, - /* 1 0 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT, - /* 1 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT, - /* 1 1 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT, - /* 1 1 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT, - /* 1 1 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT, - /* 1 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT, - }; - - /* Don't demand not-NX if the CPU wouldn't enforce it. */ - if ( !guest_supports_nx(v) ) - pfec &= ~PFEC_insn_fetch; - - /* Don't demand R/W if the CPU wouldn't enforce it. */ - if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v)) - && !(pfec & PFEC_user_mode) ) - pfec &= ~PFEC_write_access; - - return flags[(pfec & 0x1f) >> 1]; -} - -/* Modify a guest pagetable entry to set the Accessed and Dirty bits. - * Returns non-zero if it actually writes to guest memory. 
*/ -static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty) +static inline uint32_t +sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw, + uint32_t pfec) { - guest_intpte_t old, new; - int ret = 0; - - old = *(guest_intpte_t *)walk_p; - new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0); - if ( old != new ) - { - /* Write the new entry into the walk, and try to write it back - * into the guest table as well. If the guest table has changed - * under out feet then leave it alone. */ - *(guest_intpte_t *)walk_p = new; - if( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) - ret = 1; - - /* FIXME -- this code is longer than necessary */ - if(set_dirty) - TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_AD); - else - TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_A); - } - return ret; + return guest_walk_tables(v, va, gw, pfec, +#if GUEST_PAGING_LEVELS == 3 /* PAE */ + _mfn(INVALID_MFN), + v->arch.paging.shadow.gl3e +#else /* 32 or 64 */ + pagetable_get_mfn(v->arch.guest_table), + v->arch.paging.shadow.guest_vtable +#endif + ); } /* This validation is called with lock held, and after write permission @@ -254,7 +183,7 @@ static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty) * Return 1 to indicate success and 0 for inconsistency */ static inline uint32_t -shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw) +shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw, int version) { struct domain *d = v->domain; guest_l1e_t *l1p; @@ -267,9 +196,8 @@ shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw) ASSERT(shadow_locked_by_me(d)); - if ( gw->version == - atomic_read(&d->arch.paging.shadow.gtable_dirty_version) ) - return 1; + if ( version == atomic_read(&d->arch.paging.shadow.gtable_dirty_version) ) + return 1; /* We may consider caching guest page mapping from last * guest table walk. 
However considering this check happens @@ -364,239 +292,6 @@ gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw) return rc; } -/* Walk the guest pagetables, after the manner of a hardware walker. - * - * Inputs: a vcpu, a virtual address, a walk_t to fill, a - * pointer to a pagefault code - * - * We walk the vcpu's guest pagetables, filling the walk_t with what we - * see and adding any Accessed and Dirty bits that are needed in the - * guest entries. Using the pagefault code, we check the permissions as - * we go. For the purposes of reading pagetables we treat all non-RAM - * memory as contining zeroes. - * - * The walk is done in a lock-free style, with some sanity check postponed - * after grabbing shadow lock later. Those delayed checks will make sure - * no inconsistent mapping being translated into shadow page table. - * - * Returns 0 for success, or the set of permission bits that we failed on - * if the walk did not complete. - * N.B. This is different from the old return code but almost no callers - * checked the old return code anyway. - */ -static uint32_t -guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec) -{ - struct domain *d = v->domain; - p2m_type_t p2mt; - guest_l1e_t *l1p = NULL; - guest_l2e_t *l2p = NULL; -#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ - guest_l3e_t *l3p = NULL; - guest_l4e_t *l4p; -#endif - uint32_t gflags, mflags, rc = 0; - int pse; - - perfc_incr(shadow_guest_walk); - memset(gw, 0, sizeof(*gw)); - gw->va = va; - - gw->version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version); - rmb(); - - /* Mandatory bits that must be set in every entry. We invert NX, to - * calculate as if there were an "X" bit that allowed access. - * We will accumulate, in rc, the set of flags that are missing. */ - mflags = mandatory_flags(v, pfec); - -#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ -#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... 
*/ - - /* Get the l4e from the top level table and check its flags*/ - gw->l4mfn = pagetable_get_mfn(v->arch.guest_table); - l4p = ((guest_l4e_t *)v->arch.paging.shadow.guest_vtable); - gw->l4e = l4p[guest_l4_table_offset(va)]; - gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT; - rc |= ((gflags & mflags) ^ mflags); - if ( rc & _PAGE_PRESENT ) goto out; - - /* Map the l3 table */ - gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt); - if ( !p2m_is_ram(p2mt) ) - { - rc |= _PAGE_PRESENT; - goto out; - } - ASSERT(mfn_valid(gw->l3mfn)); - - /* Get the l3e and check its flags*/ - l3p = sh_map_domain_page(gw->l3mfn); - gw->l3e = l3p[guest_l3_table_offset(va)]; - gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT; - rc |= ((gflags & mflags) ^ mflags); - if ( rc & _PAGE_PRESENT ) - goto out; - -#else /* PAE only... */ - - /* Get l3e from the cache of the top level table and check its flag */ - gw->l3e = v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)]; - if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) - { - rc |= _PAGE_PRESENT; - goto out; - } - -#endif /* PAE or 64... */ - - /* Map the l2 table */ - gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt); - if ( !p2m_is_ram(p2mt) ) - { - rc |= _PAGE_PRESENT; - goto out; - } - ASSERT(mfn_valid(gw->l2mfn)); - - /* Get the l2e */ - l2p = sh_map_domain_page(gw->l2mfn); - gw->l2e = l2p[guest_l2_table_offset(va)]; - -#else /* 32-bit only... */ - - /* Get l2e from the top level table */ - gw->l2mfn = pagetable_get_mfn(v->arch.guest_table); - l2p = ((guest_l2e_t *)v->arch.paging.shadow.guest_vtable); - gw->l2e = l2p[guest_l2_table_offset(va)]; - -#endif /* All levels... */ - - gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT; - rc |= ((gflags & mflags) ^ mflags); - if ( rc & _PAGE_PRESENT ) - goto out; - - pse = (guest_supports_superpages(v) && - (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)); - - if ( pse ) - { - /* Special case: this guest VA is in a PSE superpage, so there's - * no guest l1e. 
We make one up so that the propagation code - * can generate a shadow l1 table. Start with the gfn of the - * first 4k-page of the superpage. */ - gfn_t start = guest_l2e_get_gfn(gw->l2e); - /* Grant full access in the l1e, since all the guest entry's - * access controls are enforced in the shadow l2e. */ - int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| - _PAGE_ACCESSED|_PAGE_DIRTY); - /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7 - * of the level 1. */ - if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) ) - flags |= _PAGE_PAT; - /* Copy the cache-control bits to the l1 as well, because we - * can't represent PAT in the (non-PSE) shadow l2e. :( - * This could cause problems if a guest ever maps an area of - * memory with superpages using more than one caching mode. */ - flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD); - /* Increment the pfn by the right number of 4k pages. - * The ~0x1 is to mask out the PAT bit mentioned above. */ - start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va)); - gw->l1e = guest_l1e_from_gfn(start, flags); - gw->l1mfn = _mfn(INVALID_MFN); - } - else - { - /* Not a superpage: carry on and find the l1e. */ - gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt); - if ( !p2m_is_ram(p2mt) ) - { - rc |= _PAGE_PRESENT; - goto out; - } - ASSERT(mfn_valid(gw->l1mfn)); - l1p = sh_map_domain_page(gw->l1mfn); - gw->l1e = l1p[guest_l1_table_offset(va)]; - gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT; - rc |= ((gflags & mflags) ^ mflags); - } - - /* Go back and set accessed and dirty bits only if the walk was a - * success. Although the PRMs say higher-level _PAGE_ACCESSED bits - * get set whenever a lower-level PT is used, at least some hardware - * walkers behave this way. */ - if ( rc == 0 ) - { -#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... 
*/ - if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) ) - paging_mark_dirty(d, mfn_x(gw->l4mfn)); - if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) ) - paging_mark_dirty(d, mfn_x(gw->l3mfn)); -#endif - if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e, - (pse && (pfec & PFEC_write_access))) ) - paging_mark_dirty(d, mfn_x(gw->l2mfn)); - if ( !pse ) - { - if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, - (pfec & PFEC_write_access)) ) - paging_mark_dirty(d, mfn_x(gw->l1mfn)); - } - } - - out: -#if GUEST_PAGING_LEVELS == 4 - if ( l3p ) sh_unmap_domain_page(l3p); -#endif -#if GUEST_PAGING_LEVELS >= 3 - if ( l2p ) sh_unmap_domain_page(l2p); -#endif - if ( l1p ) sh_unmap_domain_page(l1p); - - return rc; -} - -/* Given a walk_t, translate the gw->va into the guest's notion of the - * corresponding frame number. */ -static inline gfn_t -guest_walk_to_gfn(walk_t *gw) -{ - if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) ) - return _gfn(INVALID_GFN); - return guest_l1e_get_gfn(gw->l1e); -} - -/* Given a walk_t, translate the gw->va into the guest's notion of the - * corresponding physical address. */ -static inline paddr_t -guest_walk_to_gpa(walk_t *gw) -{ - if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) ) - return 0; - return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK); -} - -#if 0 /* Keep for debugging */ -/* Pretty-print the contents of a guest-walk */ -static inline void print_gw(walk_t *gw) -{ - SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va); -#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ -#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ - SHADOW_PRINTK(" l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn)); - SHADOW_PRINTK(" l4e=%" SH_PRI_gpte "\n", gw->l4e.l4); - SHADOW_PRINTK(" l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn)); -#endif /* PAE or 64... */ - SHADOW_PRINTK(" l3e=%" SH_PRI_gpte "\n", gw->l3e.l3); -#endif /* All levels... 
*/ - SHADOW_PRINTK(" l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn)); - SHADOW_PRINTK(" l2e=%" SH_PRI_gpte "\n", gw->l2e.l2); - SHADOW_PRINTK(" l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn)); - SHADOW_PRINTK(" l1e=%" SH_PRI_gpte "\n", gw->l1e.l1); -} -#endif /* 0 */ - #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES /* Lightweight audit: pass all the shadows associated with this guest walk * through the audit mechanisms */ @@ -657,7 +352,7 @@ sh_guest_map_l1e(struct vcpu *v, unsigned long addr, // XXX -- this is expensive, but it's easy to cobble together... // FIXME! - if ( guest_walk_tables(v, addr, &gw, PFEC_page_present) == 0 + if ( sh_walk_guest_tables(v, addr, &gw, PFEC_page_present) == 0 && mfn_valid(gw.l1mfn) ) { if ( gl1mfn ) @@ -679,7 +374,7 @@ sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e) // XXX -- this is expensive, but it's easy to cobble together... // FIXME! - (void) guest_walk_tables(v, addr, &gw, PFEC_page_present); + (void) sh_walk_guest_tables(v, addr, &gw, PFEC_page_present); *(guest_l1e_t *)eff_l1e = gw.l1e; } #endif /* CONFIG == GUEST (== SHADOW) */ @@ -1171,9 +866,6 @@ static int shadow_set_l4e(struct vcpu *v, domain_crash(v->domain); return SHADOW_SET_ERROR; } -#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) - shadow_resync_all(v, 0); -#endif } /* Write the new entry */ @@ -1219,9 +911,6 @@ static int shadow_set_l3e(struct vcpu *v, domain_crash(v->domain); return SHADOW_SET_ERROR; } -#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) - shadow_resync_all(v, 0); -#endif } /* Write the new entry */ @@ -2021,7 +1710,8 @@ static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v, static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v, walk_t *gw, mfn_t *sl3mfn, - fetch_type_t ft) + fetch_type_t ft, + int *resync) { mfn_t sl4mfn; shadow_l4e_t *sl4e; @@ -2051,6 +1741,11 @@ static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v, ASSERT((r & SHADOW_SET_FLUSH) == 0); if ( r & SHADOW_SET_ERROR ) return NULL; + +#if 
(SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) + *resync |= 1; +#endif + } /* Now follow it down a level. Guaranteed to succeed. */ return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va); @@ -2061,14 +1756,15 @@ static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v, static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v, walk_t *gw, mfn_t *sl2mfn, - fetch_type_t ft) + fetch_type_t ft, + int *resync) { #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */ mfn_t sl3mfn = _mfn(INVALID_MFN); shadow_l3e_t *sl3e; if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */ /* Get the l3e */ - sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft); + sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft, resync); if ( sl3e == NULL ) return NULL; if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) { @@ -2100,6 +1796,11 @@ static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v, ASSERT((r & SHADOW_SET_FLUSH) == 0); if ( r & SHADOW_SET_ERROR ) return NULL; + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) + *resync |= 1; +#endif + } /* Now follow it down a level. Guaranteed to succeed. */ return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va); @@ -2132,11 +1833,13 @@ static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v, fetch_type_t ft) { mfn_t sl2mfn; + int resync = 0; shadow_l2e_t *sl2e; /* Get the l2e */ - sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft); + sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft, &resync); if ( sl2e == NULL ) return NULL; + /* Install the sl1 in the l2e if it wasn't there or if we need to * re-do it to fix a PSE dirty bit. */ if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT @@ -2182,6 +1885,14 @@ static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v, ASSERT((r & SHADOW_SET_FLUSH) == 0); if ( r & SHADOW_SET_ERROR ) return NULL; + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) + /* All pages walked are now pagetables. Safe to resync pages + in case level 4 or 3 shadows were set. 
*/ + if ( resync ) + shadow_resync_all(v, 0); +#endif + /* This next line is important: in 32-on-PAE and 32-on-64 modes, * the guest l1 table has an 8k shadow, and we need to return * the right mfn of the pair. This call will set it for us as a @@ -2463,6 +2174,10 @@ static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se) sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow); else result |= SHADOW_SET_ERROR; + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) + shadow_resync_all(v, 0); +#endif } l4e_propagate_from_guest(v, new_gl4e, sl3mfn, &new_sl4e, ft_prefetch); @@ -2515,6 +2230,10 @@ static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se) sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow); else result |= SHADOW_SET_ERROR; + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) + shadow_resync_all(v, 0); +#endif } l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch); result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn); @@ -3173,6 +2892,7 @@ static int sh_page_fault(struct vcpu *v, fetch_type_t ft = 0; p2m_type_t p2mt; uint32_t rc; + int version; #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION int fast_emul = 0; #endif @@ -3316,7 +3036,14 @@ static int sh_page_fault(struct vcpu *v, } rewalk: - rc = guest_walk_tables(v, va, &gw, regs->error_code); + + /* The walk is done in a lock-free style, with some sanity check + * postponed after grabbing shadow lock later. Those delayed checks + * will make sure no inconsistent mapping being translated into + * shadow page table. 
*/ + version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version); + rmb(); + rc = sh_walk_guest_tables(v, va, &gw, regs->error_code); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) regs->error_code &= ~PFEC_page_present; @@ -3392,7 +3119,7 @@ static int sh_page_fault(struct vcpu *v, } #endif /* OOS */ - if ( !shadow_check_gwalk(v, va, &gw) ) + if ( !shadow_check_gwalk(v, va, &gw, version) ) { perfc_incr(shadow_inconsistent_gwalk); shadow_unlock(d); @@ -3869,7 +3596,7 @@ sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec) return vtlb_gfn; #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ - if ( guest_walk_tables(v, va, &gw, pfec[0]) != 0 ) + if ( sh_walk_guest_tables(v, va, &gw, pfec[0]) != 0 ) { if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) ) pfec[0] &= ~PFEC_page_present; diff --git a/xen/arch/x86/mm/shadow/types.h b/xen/arch/x86/mm/shadow/types.h index 440d2d31fb..f9f0554c47 100644 --- a/xen/arch/x86/mm/shadow/types.h +++ b/xen/arch/x86/mm/shadow/types.h @@ -191,169 +191,13 @@ static inline shadow_l4e_t shadow_l4e_from_mfn(mfn_t mfn, u32 flags) }) #endif - -/* Type of the guest's frame numbers */ -TYPE_SAFE(unsigned long,gfn) -#define SH_PRI_gfn "05lx" - -#define VALID_GFN(m) (m != INVALID_GFN) - -static inline int -valid_gfn(gfn_t m) -{ - return VALID_GFN(gfn_x(m)); -} - -static inline paddr_t -gfn_to_paddr(gfn_t gfn) -{ - return ((paddr_t)gfn_x(gfn)) << PAGE_SHIFT; -} - -/* Override gfn_to_mfn to work with gfn_t */ -#undef gfn_to_mfn -#define gfn_to_mfn(d, g, t) _gfn_to_mfn((d), gfn_x(g), (t)) +/* The shadow types needed for the various levels. 
*/ #if GUEST_PAGING_LEVELS == 2 - -#include "../page-guest32.h" - -#define GUEST_L1_PAGETABLE_ENTRIES 1024 -#define GUEST_L2_PAGETABLE_ENTRIES 1024 -#define GUEST_L1_PAGETABLE_SHIFT 12 -#define GUEST_L2_PAGETABLE_SHIFT 22 - -/* Types of the guest's page tables */ -typedef l1_pgentry_32_t guest_l1e_t; -typedef l2_pgentry_32_t guest_l2e_t; -typedef intpte_32_t guest_intpte_t; - -/* Access functions for them */ -static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e) -{ return l1e_get_paddr_32(gl1e); } -static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e) -{ return l2e_get_paddr_32(gl2e); } - -static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e) -{ return _gfn(l1e_get_paddr_32(gl1e) >> PAGE_SHIFT); } -static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e) -{ return _gfn(l2e_get_paddr_32(gl2e) >> PAGE_SHIFT); } - -static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e) -{ return l1e_get_flags_32(gl1e); } -static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e) -{ return l2e_get_flags_32(gl2e); } - -static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags) -{ l1e_add_flags_32(gl1e, flags); return gl1e; } -static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags) -{ l2e_add_flags_32(gl2e, flags); return gl2e; } - -static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags) -{ return l1e_from_pfn_32(gfn_x(gfn), flags); } -static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags) -{ return l2e_from_pfn_32(gfn_x(gfn), flags); } - -#define guest_l1_table_offset(a) l1_table_offset_32(a) -#define guest_l2_table_offset(a) l2_table_offset_32(a) - -/* The shadow types needed for the various levels. 
*/ #define SH_type_l1_shadow SH_type_l1_32_shadow #define SH_type_l2_shadow SH_type_l2_32_shadow #define SH_type_fl1_shadow SH_type_fl1_32_shadow - -#else /* GUEST_PAGING_LEVELS != 2 */ - -#if GUEST_PAGING_LEVELS == 3 -#define GUEST_L1_PAGETABLE_ENTRIES 512 -#define GUEST_L2_PAGETABLE_ENTRIES 512 -#define GUEST_L3_PAGETABLE_ENTRIES 4 -#define GUEST_L1_PAGETABLE_SHIFT 12 -#define GUEST_L2_PAGETABLE_SHIFT 21 -#define GUEST_L3_PAGETABLE_SHIFT 30 -#else /* GUEST_PAGING_LEVELS == 4 */ -#define GUEST_L1_PAGETABLE_ENTRIES 512 -#define GUEST_L2_PAGETABLE_ENTRIES 512 -#define GUEST_L3_PAGETABLE_ENTRIES 512 -#define GUEST_L4_PAGETABLE_ENTRIES 512 -#define GUEST_L1_PAGETABLE_SHIFT 12 -#define GUEST_L2_PAGETABLE_SHIFT 21 -#define GUEST_L3_PAGETABLE_SHIFT 30 -#define GUEST_L4_PAGETABLE_SHIFT 39 -#endif - -/* Types of the guest's page tables */ -typedef l1_pgentry_t guest_l1e_t; -typedef l2_pgentry_t guest_l2e_t; -typedef l3_pgentry_t guest_l3e_t; -#if GUEST_PAGING_LEVELS >= 4 -typedef l4_pgentry_t guest_l4e_t; -#endif -typedef intpte_t guest_intpte_t; - -/* Access functions for them */ -static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e) -{ return l1e_get_paddr(gl1e); } -static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e) -{ return l2e_get_paddr(gl2e); } -static inline paddr_t guest_l3e_get_paddr(guest_l3e_t gl3e) -{ return l3e_get_paddr(gl3e); } -#if GUEST_PAGING_LEVELS >= 4 -static inline paddr_t guest_l4e_get_paddr(guest_l4e_t gl4e) -{ return l4e_get_paddr(gl4e); } -#endif - -static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e) -{ return _gfn(l1e_get_paddr(gl1e) >> PAGE_SHIFT); } -static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e) -{ return _gfn(l2e_get_paddr(gl2e) >> PAGE_SHIFT); } -static inline gfn_t guest_l3e_get_gfn(guest_l3e_t gl3e) -{ return _gfn(l3e_get_paddr(gl3e) >> PAGE_SHIFT); } -#if GUEST_PAGING_LEVELS >= 4 -static inline gfn_t guest_l4e_get_gfn(guest_l4e_t gl4e) -{ return _gfn(l4e_get_paddr(gl4e) >> PAGE_SHIFT); } -#endif - -static 
inline u32 guest_l1e_get_flags(guest_l1e_t gl1e) -{ return l1e_get_flags(gl1e); } -static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e) -{ return l2e_get_flags(gl2e); } -static inline u32 guest_l3e_get_flags(guest_l3e_t gl3e) -{ return l3e_get_flags(gl3e); } -#if GUEST_PAGING_LEVELS >= 4 -static inline u32 guest_l4e_get_flags(guest_l4e_t gl4e) -{ return l4e_get_flags(gl4e); } -#endif - -static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags) -{ l1e_add_flags(gl1e, flags); return gl1e; } -static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags) -{ l2e_add_flags(gl2e, flags); return gl2e; } -static inline guest_l3e_t guest_l3e_add_flags(guest_l3e_t gl3e, u32 flags) -{ l3e_add_flags(gl3e, flags); return gl3e; } -#if GUEST_PAGING_LEVELS >= 4 -static inline guest_l4e_t guest_l4e_add_flags(guest_l4e_t gl4e, u32 flags) -{ l4e_add_flags(gl4e, flags); return gl4e; } -#endif - -static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags) -{ return l1e_from_pfn(gfn_x(gfn), flags); } -static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags) -{ return l2e_from_pfn(gfn_x(gfn), flags); } -static inline guest_l3e_t guest_l3e_from_gfn(gfn_t gfn, u32 flags) -{ return l3e_from_pfn(gfn_x(gfn), flags); } -#if GUEST_PAGING_LEVELS >= 4 -static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags) -{ return l4e_from_pfn(gfn_x(gfn), flags); } -#endif - -#define guest_l1_table_offset(a) l1_table_offset(a) -#define guest_l2_table_offset(a) l2_table_offset(a) -#define guest_l3_table_offset(a) l3_table_offset(a) -#define guest_l4_table_offset(a) l4_table_offset(a) - -/* The shadow types needed for the various levels. 
*/ -#if GUEST_PAGING_LEVELS == 3 +#elif GUEST_PAGING_LEVELS == 3 #define SH_type_l1_shadow SH_type_l1_pae_shadow #define SH_type_fl1_shadow SH_type_fl1_pae_shadow #define SH_type_l2_shadow SH_type_l2_pae_shadow @@ -367,35 +211,6 @@ static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags) #define SH_type_l4_shadow SH_type_l4_64_shadow #endif -#endif /* GUEST_PAGING_LEVELS != 2 */ - - -/* Type used for recording a walk through guest pagetables. It is - * filled in by the pagetable walk function, and also used as a cache - * for later walks. When we encounter a suporpage l2e, we fabricate an - * l1e for propagation to the shadow (for splintering guest superpages - * into many shadow l1 entries). */ -typedef struct shadow_walk_t walk_t; -struct shadow_walk_t -{ - unsigned long va; /* Address we were looking for */ -#if GUEST_PAGING_LEVELS >= 3 -#if GUEST_PAGING_LEVELS >= 4 - guest_l4e_t l4e; /* Guest's level 4 entry */ -#endif - guest_l3e_t l3e; /* Guest's level 3 entry */ -#endif - guest_l2e_t l2e; /* Guest's level 2 entry */ - guest_l1e_t l1e; /* Guest's level 1 entry (or fabrication) */ -#if GUEST_PAGING_LEVELS >= 4 - mfn_t l4mfn; /* MFN that the level 4 entry was in */ - mfn_t l3mfn; /* MFN that the level 3 entry was in */ -#endif - mfn_t l2mfn; /* MFN that the level 2 entry was in */ - mfn_t l1mfn; /* MFN that the level 1 entry was in */ - int version; /* Saved guest dirty version */ -}; - /* macros for dealing with the naming of the internal function names of the * shadow code's external entry points. 
*/ @@ -460,17 +275,9 @@ struct shadow_walk_t #define MFN_FITS_IN_HVM_CR3(_MFN) !(mfn_x(_MFN) >> 20) #endif -#define SH_PRI_pte PRIpte - -#if GUEST_PAGING_LEVELS == 2 -#define SH_PRI_gpte "08x" -#else /* GUEST_PAGING_LEVELS >= 3 */ -#ifndef __x86_64__ -#define SH_PRI_gpte "016llx" -#else -#define SH_PRI_gpte "016lx" -#endif -#endif /* GUEST_PAGING_LEVELS >= 3 */ +#define SH_PRI_pte PRIpte +#define SH_PRI_gpte PRI_gpte +#define SH_PRI_gfn PRI_gfn #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c index 8bf5b4a8e3..05a3e47d90 100644 --- a/xen/arch/x86/msi.c +++ b/xen/arch/x86/msi.c @@ -212,9 +212,9 @@ static void write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) entry->msg = *msg; } -void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +void set_msi_affinity(unsigned int vector, cpumask_t mask) { - struct msi_desc *desc = irq_desc[irq].msi_desc; + struct msi_desc *desc = irq_desc[vector].msi_desc; struct msi_msg msg; unsigned int dest; @@ -227,7 +227,7 @@ void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) if ( !desc ) return; - ASSERT(spin_is_locked(&irq_desc[irq].lock)); + ASSERT(spin_is_locked(&irq_desc[vector].lock)); spin_lock(&desc->dev->lock); read_msi_msg(desc, &msg); @@ -276,9 +276,9 @@ static void msix_set_enable(struct pci_dev *dev, int enable) } } -static void msix_flush_writes(unsigned int irq) +static void msix_flush_writes(unsigned int vector) { - struct msi_desc *entry = irq_desc[irq].msi_desc; + struct msi_desc *entry = irq_desc[vector].msi_desc; BUG_ON(!entry || !entry->dev); switch (entry->msi_attrib.type) { @@ -298,11 +298,18 @@ static void msix_flush_writes(unsigned int irq) } } -static void msi_set_mask_bit(unsigned int irq, int flag) +int msi_maskable_irq(const struct msi_desc *entry) { - struct msi_desc *entry = irq_desc[irq].msi_desc; + BUG_ON(!entry); + return entry->msi_attrib.type != PCI_CAP_ID_MSI + || entry->msi_attrib.maskbit; +} - 
ASSERT(spin_is_locked(&irq_desc[irq].lock)); +static void msi_set_mask_bit(unsigned int vector, int flag) +{ + struct msi_desc *entry = irq_desc[vector].msi_desc; + + ASSERT(spin_is_locked(&irq_desc[vector].lock)); BUG_ON(!entry || !entry->dev); switch (entry->msi_attrib.type) { case PCI_CAP_ID_MSI: @@ -318,8 +325,6 @@ static void msi_set_mask_bit(unsigned int irq, int flag) mask_bits &= ~(1); mask_bits |= flag; pci_conf_write32(bus, slot, func, pos, mask_bits); - } else { - msi_set_enable(entry->dev, !flag); } break; case PCI_CAP_ID_MSIX: @@ -337,16 +342,16 @@ static void msi_set_mask_bit(unsigned int irq, int flag) entry->msi_attrib.masked = !!flag; } -void mask_msi_irq(unsigned int irq) +void mask_msi_vector(unsigned int vector) { - msi_set_mask_bit(irq, 1); - msix_flush_writes(irq); + msi_set_mask_bit(vector, 1); + msix_flush_writes(vector); } -void unmask_msi_irq(unsigned int irq) +void unmask_msi_vector(unsigned int vector) { - msi_set_mask_bit(irq, 0); - msix_flush_writes(irq); + msi_set_mask_bit(vector, 0); + msix_flush_writes(vector); } static struct msi_desc* alloc_msi_entry(void) @@ -649,7 +654,7 @@ static int __pci_enable_msix(struct msi_info *msi) pos = pci_find_cap_offset(msi->bus, slot, func, PCI_CAP_ID_MSIX); control = pci_conf_read16(msi->bus, slot, func, msi_control_reg(pos)); nr_entries = multi_msix_capable(control); - if (msi->entry_nr > nr_entries) + if (msi->entry_nr >= nr_entries) { spin_unlock(&pdev->lock); return -EINVAL; diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index 4bdae0fcd1..833f22e5ba 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -230,7 +230,6 @@ static void __init percpu_init_areas(void) static void __init init_idle_domain(void) { struct domain *idle_domain; - unsigned int i; /* Domain creation requires that scheduler structures are initialised. 
*/ scheduler_init(); @@ -243,12 +242,6 @@ static void __init init_idle_domain(void) idle_vcpu[0] = this_cpu(curr_vcpu) = current; setup_idle_pagetable(); - - for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i) - idle_domain->arch.mm_perdomain_pt[FIRST_RESERVED_GDT_PAGE + i] = - l1e_from_page(virt_to_page(boot_cpu_gdt_table) + i, - __PAGE_HYPERVISOR); - } static void __init srat_detect_node(int cpu) @@ -456,6 +449,7 @@ void __init __start_xen(unsigned long mbi_p) parse_video_info(); set_current((struct vcpu *)0xfffff000); /* debug sanity */ + idle_vcpu[0] = current; set_processor_id(0); /* needed early, for smp_processor_id() */ if ( cpu_has_efer ) rdmsrl(MSR_EFER, this_cpu(efer)); diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c index 73c115d44a..2382fc3da7 100644 --- a/xen/arch/x86/smpboot.c +++ b/xen/arch/x86/smpboot.c @@ -821,7 +821,7 @@ static int __devinit do_boot_cpu(int apicid, int cpu) */ { unsigned long boot_error; - unsigned int i; + unsigned int order; int timeout; unsigned long start_eip; unsigned short nmi_high = 0, nmi_low = 0; @@ -857,21 +857,21 @@ static int __devinit do_boot_cpu(int apicid, int cpu) gdt = per_cpu(gdt_table, cpu); if (gdt == boot_cpu_gdt_table) { - i = get_order_from_pages(NR_RESERVED_GDT_PAGES); + order = get_order_from_pages(NR_RESERVED_GDT_PAGES); #ifdef __x86_64__ #ifdef CONFIG_COMPAT - page = alloc_domheap_pages(NULL, i, + page = alloc_domheap_pages(NULL, order, MEMF_node(cpu_to_node(cpu))); per_cpu(compat_gdt_table, cpu) = gdt = page_to_virt(page); memcpy(gdt, boot_cpu_compat_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE); gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu; #endif - page = alloc_domheap_pages(NULL, i, + page = alloc_domheap_pages(NULL, order, MEMF_node(cpu_to_node(cpu))); per_cpu(gdt_table, cpu) = gdt = page_to_virt(page); #else - per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(i); + per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order); #endif memcpy(gdt, boot_cpu_gdt_table, 
NR_RESERVED_GDT_PAGES * PAGE_SIZE); @@ -879,13 +879,6 @@ static int __devinit do_boot_cpu(int apicid, int cpu) gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu; } - for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i) - v->domain->arch.mm_perdomain_pt - [(v->vcpu_id << GDT_LDT_VCPU_SHIFT) + - FIRST_RESERVED_GDT_PAGE + i] - = l1e_from_page(virt_to_page(gdt) + i, - __PAGE_HYPERVISOR); - #ifdef __i386__ if (!per_cpu(doublefault_tss, cpu)) { per_cpu(doublefault_tss, cpu) = alloc_xenheap_page(); diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c index 8e86e7180c..037811fde8 100644 --- a/xen/arch/x86/time.c +++ b/xen/arch/x86/time.c @@ -56,9 +56,12 @@ struct cpu_time { }; struct platform_timesource { + char *id; char *name; u64 frequency; u64 (*read_counter)(void); + int (*init)(struct platform_timesource *); + void (*resume)(struct platform_timesource *); int counter_bits; }; @@ -360,15 +363,22 @@ static u64 read_pit_count(void) return count32; } -static void init_pit(struct platform_timesource *pts) +static int init_pit(struct platform_timesource *pts) { - pts->name = "PIT"; - pts->frequency = CLOCK_TICK_RATE; - pts->read_counter = read_pit_count; - pts->counter_bits = 32; using_pit = 1; + return 1; } +static struct platform_timesource plt_pit = +{ + .id = "pit", + .name = "PIT", + .frequency = CLOCK_TICK_RATE, + .read_counter = read_pit_count, + .counter_bits = 32, + .init = init_pit +}; + /************************************************************ * PLATFORM TIMER 2: HIGH PRECISION EVENT TIMER (HPET) */ @@ -385,14 +395,28 @@ static int init_hpet(struct platform_timesource *pts) if ( hpet_rate == 0 ) return 0; - pts->name = "HPET"; pts->frequency = hpet_rate; - pts->read_counter = read_hpet_count; - pts->counter_bits = 32; - return 1; } +static void resume_hpet(struct platform_timesource *pts) +{ + u64 hpet_rate = hpet_setup(); + + BUG_ON(hpet_rate == 0); + pts->frequency = hpet_rate; +} + +static struct platform_timesource plt_hpet = +{ + .id = "hpet", + .name 
= "HPET", + .read_counter = read_hpet_count, + .counter_bits = 32, + .init = init_hpet, + .resume = resume_hpet +}; + /************************************************************ * PLATFORM TIMER 3: IBM 'CYCLONE' TIMER */ @@ -440,20 +464,24 @@ static int init_cyclone(struct platform_timesource *pts) printk(KERN_ERR "Cyclone: Could not find valid CBAR value.\n"); return 0; } - + /* Enable timer and map the counter register. */ *(map_cyclone_reg(base + CYCLONE_PMCC_OFFSET)) = 1; *(map_cyclone_reg(base + CYCLONE_MPCS_OFFSET)) = 1; cyclone_timer = map_cyclone_reg(base + CYCLONE_MPMC_OFFSET); - - pts->name = "IBM Cyclone"; - pts->frequency = CYCLONE_TIMER_FREQ; - pts->read_counter = read_cyclone_count; - pts->counter_bits = 32; - return 1; } +static struct platform_timesource plt_cyclone = +{ + .id = "cyclone", + .name = "IBM Cyclone", + .frequency = CYCLONE_TIMER_FREQ, + .read_counter = read_cyclone_count, + .counter_bits = 32, + .init = init_cyclone +}; + /************************************************************ * PLATFORM TIMER 4: ACPI PM TIMER */ @@ -473,14 +501,19 @@ static int init_pmtimer(struct platform_timesource *pts) if ( pmtmr_ioport == 0 ) return 0; - pts->name = "ACPI PM Timer"; - pts->frequency = ACPI_PM_FREQUENCY; - pts->read_counter = read_pmtimer_count; - pts->counter_bits = 24; - return 1; } +static struct platform_timesource plt_pmtimer = +{ + .id = "acpi", + .name = "ACPI PM Timer", + .frequency = ACPI_PM_FREQUENCY, + .read_counter = read_pmtimer_count, + .counter_bits = 24, + .init = init_pmtimer +}; + /************************************************************ * GENERIC PLATFORM TIMER INFRASTRUCTURE */ @@ -548,26 +581,34 @@ static void platform_time_calibration(void) static void resume_platform_timer(void) { - /* No change in platform_stime across suspend/resume. 
*/ - platform_timer_stamp = plt_stamp64; + /* Timer source can be reset when backing from S3 to S0 */ + if ( plt_src.resume ) + plt_src.resume(&plt_src); + + plt_stamp64 = platform_timer_stamp; plt_stamp = plt_src.read_counter(); } static void init_platform_timer(void) { - struct platform_timesource *pts = &plt_src; - int rc = -1; + static struct platform_timesource * const plt_timers[] = { + &plt_cyclone, &plt_hpet, &plt_pmtimer, &plt_pit + }; + + struct platform_timesource *pts = NULL; + int i, rc = -1; if ( opt_clocksource[0] != '\0' ) { - if ( !strcmp(opt_clocksource, "pit") ) - rc = (init_pit(pts), 1); - else if ( !strcmp(opt_clocksource, "hpet") ) - rc = init_hpet(pts); - else if ( !strcmp(opt_clocksource, "cyclone") ) - rc = init_cyclone(pts); - else if ( !strcmp(opt_clocksource, "acpi") ) - rc = init_pmtimer(pts); + for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ ) + { + pts = plt_timers[i]; + if ( !strcmp(opt_clocksource, pts->id) ) + { + rc = pts->init(pts); + break; + } + } if ( rc <= 0 ) printk("WARNING: %s clocksource '%s'.\n", @@ -575,11 +616,17 @@ static void init_platform_timer(void) opt_clocksource); } - if ( (rc <= 0) && - !init_cyclone(pts) && - !init_hpet(pts) && - !init_pmtimer(pts) ) - init_pit(pts); + if ( rc <= 0 ) + { + for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ ) + { + pts = plt_timers[i]; + if ( (rc = pts->init(pts)) > 0 ) + break; + } + } + + BUG_ON(rc <= 0); plt_mask = (u64)~0ull >> (64 - pts->counter_bits); @@ -588,6 +635,7 @@ static void init_platform_timer(void) plt_overflow_period = scale_delta( 1ull << (pts->counter_bits-1), &plt_scale); init_timer(&plt_overflow_timer, plt_overflow, NULL, 0); + plt_src = *pts; plt_overflow(NULL); platform_timer_stamp = plt_stamp64; @@ -1172,6 +1220,9 @@ int time_suspend(void) cmos_utc_offset = -get_cmos_time(); cmos_utc_offset += (wc_sec + (wc_nsec + NOW()) / 1000000000ULL); kill_timer(&calibration_timer); + + /* Sync platform timer stamps. 
*/ + platform_time_calibration(); } /* Better to cancel calibration timer for accuracy. */ @@ -1184,19 +1235,18 @@ int time_resume(void) { /*u64 tmp = */init_pit_and_calibrate_tsc(); - disable_pit_irq(); - /* Disable this while calibrate_tsc_ap() also is skipped. */ /*set_time_scale(&this_cpu(cpu_time).tsc_scale, tmp);*/ resume_platform_timer(); + disable_pit_irq(); + init_percpu_time(); do_settime(get_cmos_time() + cmos_utc_offset, 0, NOW()); - if ( !is_idle_vcpu(current) ) - update_vcpu_system_time(current); + update_vcpu_system_time(current); return 0; } diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index c4e9d30597..a315ea1c8e 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -723,7 +723,8 @@ static void pv_cpuid(struct cpu_user_regs *regs) { /* Modify Feature Information. */ __clear_bit(X86_FEATURE_VME, &d); - __clear_bit(X86_FEATURE_PSE, &d); + if ( !opt_allow_hugepage ) + __clear_bit(X86_FEATURE_PSE, &d); __clear_bit(X86_FEATURE_PGE, &d); __clear_bit(X86_FEATURE_MCE, &d); __clear_bit(X86_FEATURE_MCA, &d); @@ -754,6 +755,7 @@ static void pv_cpuid(struct cpu_user_regs *regs) __clear_bit(X86_FEATURE_XTPR % 32, &c); __clear_bit(X86_FEATURE_PDCM % 32, &c); __clear_bit(X86_FEATURE_DCA % 32, &c); + __set_bit(X86_FEATURE_HYPERVISOR % 32, &c); break; case 0x80000001: /* Modify Feature Information. */ @@ -2003,9 +2005,12 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) case 4: /* Read CR4 */ /* * Guests can read CR4 to see what features Xen has enabled. We - * therefore lie about PGE & PSE as they are unavailable to guests. + * therefore lie about PGE as it is unavailable to guests. + * Also disallow PSE if hugepages are not enabled. 
*/ - *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE); + *reg = read_cr4() & ~X86_CR4_PGE; + if ( !opt_allow_hugepage ) + *reg &= ~X86_CR4_PSE; break; default: diff --git a/xen/arch/x86/x86_32/mm.c b/xen/arch/x86/x86_32/mm.c index ea3c18ca88..d19f04f9f0 100644 --- a/xen/arch/x86/x86_32/mm.c +++ b/xen/arch/x86/x86_32/mm.c @@ -132,30 +132,6 @@ void __init setup_idle_pagetable(void) __PAGE_HYPERVISOR)); } -unsigned long clone_idle_pagetable(struct vcpu *v) -{ - unsigned int i; - struct domain *d = v->domain; - l3_pgentry_t *l3_table = v->arch.pae_l3_cache.table[0]; - l2_pgentry_t *l2_table = alloc_xenheap_page(); - - if ( !l2_table ) - return 0; - - memcpy(l3_table, idle_pg_table, L3_PAGETABLE_ENTRIES * sizeof(*l3_table)); - l3_table[l3_table_offset(PERDOMAIN_VIRT_START)] = - l3e_from_page(virt_to_page(l2_table), _PAGE_PRESENT); - - copy_page(l2_table, idle_pg_table_l2 + - l3_table_offset(PERDOMAIN_VIRT_START) * L2_PAGETABLE_ENTRIES); - for ( i = 0; i < PDPT_L2_ENTRIES; ++i ) - l2_table[l2_table_offset(PERDOMAIN_VIRT_START) + i] = - l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i, - __PAGE_HYPERVISOR); - - return __pa(l3_table); -} - void __init zap_low_mappings(l2_pgentry_t *dom0_l2) { int i; diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index 49ee4565e0..bc2302b6a3 100644 --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -21,7 +21,6 @@ #include <xen/lib.h> #include <xen/init.h> #include <xen/mm.h> -#include <xen/numa.h> #include <xen/sched.h> #include <xen/guest_access.h> #include <asm/current.h> @@ -207,24 +206,6 @@ void __init setup_idle_pagetable(void) __PAGE_HYPERVISOR)); } -unsigned long clone_idle_pagetable(struct vcpu *v) -{ - struct domain *d = v->domain; - struct page_info *page = alloc_domheap_page(NULL, - MEMF_node(vcpu_to_node(v))); - l4_pgentry_t *l4_table = page_to_virt(page); - - if ( !page ) - return 0; - - copy_page(l4_table, idle_pg_table); - l4_table[l4_table_offset(PERDOMAIN_VIRT_START)] = - 
l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3), - __PAGE_HYPERVISOR); - - return __pa(l4_table); -} - void __init zap_low_mappings(void) { BUG_ON(num_online_cpus() != 1); diff --git a/xen/common/kernel.c b/xen/common/kernel.c index f3af91f2ed..9137671817 100644 --- a/xen/common/kernel.c +++ b/xen/common/kernel.c @@ -222,7 +222,8 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDLE(void) arg) #ifdef CONFIG_X86 if ( !is_hvm_vcpu(current) ) fi.submap |= (1U << XENFEAT_mmu_pt_update_preserve_ad) | - (1U << XENFEAT_highmem_assist); + (1U << XENFEAT_highmem_assist) | + (1U << XENFEAT_gnttab_map_avail_bits); #endif break; default: diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c index 4229e2355f..fa41b210e0 100644 --- a/xen/drivers/passthrough/io.c +++ b/xen/drivers/passthrough/io.c @@ -62,7 +62,7 @@ int pt_irq_create_bind_vtd( struct dev_intx_gsi_link *digl; int pirq = pt_irq_bind->machine_irq; - if ( pirq < 0 || pirq >= NR_PIRQS ) + if ( pirq < 0 || pirq >= NR_IRQS ) return -EINVAL; spin_lock(&d->event_lock); @@ -261,7 +261,7 @@ void hvm_dpci_msi_eoi(struct domain *d, int vector) spin_lock(&d->event_lock); pirq = hvm_irq_dpci->msi_gvec_pirq[vector]; - if ( ( pirq >= 0 ) && (pirq < NR_PIRQS) && + if ( ( pirq >= 0 ) && (pirq < NR_IRQS) && test_bit(pirq, hvm_irq_dpci->mapping) && (test_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[pirq].flags))) { diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c index 72c123ada4..f75a6132d7 100644 --- a/xen/drivers/passthrough/pci.c +++ b/xen/drivers/passthrough/pci.c @@ -171,9 +171,9 @@ static void pci_clean_dpci_irqs(struct domain *d) hvm_irq_dpci = domain_get_irq_dpci(d); if ( hvm_irq_dpci != NULL ) { - for ( i = find_first_bit(hvm_irq_dpci->mapping, NR_PIRQS); - i < NR_PIRQS; - i = find_next_bit(hvm_irq_dpci->mapping, NR_PIRQS, i + 1) ) + for ( i = find_first_bit(hvm_irq_dpci->mapping, NR_IRQS); + i < NR_IRQS; + i = find_next_bit(hvm_irq_dpci->mapping, NR_IRQS, i + 1) ) { pirq_guest_unbind(d, 
i); kill_timer(&hvm_irq_dpci->hvm_timer[irq_to_vector(i)]); diff --git a/xen/drivers/passthrough/vtd/dmar.c b/xen/drivers/passthrough/vtd/dmar.c index 43107b3ae3..93531c65a2 100644 --- a/xen/drivers/passthrough/vtd/dmar.c +++ b/xen/drivers/passthrough/vtd/dmar.c @@ -351,7 +351,9 @@ acpi_parse_one_rmrr(struct acpi_dmar_entry_header *header) if ( rmrr->base_address >= rmrr->end_address ) { - dprintk(XENLOG_ERR VTDPREFIX, "RMRR is incorrect.\n"); + dprintk(XENLOG_ERR VTDPREFIX, + "RMRR error: base_addr %"PRIx64" end_address %"PRIx64"\n", + rmrr->base_address, rmrr->end_address); return -EFAULT; } diff --git a/xen/drivers/passthrough/vtd/ia64/vtd.c b/xen/drivers/passthrough/vtd/ia64/vtd.c index 42e94f7ef5..b0abcf6929 100644 --- a/xen/drivers/passthrough/vtd/ia64/vtd.c +++ b/xen/drivers/passthrough/vtd/ia64/vtd.c @@ -21,6 +21,7 @@ #include <xen/sched.h> #include <xen/domain_page.h> #include <xen/iommu.h> +#include <xen/numa.h> #include <asm/xensystem.h> #include <asm/sal.h> #include "../iommu.h" @@ -44,12 +45,12 @@ void unmap_vtd_domain_page(void *va) } /* Allocate page table, return its machine address */ -u64 alloc_pgtable_maddr(void) +u64 alloc_pgtable_maddr(struct domain *d) { struct page_info *pg; u64 *vaddr; - pg = alloc_domheap_page(NULL, 0); + pg = alloc_domheap_page(NULL, d ? 
MEMF_node(domain_to_node(d)) : 0); vaddr = map_domain_page(page_to_mfn(pg)); if ( !vaddr ) return 0; diff --git a/xen/drivers/passthrough/vtd/intremap.c b/xen/drivers/passthrough/vtd/intremap.c index 059ebf5a24..c9a73f50c4 100644 --- a/xen/drivers/passthrough/vtd/intremap.c +++ b/xen/drivers/passthrough/vtd/intremap.c @@ -30,6 +30,10 @@ #include "vtd.h" #include "extern.h" +#ifndef dest_SMI +#define dest_SMI -1 +#endif + u16 apicid_to_bdf(int apic_id) { struct acpi_drhd_unit *drhd = ioapic_to_drhd(apic_id); @@ -207,7 +211,7 @@ unsigned int io_apic_read_remap_rte( remap_rte = (struct IO_APIC_route_remap_entry *) &old_rte; - if ( remap_rte->format == 0 ) + if ( (remap_rte->format == 0) || (old_rte.delivery_mode == dest_SMI) ) { *IO_APIC_BASE(apic) = rte_upper ? (reg + 1) : reg; return *(IO_APIC_BASE(apic)+4); @@ -253,6 +257,31 @@ void io_apic_write_remap_rte( remap_rte = (struct IO_APIC_route_remap_entry *) &old_rte; + if ( old_rte.delivery_mode == dest_SMI ) + { + /* Some BIOS does not zero out reserve fields in IOAPIC + * RTE's. clear_IO_APIC() zeroes out all RTE's except for RTE + * with MSI delivery type. This is a problem when the host + * OS converts SMI delivery type to some other type but leaving + * the reserved field uninitialized. This can cause interrupt + * remapping table out of bound error if "format" field is 1 + * and the "index" field has a value that that is larger than + * the maximum index of interrupt remapping table. + */ + if ( remap_rte->format == 1 ) + { + remap_rte->format = 0; + *IO_APIC_BASE(apic) = reg; + *(IO_APIC_BASE(apic)+4) = *(((u32 *)&old_rte)+0); + *IO_APIC_BASE(apic) = reg + 1; + *(IO_APIC_BASE(apic)+4) = *(((u32 *)&old_rte)+1); + } + + *IO_APIC_BASE(apic) = rte_upper ? 
(reg + 1) : reg; + *(IO_APIC_BASE(apic)+4) = value; + return; + } + /* mask the interrupt while we change the intremap table */ saved_mask = remap_rte->mask; remap_rte->mask = 1; @@ -473,7 +502,7 @@ int intremap_setup(struct iommu *iommu) ir_ctrl = iommu_ir_ctrl(iommu); if ( ir_ctrl->iremap_maddr == 0 ) { - ir_ctrl->iremap_maddr = alloc_pgtable_maddr(); + ir_ctrl->iremap_maddr = alloc_pgtable_maddr(NULL); if ( ir_ctrl->iremap_maddr == 0 ) { dprintk(XENLOG_WARNING VTDPREFIX, diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c index 28ce715c6b..2a3310fadf 100644 --- a/xen/drivers/passthrough/vtd/iommu.c +++ b/xen/drivers/passthrough/vtd/iommu.c @@ -148,7 +148,7 @@ static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus) root = &root_entries[bus]; if ( !root_present(*root) ) { - maddr = alloc_pgtable_maddr(); + maddr = alloc_pgtable_maddr(NULL); if ( maddr == 0 ) { unmap_vtd_domain_page(root_entries); @@ -205,7 +205,7 @@ static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc) addr &= (((u64)1) << addr_width) - 1; spin_lock_irqsave(&hd->mapping_lock, flags); if ( hd->pgd_maddr == 0 ) - if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr()) == 0) ) + if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain)) == 0) ) goto out; parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr); @@ -218,7 +218,7 @@ static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc) { if ( !alloc ) break; - maddr = alloc_pgtable_maddr(); + maddr = alloc_pgtable_maddr(domain); if ( !maddr ) break; dma_set_pte_addr(*pte, maddr); @@ -605,7 +605,7 @@ static int iommu_set_root_entry(struct iommu *iommu) spin_lock_irqsave(&iommu->register_lock, flags); if ( iommu->root_maddr == 0 ) - iommu->root_maddr = alloc_pgtable_maddr(); + iommu->root_maddr = alloc_pgtable_maddr(NULL); if ( iommu->root_maddr == 0 ) { spin_unlock_irqrestore(&iommu->register_lock, flags); @@ -634,7 +634,7 @@ static int 
iommu_set_root_entry(struct iommu *iommu) return 0; } -static int iommu_enable_translation(struct iommu *iommu) +static void iommu_enable_translation(struct iommu *iommu) { u32 sts; unsigned long flags; @@ -661,7 +661,6 @@ static int iommu_enable_translation(struct iommu *iommu) /* Disable PMRs when VT-d engine takes effect per spec definition */ disable_pmr(iommu); spin_unlock_irqrestore(&iommu->register_lock, flags); - return 0; } int iommu_disable_translation(struct iommu *iommu) @@ -1046,8 +1045,7 @@ static int intel_iommu_domain_init(struct domain *d) for_each_drhd_unit ( drhd ) { iommu = drhd->iommu; - if ( iommu_enable_translation(iommu) ) - return -EIO; + iommu_enable_translation(iommu); } } @@ -1799,14 +1797,14 @@ static int intel_iommu_group_id(u8 bus, u8 devfn) } static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS]; -int iommu_suspend(void) +void iommu_suspend(void) { struct acpi_drhd_unit *drhd; struct iommu *iommu; u32 i; if ( !vtd_enabled ) - return 0; + return; iommu_flush_all(); @@ -1824,18 +1822,16 @@ int iommu_suspend(void) iommu_state[i][DMAR_FEUADDR_REG] = (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG); } - - return 0; } -int iommu_resume(void) +void iommu_resume(void) { struct acpi_drhd_unit *drhd; struct iommu *iommu; u32 i; if ( !vtd_enabled ) - return 0; + return; iommu_flush_all(); @@ -1855,12 +1851,8 @@ int iommu_resume(void) (u32) iommu_state[i][DMAR_FEADDR_REG]); dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32) iommu_state[i][DMAR_FEUADDR_REG]); - - if ( iommu_enable_translation(iommu) ) - return -EIO; + iommu_enable_translation(iommu); } - - return 0; } struct iommu_ops intel_iommu_ops = { diff --git a/xen/drivers/passthrough/vtd/qinval.c b/xen/drivers/passthrough/vtd/qinval.c index d90242d708..048089350d 100644 --- a/xen/drivers/passthrough/vtd/qinval.c +++ b/xen/drivers/passthrough/vtd/qinval.c @@ -426,7 +426,7 @@ int qinval_setup(struct iommu *iommu) if ( qi_ctrl->qinval_maddr == 0 ) { - qi_ctrl->qinval_maddr = alloc_pgtable_maddr(); + 
qi_ctrl->qinval_maddr = alloc_pgtable_maddr(NULL); if ( qi_ctrl->qinval_maddr == 0 ) { dprintk(XENLOG_WARNING VTDPREFIX, diff --git a/xen/drivers/passthrough/vtd/vtd.h b/xen/drivers/passthrough/vtd/vtd.h index 84cd2e5f8a..ec02d129d8 100644 --- a/xen/drivers/passthrough/vtd/vtd.h +++ b/xen/drivers/passthrough/vtd/vtd.h @@ -101,7 +101,7 @@ unsigned int get_cache_line_size(void); void cacheline_flush(char *); void flush_all_cache(void); void *map_to_nocache_virt(int nr_iommus, u64 maddr); -u64 alloc_pgtable_maddr(void); +u64 alloc_pgtable_maddr(struct domain *d); void free_pgtable_maddr(u64 maddr); void *map_vtd_domain_page(u64 maddr); void unmap_vtd_domain_page(void *va); diff --git a/xen/drivers/passthrough/vtd/x86/vtd.c b/xen/drivers/passthrough/vtd/x86/vtd.c index 83d7704256..31dc561881 100644 --- a/xen/drivers/passthrough/vtd/x86/vtd.c +++ b/xen/drivers/passthrough/vtd/x86/vtd.c @@ -22,6 +22,7 @@ #include <xen/domain_page.h> #include <asm/paging.h> #include <xen/iommu.h> +#include <xen/numa.h> #include "../iommu.h" #include "../dmar.h" #include "../vtd.h" @@ -37,13 +38,13 @@ void unmap_vtd_domain_page(void *va) } /* Allocate page table, return its machine address */ -u64 alloc_pgtable_maddr(void) +u64 alloc_pgtable_maddr(struct domain *d) { struct page_info *pg; u64 *vaddr; unsigned long mfn; - pg = alloc_domheap_page(NULL, 0); + pg = alloc_domheap_page(NULL, d ? 
MEMF_node(domain_to_node(d)) : 0); if ( !pg ) return 0; mfn = page_to_mfn(pg); @@ -121,9 +122,9 @@ void hvm_dpci_isairq_eoi(struct domain *d, unsigned int isairq) return; } /* Multiple mirq may be mapped to one isa irq */ - for ( i = find_first_bit(dpci->mapping, NR_PIRQS); - i < NR_PIRQS; - i = find_next_bit(dpci->mapping, NR_PIRQS, i + 1) ) + for ( i = find_first_bit(dpci->mapping, NR_IRQS); + i < NR_IRQS; + i = find_next_bit(dpci->mapping, NR_IRQS, i + 1) ) { list_for_each_entry_safe ( digl, tmp, &dpci->mirq[i].digl_list, list ) diff --git a/xen/include/asm-ia64/hvm/irq.h b/xen/include/asm-ia64/hvm/irq.h index d163e56e36..32d0164101 100644 --- a/xen/include/asm-ia64/hvm/irq.h +++ b/xen/include/asm-ia64/hvm/irq.h @@ -24,9 +24,7 @@ #include <xen/irq.h> -#define NR_VECTORS 256 #define VIOAPIC_NUM_PINS 48 -#define NR_PIRQS 256 #include <xen/hvm/irq.h> diff --git a/xen/include/asm-ia64/linux/asm/irq.h b/xen/include/asm-ia64/linux/asm/irq.h index 687f49fefa..c0fd725c0d 100644 --- a/xen/include/asm-ia64/linux/asm/irq.h +++ b/xen/include/asm-ia64/linux/asm/irq.h @@ -11,6 +11,7 @@ * 02/29/00 D.Mosberger moved most things into hw_irq.h */ +#define NR_VECTORS 256 #define NR_IRQS 256 #define NR_IRQ_VECTORS NR_IRQS diff --git a/xen/include/asm-x86/acpi.h b/xen/include/asm-x86/acpi.h index 5ae806faa9..459978e491 100644 --- a/xen/include/asm-x86/acpi.h +++ b/xen/include/asm-x86/acpi.h @@ -166,4 +166,7 @@ extern u8 x86_acpiid_to_apicid[]; extern int acpi_dmar_init(void); +/* Incremented whenever we transition through S3. Value is 1 during boot. 
*/ +extern uint32_t system_reset_counter; + #endif /*__X86_ASM_ACPI_H*/ diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h index 815eb4a95d..aa9b234370 100644 --- a/xen/include/asm-x86/config.h +++ b/xen/include/asm-x86/config.h @@ -51,6 +51,12 @@ #define NR_CPUS 32 #endif +#ifdef MAX_PHYS_IRQS +#define NR_IRQS MAX_PHYS_IRQS +#else +#define NR_IRQS 256 +#endif + #if defined(__i386__) && (NR_CPUS > 32) #error "Maximum of 32 physical processors supported by Xen on x86_32" #endif diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h index 18f6aff015..481236c833 100644 --- a/xen/include/asm-x86/cpufeature.h +++ b/xen/include/asm-x86/cpufeature.h @@ -94,6 +94,7 @@ #define X86_FEATURE_SSE4_2 (4*32+20) /* Streaming SIMD Extensions 4.2 */ #define X86_FEATURE_X2APIC (4*32+21) /* Extended xAPIC */ #define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running under some hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ #define X86_FEATURE_XSTORE (5*32+ 2) /* on-CPU RNG present (xstore insn) */ diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index 3acab04f27..1589615363 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -6,7 +6,6 @@ #include <asm/hvm/vcpu.h> #include <asm/hvm/domain.h> #include <asm/e820.h> -#include <asm/pirq.h> #define has_32bit_shinfo(d) ((d)->arch.has_32bit_shinfo) #define is_pv_32bit_domain(d) ((d)->arch.is_32bit_pv) @@ -237,7 +236,7 @@ struct arch_domain /* NB. protected by d->event_lock and by irq_desc[vector].lock */ int vector_pirq[NR_VECTORS]; - int pirq_vector[NR_PIRQS]; + s16 pirq_vector[NR_IRQS]; /* Pseudophysical e820 map (XENMEM_memory_map). 
*/ struct e820entry e820[3]; diff --git a/xen/include/asm-x86/guest_pt.h b/xen/include/asm-x86/guest_pt.h new file mode 100644 index 0000000000..d44e622ddc --- /dev/null +++ b/xen/include/asm-x86/guest_pt.h @@ -0,0 +1,291 @@ +/****************************************************************************** + * xen/asm-x86/guest_pt.h + * + * Types and accessors for guest pagetable entries, as distinct from + * Xen's pagetable types. + * + * Users must #define GUEST_PAGING_LEVELS to 2, 3 or 4 before including + * this file. + * + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _XEN_ASM_GUEST_PT_H +#define _XEN_ASM_GUEST_PT_H + +/* Type of the guest's frame numbers */ +TYPE_SAFE(unsigned long,gfn) +#define PRI_gfn "05lx" + +#define VALID_GFN(m) (m != INVALID_GFN) + +static inline int +valid_gfn(gfn_t m) +{ + return VALID_GFN(gfn_x(m)); +} + +static inline paddr_t +gfn_to_paddr(gfn_t gfn) +{ + return ((paddr_t)gfn_x(gfn)) << PAGE_SHIFT; +} + +/* Override gfn_to_mfn to work with gfn_t */ +#undef gfn_to_mfn +#define gfn_to_mfn(d, g, t) _gfn_to_mfn((d), gfn_x(g), (t)) + + +/* Types of the guest's page tables and access functions for them */ + +#if GUEST_PAGING_LEVELS == 2 + +#define GUEST_L1_PAGETABLE_ENTRIES 1024 +#define GUEST_L2_PAGETABLE_ENTRIES 1024 +#define GUEST_L1_PAGETABLE_SHIFT 12 +#define GUEST_L2_PAGETABLE_SHIFT 22 + +typedef uint32_t guest_intpte_t; +typedef struct { guest_intpte_t l1; } guest_l1e_t; +typedef struct { guest_intpte_t l2; } guest_l2e_t; + +#define PRI_gpte "08x" + +static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e) +{ return ((paddr_t) gl1e.l1) & (PADDR_MASK & PAGE_MASK); } +static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e) +{ return ((paddr_t) gl2e.l2) & (PADDR_MASK & PAGE_MASK); } + +static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e) +{ return _gfn(guest_l1e_get_paddr(gl1e) >> PAGE_SHIFT); } +static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e) +{ return _gfn(guest_l2e_get_paddr(gl2e) >> PAGE_SHIFT); } + +static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e) +{ return gl1e.l1 & 0xfff; } +static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e) +{ return gl2e.l2 & 0xfff; } + +static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags) +{ return (guest_l1e_t) { (gfn_x(gfn) << PAGE_SHIFT) | flags }; } +static inline guest_l2e_t guest_l2e_from_gfn(gfn_t 
gfn, u32 flags) +{ return (guest_l2e_t) { (gfn_x(gfn) << PAGE_SHIFT) | flags }; } + +#define guest_l1_table_offset(_va) \ + (((_va) >> GUEST_L1_PAGETABLE_SHIFT) & (GUEST_L1_PAGETABLE_ENTRIES - 1)) +#define guest_l2_table_offset(_va) \ + (((_va) >> GUEST_L2_PAGETABLE_SHIFT) & (GUEST_L2_PAGETABLE_ENTRIES - 1)) + +#else /* GUEST_PAGING_LEVELS != 2 */ + +#if GUEST_PAGING_LEVELS == 3 +#define GUEST_L1_PAGETABLE_ENTRIES 512 +#define GUEST_L2_PAGETABLE_ENTRIES 512 +#define GUEST_L3_PAGETABLE_ENTRIES 4 +#define GUEST_L1_PAGETABLE_SHIFT 12 +#define GUEST_L2_PAGETABLE_SHIFT 21 +#define GUEST_L3_PAGETABLE_SHIFT 30 +#else /* GUEST_PAGING_LEVELS == 4 */ +#define GUEST_L1_PAGETABLE_ENTRIES 512 +#define GUEST_L2_PAGETABLE_ENTRIES 512 +#define GUEST_L3_PAGETABLE_ENTRIES 512 +#define GUEST_L4_PAGETABLE_ENTRIES 512 +#define GUEST_L1_PAGETABLE_SHIFT 12 +#define GUEST_L2_PAGETABLE_SHIFT 21 +#define GUEST_L3_PAGETABLE_SHIFT 30 +#define GUEST_L4_PAGETABLE_SHIFT 39 +#endif + +typedef l1_pgentry_t guest_l1e_t; +typedef l2_pgentry_t guest_l2e_t; +typedef l3_pgentry_t guest_l3e_t; +#if GUEST_PAGING_LEVELS >= 4 +typedef l4_pgentry_t guest_l4e_t; +#endif +typedef intpte_t guest_intpte_t; + +#define PRI_gpte "016"PRIx64 + +static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e) +{ return l1e_get_paddr(gl1e); } +static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e) +{ return l2e_get_paddr(gl2e); } +static inline paddr_t guest_l3e_get_paddr(guest_l3e_t gl3e) +{ return l3e_get_paddr(gl3e); } +#if GUEST_PAGING_LEVELS >= 4 +static inline paddr_t guest_l4e_get_paddr(guest_l4e_t gl4e) +{ return l4e_get_paddr(gl4e); } +#endif + +static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e) +{ return _gfn(l1e_get_paddr(gl1e) >> PAGE_SHIFT); } +static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e) +{ return _gfn(l2e_get_paddr(gl2e) >> PAGE_SHIFT); } +static inline gfn_t guest_l3e_get_gfn(guest_l3e_t gl3e) +{ return _gfn(l3e_get_paddr(gl3e) >> PAGE_SHIFT); } +#if GUEST_PAGING_LEVELS >= 4 +static 
inline gfn_t guest_l4e_get_gfn(guest_l4e_t gl4e) +{ return _gfn(l4e_get_paddr(gl4e) >> PAGE_SHIFT); } +#endif + +static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e) +{ return l1e_get_flags(gl1e); } +static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e) +{ return l2e_get_flags(gl2e); } +static inline u32 guest_l3e_get_flags(guest_l3e_t gl3e) +{ return l3e_get_flags(gl3e); } +#if GUEST_PAGING_LEVELS >= 4 +static inline u32 guest_l4e_get_flags(guest_l4e_t gl4e) +{ return l4e_get_flags(gl4e); } +#endif + +static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags) +{ return l1e_from_pfn(gfn_x(gfn), flags); } +static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags) +{ return l2e_from_pfn(gfn_x(gfn), flags); } +static inline guest_l3e_t guest_l3e_from_gfn(gfn_t gfn, u32 flags) +{ return l3e_from_pfn(gfn_x(gfn), flags); } +#if GUEST_PAGING_LEVELS >= 4 +static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags) +{ return l4e_from_pfn(gfn_x(gfn), flags); } +#endif + +#define guest_l1_table_offset(a) l1_table_offset(a) +#define guest_l2_table_offset(a) l2_table_offset(a) +#define guest_l3_table_offset(a) l3_table_offset(a) +#define guest_l4_table_offset(a) l4_table_offset(a) + +#endif /* GUEST_PAGING_LEVELS != 2 */ + + +/* Which pagetable features are supported on this vcpu? */ + +static inline int +guest_supports_superpages(struct vcpu *v) +{ + /* The _PAGE_PSE bit must be honoured in HVM guests, whenever + * CR4.PSE is set or the guest is in PAE or long mode. + * It's also used in the dummy PT for vcpus with CR4.PG cleared. */ + return (is_hvm_vcpu(v) && + (GUEST_PAGING_LEVELS != 2 + || !hvm_paging_enabled(v) + || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE))); +} + +static inline int +guest_supports_nx(struct vcpu *v) +{ + if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx ) + return 0; + if ( !is_hvm_vcpu(v) ) + return cpu_has_nx; + return hvm_nx_enabled(v); +} + + + +/* Type used for recording a walk through guest pagetables. 
It is + * filled in by the pagetable walk function, and also used as a cache + * for later walks. When we encounter a superpage l2e, we fabricate an + * l1e for propagation to the shadow (for splintering guest superpages + * into many shadow l1 entries). */ +typedef struct guest_pagetable_walk walk_t; +struct guest_pagetable_walk +{ + unsigned long va; /* Address we were looking for */ +#if GUEST_PAGING_LEVELS >= 3 +#if GUEST_PAGING_LEVELS >= 4 + guest_l4e_t l4e; /* Guest's level 4 entry */ +#endif + guest_l3e_t l3e; /* Guest's level 3 entry */ +#endif + guest_l2e_t l2e; /* Guest's level 2 entry */ + guest_l1e_t l1e; /* Guest's level 1 entry (or fabrication) */ +#if GUEST_PAGING_LEVELS >= 4 + mfn_t l4mfn; /* MFN that the level 4 entry was in */ + mfn_t l3mfn; /* MFN that the level 3 entry was in */ +#endif + mfn_t l2mfn; /* MFN that the level 2 entry was in */ + mfn_t l1mfn; /* MFN that the level 1 entry was in */ +}; + +/* Given a walk_t, translate the gw->va into the guest's notion of the + * corresponding frame number. */ +static inline gfn_t +guest_walk_to_gfn(walk_t *gw) +{ + if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) ) + return _gfn(INVALID_GFN); + return guest_l1e_get_gfn(gw->l1e); +} + +/* Given a walk_t, translate the gw->va into the guest's notion of the + * corresponding physical address. */ +static inline paddr_t +guest_walk_to_gpa(walk_t *gw) +{ + if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) ) + return 0; + return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK); +} + +/* Walk the guest pagetables, after the manner of a hardware walker. + * + * Inputs: a vcpu, a virtual address, a walk_t to fill, a + * pointer to a pagefault code, the MFN of the guest's + * top-level pagetable, and a mapping of the + * guest's top-level pagetable. + * + * We walk the vcpu's guest pagetables, filling the walk_t with what we + * see and adding any Accessed and Dirty bits that are needed in the + * guest entries. 
Using the pagefault code, we check the permissions as + * we go. For the purposes of reading pagetables we treat all non-RAM + * memory as contining zeroes. + * + * Returns 0 for success, or the set of permission bits that we failed on + * if the walk did not complete. */ + +/* Macro-fu so you can call guest_walk_tables() and get the right one. */ +#define GPT_RENAME2(_n, _l) _n ## _ ## _l ## _levels +#define GPT_RENAME(_n, _l) GPT_RENAME2(_n, _l) +#define guest_walk_tables GPT_RENAME(guest_walk_tables, GUEST_PAGING_LEVELS) + +extern uint32_t +guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, + uint32_t pfec, mfn_t top_mfn, void *top_map); + +/* Pretty-print the contents of a guest-walk */ +static inline void print_gw(walk_t *gw) +{ + gdprintk(XENLOG_INFO, "GUEST WALK TO %#lx:\n", gw->va); +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ + gdprintk(XENLOG_INFO, " l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn)); + gdprintk(XENLOG_INFO, " l4e=%" PRI_gpte "\n", gw->l4e.l4); + gdprintk(XENLOG_INFO, " l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn)); +#endif /* PAE or 64... */ + gdprintk(XENLOG_INFO, " l3e=%" PRI_gpte "\n", gw->l3e.l3); +#endif /* All levels... 
*/ + gdprintk(XENLOG_INFO, " l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn)); + gdprintk(XENLOG_INFO, " l2e=%" PRI_gpte "\n", gw->l2e.l2); + gdprintk(XENLOG_INFO, " l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn)); + gdprintk(XENLOG_INFO, " l1e=%" PRI_gpte "\n", gw->l1e.l1); +} + +#endif /* _XEN_ASM_GUEST_PT_H */ diff --git a/xen/include/asm-x86/hpet.h b/xen/include/asm-x86/hpet.h index b63f56805d..82c08bc3c5 100644 --- a/xen/include/asm-x86/hpet.h +++ b/xen/include/asm-x86/hpet.h @@ -24,6 +24,10 @@ #define HPET_T2_CMP 0x148 #define HPET_T2_ROUTE 0x150 +#define HPET_Tn_CFG(n) (HPET_T0_CFG + n * 0x20) +#define HPET_Tn_CMP(n) (HPET_T0_CMP + n * 0x20) +#define HPET_Tn_ROUTE(n) (HPET_T0_ROUTE + n * 0x20) + #define HPET_ID_VENDOR 0xffff0000 #define HPET_ID_LEGSUP 0x00008000 #define HPET_ID_NUMBER 0x00001f00 diff --git a/xen/include/asm-x86/hvm/irq.h b/xen/include/asm-x86/hvm/irq.h index f2220526fd..1f23124279 100644 --- a/xen/include/asm-x86/hvm/irq.h +++ b/xen/include/asm-x86/hvm/irq.h @@ -22,7 +22,6 @@ #ifndef __ASM_X86_HVM_IRQ_H__ #define __ASM_X86_HVM_IRQ_H__ -#include <asm/pirq.h> #include <xen/hvm/irq.h> #include <asm/hvm/hvm.h> #include <asm/hvm/vpic.h> diff --git a/xen/include/asm-x86/hvm/vlapic.h b/xen/include/asm-x86/hvm/vlapic.h index 3f34e47950..8c36ed5a00 100644 --- a/xen/include/asm-x86/hvm/vlapic.h +++ b/xen/include/asm-x86/hvm/vlapic.h @@ -93,8 +93,7 @@ void vlapic_msr_set(struct vlapic *vlapic, uint64_t value); int vlapic_accept_pic_intr(struct vcpu *v); -struct vlapic *apic_round_robin( - struct domain *d, uint8_t vector, uint32_t bitmap); +struct vlapic *apic_lowest_prio(struct domain *d, uint32_t bitmap); int vlapic_match_logical_addr(struct vlapic *vlapic, uint8_t mda); diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h index 0430a46c1b..563e8aea36 100644 --- a/xen/include/asm-x86/hvm/vmx/vmx.h +++ b/xen/include/asm-x86/hvm/vmx/vmx.h @@ -351,9 +351,9 @@ static inline int __vmxon(u64 addr) return rc; } -void 
vmx_inject_hw_exception(struct vcpu *v, int trap, int error_code); -void vmx_inject_extint(struct vcpu *v, int trap); -void vmx_inject_nmi(struct vcpu *v); +void vmx_inject_hw_exception(int trap, int error_code); +void vmx_inject_extint(int trap); +void vmx_inject_nmi(void); void ept_p2m_init(struct domain *d); diff --git a/xen/include/asm-x86/irq.h b/xen/include/asm-x86/irq.h index 982f99f3c4..920ac7f85e 100644 --- a/xen/include/asm-x86/irq.h +++ b/xen/include/asm-x86/irq.h @@ -18,7 +18,7 @@ #define vector_to_irq(vec) (vector_irq[vec]) extern int vector_irq[NR_VECTORS]; -extern u8 irq_vector[NR_IRQ_VECTORS]; +extern u8 irq_vector[NR_IRQS]; #define AUTO_ASSIGN -1 #define NEVER_ASSIGN -2 #define FREE_TO_ASSIGN -3 diff --git a/xen/include/asm-x86/mach-default/irq_vectors.h b/xen/include/asm-x86/mach-default/irq_vectors.h index 90b4e1ef0e..057b2a35b8 100644 --- a/xen/include/asm-x86/mach-default/irq_vectors.h +++ b/xen/include/asm-x86/mach-default/irq_vectors.h @@ -30,8 +30,4 @@ #define NR_VECTORS 256 -/* Limited by number of trap vectors. 
*/ -#define NR_IRQS NR_VECTORS -#define NR_IRQ_VECTORS NR_IRQS - #endif /* _ASM_IRQ_VECTORS_H */ diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 005b6603e2..d017c4cb56 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -263,6 +263,7 @@ pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab); int check_descriptor(const struct domain *, struct desc_struct *d); +extern int opt_allow_hugepage; /****************************************************************************** * With shadow pagetables, the different kinds of address start diff --git a/xen/include/asm-x86/msi.h b/xen/include/asm-x86/msi.h index c72f9d69c5..6ca1a76898 100644 --- a/xen/include/asm-x86/msi.h +++ b/xen/include/asm-x86/msi.h @@ -69,9 +69,9 @@ struct msi_msg { }; /* Helper functions */ -extern void mask_msi_irq(unsigned int irq); -extern void unmask_msi_irq(unsigned int irq); -extern void set_msi_irq_affinity(unsigned int irq, cpumask_t mask); +extern void mask_msi_vector(unsigned int vector); +extern void unmask_msi_vector(unsigned int vector); +extern void set_msi_affinity(unsigned int vector, cpumask_t mask); extern int pci_enable_msi(struct msi_info *msi); extern void pci_disable_msi(int vector); extern void pci_cleanup_msi(struct pci_dev *pdev); @@ -97,6 +97,8 @@ struct msi_desc { int remap_index; /* index in interrupt remapping table */ }; +int msi_maskable_irq(const struct msi_desc *); + /* * Assume the maximum number of hot plug slots supported by the system is about * ten. The worstcase is that each of these slots is hot-added with a device, diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h index e17a9469e2..9ccfdb8502 100644 --- a/xen/include/asm-x86/page.h +++ b/xen/include/asm-x86/page.h @@ -215,7 +215,10 @@ void clear_page_sse2(void *); #define clear_page(_p) (cpu_has_xmm2 ? 
\ clear_page_sse2((void *)(_p)) : \ (void)memset((void *)(_p), 0, PAGE_SIZE)) -#define copy_page(_t,_f) memcpy((void *)(_t), (void *)(_f), PAGE_SIZE) +void copy_page_sse2(void *, const void *); +#define copy_page(_t,_f) (cpu_has_xmm2 ? \ + copy_page_sse2(_t, _f) : \ + (void)memcpy(_t, _f, PAGE_SIZE)) #define mfn_valid(mfn) ((mfn) < max_page) @@ -278,7 +281,6 @@ extern unsigned int m2p_compat_vstart; #endif void paging_init(void); void setup_idle_pagetable(void); -unsigned long clone_idle_pagetable(struct vcpu *); #endif /* !defined(__ASSEMBLY__) */ #define _PAGE_PRESENT 0x001U diff --git a/xen/include/asm-x86/perfc_defn.h b/xen/include/asm-x86/perfc_defn.h index 784aa9eb5a..47739fa19b 100644 --- a/xen/include/asm-x86/perfc_defn.h +++ b/xen/include/asm-x86/perfc_defn.h @@ -33,6 +33,7 @@ PERFCOUNTER(ptwr_emulations, "writable pt emulations") PERFCOUNTER(exception_fixed, "pre-exception fixed") +PERFCOUNTER(guest_walk, "guest pagetable walks") /* Shadow counters */ PERFCOUNTER(shadow_alloc, "calls to shadow_alloc") @@ -92,7 +93,6 @@ PERFCOUNTER(shadow_unshadow, "shadow unshadows a page") PERFCOUNTER(shadow_up_pointer, "shadow unshadow by up-pointer") PERFCOUNTER(shadow_unshadow_bf, "shadow unshadow brute-force") PERFCOUNTER(shadow_get_page_fail, "shadow_get_page_from_l1e failed") -PERFCOUNTER(shadow_guest_walk, "shadow walks guest tables") PERFCOUNTER(shadow_check_gwalk, "shadow checks gwalk") PERFCOUNTER(shadow_inconsistent_gwalk, "shadow check inconsistent gwalk") PERFCOUNTER(shadow_rm_write_flush_tlb, diff --git a/xen/include/asm-x86/pirq.h b/xen/include/asm-x86/pirq.h deleted file mode 100644 index 2041262134..0000000000 --- a/xen/include/asm-x86/pirq.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef __XEN_PIRQ_H -#define __XEN_PIRQ_H - -#define PIRQ_BASE 0 -#define NR_PIRQS 256 - -#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS) -#define NR_DYNIRQS 256 - -#endif /* __XEN_PIRQ_H */ - diff --git a/xen/include/asm-x86/x86_32/page.h b/xen/include/asm-x86/x86_32/page.h index 
16659a1ae3..aef51f51af 100644 --- a/xen/include/asm-x86/x86_32/page.h +++ b/xen/include/asm-x86/x86_32/page.h @@ -112,7 +112,7 @@ extern unsigned int PAGE_HYPERVISOR_NOCACHE; #define BASE_DISALLOW_MASK (0xFFFFF198U & ~_PAGE_NX) #define L1_DISALLOW_MASK (BASE_DISALLOW_MASK | _PAGE_GNTTAB) -#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK) +#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK & ~_PAGE_PSE) #define L3_DISALLOW_MASK 0xFFFFF1FEU /* must-be-zero */ #endif /* __X86_32_PAGE_H__ */ diff --git a/xen/include/asm-x86/x86_64/page.h b/xen/include/asm-x86/x86_64/page.h index 948cd656f0..ac44a9a1c1 100644 --- a/xen/include/asm-x86/x86_64/page.h +++ b/xen/include/asm-x86/x86_64/page.h @@ -115,7 +115,7 @@ typedef l4_pgentry_t root_pgentry_t; #define BASE_DISALLOW_MASK (0xFF800198U & ~_PAGE_NX) #define L1_DISALLOW_MASK (BASE_DISALLOW_MASK | _PAGE_GNTTAB) -#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK) +#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK & ~_PAGE_PSE) #define L3_DISALLOW_MASK (BASE_DISALLOW_MASK) #define L4_DISALLOW_MASK (BASE_DISALLOW_MASK) diff --git a/xen/include/public/features.h b/xen/include/public/features.h index 16e5ee4d49..879131cda1 100644 --- a/xen/include/public/features.h +++ b/xen/include/public/features.h @@ -62,6 +62,12 @@ /* x86: Does this Xen host support the MMU_{CLEAR,COPY}_PAGE hypercall? */ #define XENFEAT_highmem_assist 6 +/* + * If set, GNTTABOP_map_grant_ref honors flags to be placed into guest kernel + * available pte bits. + */ +#define XENFEAT_gnttab_map_avail_bits 7 + #define XENFEAT_NR_SUBMAPS 1 #endif /* __XEN_PUBLIC_FEATURES_H__ */ diff --git a/xen/include/public/grant_table.h b/xen/include/public/grant_table.h index 26f2c35b18..ad116e71e1 100644 --- a/xen/include/public/grant_table.h +++ b/xen/include/public/grant_table.h @@ -360,7 +360,7 @@ DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_and_replace_t); /* - * Bitfield values for update_pin_status.flags. + * Bitfield values for gnttab_map_grant_ref.flags. 
*/ /* Map the grant entry for access by I/O devices. */ #define _GNTMAP_device_map (0) @@ -388,6 +388,13 @@ DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_and_replace_t); #define GNTMAP_contains_pte (1<<_GNTMAP_contains_pte) /* + * Bits to be placed in guest kernel available PTE bits (architecture + * dependent; only supported when XENFEAT_gnttab_map_avail_bits is set). + */ +#define _GNTMAP_guest_avail0 (16) +#define GNTMAP_guest_avail_mask ((uint32_t)~0 << _GNTMAP_guest_avail0) + +/* * Values for error status returns. All errors are -ve. */ #define GNTST_okay (0) /* Normal return. */ diff --git a/xen/include/public/io/pciif.h b/xen/include/public/io/pciif.h index 0a0ffcc6e2..7e75392599 100644 --- a/xen/include/public/io/pciif.h +++ b/xen/include/public/io/pciif.h @@ -30,14 +30,22 @@ /* xen_pci_sharedinfo flags */ #define _XEN_PCIF_active (0) #define XEN_PCIF_active (1<<_XEN_PCI_active) +#define _XEN_PCIB_AERHANDLER (1) +#define XEN_PCIB_AERHANDLER (1<<_XEN_PCIB_AERHANDLER) +#define _XEN_PCIB_active (2) +#define XEN_PCIB_active (1<<_XEN_PCIB_active) /* xen_pci_op commands */ -#define XEN_PCI_OP_conf_read (0) -#define XEN_PCI_OP_conf_write (1) -#define XEN_PCI_OP_enable_msi (2) -#define XEN_PCI_OP_disable_msi (3) -#define XEN_PCI_OP_enable_msix (4) -#define XEN_PCI_OP_disable_msix (5) +#define XEN_PCI_OP_conf_read (0) +#define XEN_PCI_OP_conf_write (1) +#define XEN_PCI_OP_enable_msi (2) +#define XEN_PCI_OP_disable_msi (3) +#define XEN_PCI_OP_enable_msix (4) +#define XEN_PCI_OP_disable_msix (5) +#define XEN_PCI_OP_aer_detected (6) +#define XEN_PCI_OP_aer_resume (7) +#define XEN_PCI_OP_aer_mmio (8) +#define XEN_PCI_OP_aer_slotreset (9) /* xen_pci_op error numbers */ #define XEN_PCI_ERR_success (0) @@ -82,10 +90,25 @@ struct xen_pci_op { struct xen_msix_entry msix_entries[SH_INFO_MAX_VEC]; }; +/*used for pcie aer handling*/ +struct xen_pcie_aer_op +{ + + /* IN: what action to perform: XEN_PCI_OP_* */ + uint32_t cmd; + /*IN/OUT: return aer_op result or carry error_detected state 
as input*/ + int32_t err; + + /* IN: which device to touch */ + uint32_t domain; /* PCI Domain/Segment*/ + uint32_t bus; + uint32_t devfn; +}; struct xen_pci_sharedinfo { /* flags - XEN_PCIF_* */ uint32_t flags; struct xen_pci_op op; + struct xen_pcie_aer_op aer_op; }; #endif /* __XEN_PCI_COMMON_H__ */ diff --git a/xen/include/public/kexec.h b/xen/include/public/kexec.h index fc19f2fe50..04252226a1 100644 --- a/xen/include/public/kexec.h +++ b/xen/include/public/kexec.h @@ -155,27 +155,6 @@ typedef struct xen_kexec_range { unsigned long start; } xen_kexec_range_t; -/* vmcoreinfo stuff */ -#define VMCOREINFO_BYTES (4096) -#define VMCOREINFO_NOTE_NAME "VMCOREINFO_XEN" -void arch_crash_save_vmcoreinfo(void); -void vmcoreinfo_append_str(const char *fmt, ...) - __attribute__ ((format (printf, 1, 2))); -#define VMCOREINFO_PAGESIZE(value) \ - vmcoreinfo_append_str("PAGESIZE=%ld\n", value) -#define VMCOREINFO_SYMBOL(name) \ - vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) -#define VMCOREINFO_SYMBOL_ALIAS(alias, name) \ - vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #alias, (unsigned long)&name) -#define VMCOREINFO_STRUCT_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%zu\n", #name, sizeof(struct name)) -#define VMCOREINFO_OFFSET(name, field) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ - (unsigned long)offsetof(struct name, field)) -#define VMCOREINFO_OFFSET_ALIAS(name, field, alias) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #alias, \ - (unsigned long)offsetof(struct name, field)) - #endif /* _XEN_PUBLIC_KEXEC_H */ /* diff --git a/xen/include/xen/hvm/irq.h b/xen/include/xen/hvm/irq.h index e77239f290..a89e2e9f88 100644 --- a/xen/include/xen/hvm/irq.h +++ b/xen/include/xen/hvm/irq.h @@ -63,7 +63,7 @@ struct hvm_girq_dpci_mapping { /* Protected by domain's event_lock */ struct hvm_irq_dpci { /* Machine IRQ to guest device/intx mapping. 
*/ - DECLARE_BITMAP(mapping, NR_PIRQS); + DECLARE_BITMAP(mapping, NR_IRQS); struct hvm_mirq_dpci_mapping mirq[NR_IRQS]; /* Guest IRQ to guest device/intx mapping. */ struct hvm_girq_dpci_mapping girq[NR_IRQS]; diff --git a/xen/include/xen/hypercall.h b/xen/include/xen/hypercall.h index 7d58109ec2..3997b2f96a 100644 --- a/xen/include/xen/hypercall.h +++ b/xen/include/xen/hypercall.h @@ -124,6 +124,12 @@ compat_memory_op( unsigned int cmd, XEN_GUEST_HANDLE(void) arg); +extern int +compat_vcpu_op( + int cmd, + int vcpuid, + XEN_GUEST_HANDLE(void) arg); + #endif #endif /* __XEN_HYPERCALL_H__ */ diff --git a/xen/include/xen/iommu.h b/xen/include/xen/iommu.h index f230df7b8e..d68b41200c 100644 --- a/xen/include/xen/iommu.h +++ b/xen/include/xen/iommu.h @@ -110,7 +110,7 @@ struct iommu_ops { void iommu_update_ire_from_apic(unsigned int apic, unsigned int reg, unsigned int value); void iommu_update_ire_from_msi(struct msi_desc *msi_desc, struct msi_msg *msg); -int iommu_suspend(void); -int iommu_resume(void); +void iommu_suspend(void); +void iommu_resume(void); #endif /* _IOMMU_H_ */ diff --git a/xen/include/xen/irq.h b/xen/include/xen/irq.h index a4dd3f6333..5b88079ddb 100644 --- a/xen/include/xen/irq.h +++ b/xen/include/xen/irq.h @@ -61,7 +61,7 @@ typedef struct { cpumask_t affinity; } __cacheline_aligned irq_desc_t; -extern irq_desc_t irq_desc[NR_IRQS]; +extern irq_desc_t irq_desc[NR_VECTORS]; extern int setup_irq(unsigned int, struct irqaction *); extern void free_irq(unsigned int); @@ -81,13 +81,16 @@ extern void pirq_guest_unbind(struct domain *d, int irq); extern irq_desc_t *domain_spin_lock_irq_desc( struct domain *d, int irq, unsigned long *pflags); -static inline void set_native_irq_info(int irq, cpumask_t mask) +static inline void set_native_irq_info(unsigned int vector, cpumask_t mask) { - irq_desc[irq].affinity = mask; + irq_desc[vector].affinity = mask; } +#ifdef irq_to_vector static inline void set_irq_info(int irq, cpumask_t mask) { - 
set_native_irq_info(irq, mask); + set_native_irq_info(irq_to_vector(irq), mask); } +#endif + #endif /* __XEN_IRQ_H__ */ diff --git a/xen/include/xen/kexec.h b/xen/include/xen/kexec.h index 9dc3dacac0..d78510e639 100644 --- a/xen/include/xen/kexec.h +++ b/xen/include/xen/kexec.h @@ -33,6 +33,27 @@ crash_xen_info_t *kexec_crash_save_info(void); void machine_crash_shutdown(void); int machine_kexec_get(xen_kexec_range_t *range); +/* vmcoreinfo stuff */ +#define VMCOREINFO_BYTES (4096) +#define VMCOREINFO_NOTE_NAME "VMCOREINFO_XEN" +void arch_crash_save_vmcoreinfo(void); +void vmcoreinfo_append_str(const char *fmt, ...) + __attribute__ ((format (printf, 1, 2))); +#define VMCOREINFO_PAGESIZE(value) \ + vmcoreinfo_append_str("PAGESIZE=%ld\n", value) +#define VMCOREINFO_SYMBOL(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) +#define VMCOREINFO_SYMBOL_ALIAS(alias, name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #alias, (unsigned long)&name) +#define VMCOREINFO_STRUCT_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%zu\n", #name, sizeof(struct name)) +#define VMCOREINFO_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ + (unsigned long)offsetof(struct name, field)) +#define VMCOREINFO_OFFSET_ALIAS(name, field, alias) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #alias, \ + (unsigned long)offsetof(struct name, field)) + #endif /* __XEN_KEXEC_H__ */ /* |