Diffstat (limited to 'xen/arch/x86')
38 files changed, 1814 insertions, 1016 deletions
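Several of the changes below share one theme: 0xcf8/0xcfc PCI config-space accesses move out of per-file helpers (the pci_read_byte()/pci_write_byte() pair deleted from cpu/amd.c) into common pci_conf_read8()/pci_conf_write8() routines backed by the new pci.o, and construct_dom0() stops dom0 from touching ports 0xcf8-0xcff directly. As a minimal sketch of the mechanism-1 config cycle those helpers implement -- standalone and illustrative only; the conf_read8() name and the sys/io.h/iopl() scaffolding are user-space test conveniences, not part of this patch:

#include <stdint.h>
#include <stdio.h>
#include <sys/io.h>   /* iopl(), outl(), inb() -- Linux/x86 user space */

static uint8_t conf_read8(uint32_t bus, uint32_t dev, uint32_t fn, uint32_t reg)
{
    /* CONFIG_ADDRESS (0xcf8): bit 31 enables the cycle; bus/dev/fn/reg
     * select the target dword, so the low two bits of reg are masked off. */
    outl((1U << 31) | (bus << 16) | (dev << 11) | (fn << 8) | (reg & ~3), 0xcf8);
    /* CONFIG_DATA (0xcfc-0xcff): the low two bits of reg pick the byte lane. */
    return inb(0xcfc + (reg & 3));
}

int main(void)
{
    if ( iopl(3) != 0 )   /* raw port access needs root */
        return 1;
    /* Low byte of the vendor ID of device 00:00.0. */
    printf("%02x\n", conf_read8(0, 0, 0, 0));
    return 0;
}

The address/data pairing is also why dom0 loses these ports: a guest write to CONFIG_ADDRESS (0xcf8) landing between Xen's own outl() and the matching inb() from CONFIG_DATA would redirect the access.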
diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile index 086a7b530f..334a996eb6 100644 --- a/xen/arch/x86/Makefile +++ b/xen/arch/x86/Makefile @@ -31,6 +31,7 @@ obj-y += mm.o obj-y += mpparse.o obj-y += nmi.o obj-y += numa.o +obj-y += pci.o obj-y += physdev.o obj-y += rwlock.o obj-y += setup.o diff --git a/xen/arch/x86/acpi/boot.c b/xen/arch/x86/acpi/boot.c index cfe87671e9..9a17d61e3b 100644 --- a/xen/arch/x86/acpi/boot.c +++ b/xen/arch/x86/acpi/boot.c @@ -374,6 +374,18 @@ extern u32 pmtmr_ioport; #endif #ifdef CONFIG_ACPI_SLEEP +#define acpi_fadt_copy_address(dst, src, len) do { \ + if (fadt->header.revision >= FADT2_REVISION_ID) \ + acpi_sinfo.dst##_blk = fadt->x##src##_block; \ + if (!acpi_sinfo.dst##_blk.address) { \ + acpi_sinfo.dst##_blk.address = fadt->src##_block; \ + acpi_sinfo.dst##_blk.space_id = ACPI_ADR_SPACE_SYSTEM_IO; \ + acpi_sinfo.dst##_blk.bit_width = fadt->len##_length << 3; \ + acpi_sinfo.dst##_blk.bit_offset = 0; \ + acpi_sinfo.dst##_blk.access_width = 0; \ + } \ +} while (0) + /* Get pm1x_cnt and pm1x_evt information for ACPI sleep */ static void __init acpi_fadt_parse_sleep_info(struct acpi_table_fadt *fadt) @@ -388,37 +400,18 @@ acpi_fadt_parse_sleep_info(struct acpi_table_fadt *fadt) goto bad; rsdp = __va(rsdp_phys); - if (fadt->header.revision >= FADT2_REVISION_ID) { - memcpy(&acpi_sinfo.pm1a_cnt_blk, &fadt->xpm1a_control_block, - sizeof(struct acpi_generic_address)); - memcpy(&acpi_sinfo.pm1b_cnt_blk, &fadt->xpm1b_control_block, - sizeof(struct acpi_generic_address)); - memcpy(&acpi_sinfo.pm1a_evt_blk, &fadt->xpm1a_event_block, - sizeof(struct acpi_generic_address)); - memcpy(&acpi_sinfo.pm1b_evt_blk, &fadt->xpm1b_event_block, - sizeof(struct acpi_generic_address)); - } else { - acpi_sinfo.pm1a_cnt_blk.address = fadt->pm1a_control_block; - acpi_sinfo.pm1b_cnt_blk.address = fadt->pm1b_control_block; - acpi_sinfo.pm1a_evt_blk.address = fadt->pm1a_event_block; - acpi_sinfo.pm1b_evt_blk.address = fadt->pm1b_event_block; - acpi_sinfo.pm1a_cnt_blk.space_id = ACPI_ADR_SPACE_SYSTEM_IO; - acpi_sinfo.pm1b_cnt_blk.space_id = ACPI_ADR_SPACE_SYSTEM_IO; - acpi_sinfo.pm1a_evt_blk.space_id = ACPI_ADR_SPACE_SYSTEM_IO; - acpi_sinfo.pm1b_evt_blk.space_id = ACPI_ADR_SPACE_SYSTEM_IO; - acpi_sinfo.pm1a_cnt_blk.bit_width = 16; - acpi_sinfo.pm1b_cnt_blk.bit_width = 16; - acpi_sinfo.pm1a_evt_blk.bit_width = 16; - acpi_sinfo.pm1b_evt_blk.bit_width = 16; - acpi_sinfo.pm1a_cnt_blk.bit_offset = 0; - acpi_sinfo.pm1b_cnt_blk.bit_offset = 0; - acpi_sinfo.pm1a_evt_blk.bit_offset = 0; - acpi_sinfo.pm1b_evt_blk.bit_offset = 0; - acpi_sinfo.pm1a_cnt_blk.access_width = 0; - acpi_sinfo.pm1b_cnt_blk.access_width = 0; - acpi_sinfo.pm1a_evt_blk.access_width = 0; - acpi_sinfo.pm1b_evt_blk.access_width = 0; - } + acpi_fadt_copy_address(pm1a_cnt, pm1a_control, pm1_control); + acpi_fadt_copy_address(pm1b_cnt, pm1b_control, pm1_control); + acpi_fadt_copy_address(pm1a_evt, pm1a_event, pm1_event); + acpi_fadt_copy_address(pm1b_evt, pm1b_event, pm1_event); + + printk(KERN_INFO PREFIX + "ACPI SLEEP INFO: pm1x_cnt[%"PRIx64",%"PRIx64"], " + "pm1x_evt[%"PRIx64",%"PRIx64"]\n", + acpi_sinfo.pm1a_cnt_blk.address, + acpi_sinfo.pm1b_cnt_blk.address, + acpi_sinfo.pm1a_evt_blk.address, + acpi_sinfo.pm1b_evt_blk.address); /* Now FACS... 
*/ if (fadt->header.revision >= FADT2_REVISION_ID) @@ -461,13 +454,6 @@ acpi_fadt_parse_sleep_info(struct acpi_table_fadt *fadt) } printk(KERN_INFO PREFIX - "ACPI SLEEP INFO: pm1x_cnt[%"PRIx64",%"PRIx64"], " - "pm1x_evt[%"PRIx64",%"PRIx64"]\n", - acpi_sinfo.pm1a_cnt_blk.address, - acpi_sinfo.pm1b_cnt_blk.address, - acpi_sinfo.pm1a_evt_blk.address, - acpi_sinfo.pm1b_evt_blk.address); - printk(KERN_INFO PREFIX " wakeup_vec[%"PRIx64"], vec_size[%x]\n", acpi_sinfo.wakeup_vector, acpi_sinfo.vector_width); return; diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c index 909a73f3fa..f0253152bc 100644 --- a/xen/arch/x86/cpu/amd.c +++ b/xen/arch/x86/cpu/amd.c @@ -3,6 +3,7 @@ #include <xen/bitops.h> #include <xen/mm.h> #include <xen/smp.h> +#include <xen/pci.h> #include <asm/io.h> #include <asm/msr.h> #include <asm/processor.h> @@ -66,19 +67,6 @@ static int c1_ramping_may_cause_clock_drift(struct cpuinfo_x86 *c) return 1; } -/* PCI access functions. Should be safe to use 0xcf8/0xcfc port accesses here. */ -static u8 pci_read_byte(u32 bus, u32 dev, u32 fn, u32 reg) -{ - outl((1U<<31) | (bus << 16) | (dev << 11) | (fn << 8) | (reg & ~3), 0xcf8); - return inb(0xcfc + (reg & 3)); -} - -static void pci_write_byte(u32 bus, u32 dev, u32 fn, u32 reg, u8 val) -{ - outl((1U<<31) | (bus << 16) | (dev << 11) | (fn << 8) | (reg & ~3), 0xcf8); - outb(val, 0xcfc + (reg & 3)); -} - /* * Disable C1-Clock ramping if enabled in PMM7.CpuLowPwrEnh on 8th-generation * cores only. Assume BIOS has setup all Northbridges equivalently. @@ -90,12 +78,12 @@ static void disable_c1_ramping(void) for (node=0; node < NR_CPUS; node++) { /* PMM7: bus=0, dev=0x18+node, function=0x3, register=0x87. */ - pmm7 = pci_read_byte(0, 0x18+node, 0x3, 0x87); + pmm7 = pci_conf_read8(0, 0x18+node, 0x3, 0x87); /* Invalid read means we've updated every Northbridge. 
*/ if (pmm7 == 0xFF) break; pmm7 &= 0xFC; /* clear pmm7[1:0] */ - pci_write_byte(0, 0x18+node, 0x3, 0x87, pmm7); + pci_conf_write8(0, 0x18+node, 0x3, 0x87, pmm7); printk ("AMD: Disabling C1 Clock Ramping Node #%x\n", node); } } diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index c56db37b37..4418c51ff9 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -46,6 +46,7 @@ #include <asm/debugreg.h> #include <asm/msr.h> #include <asm/nmi.h> +#include <xen/numa.h> #include <xen/iommu.h> #ifdef CONFIG_COMPAT #include <compat/vcpu.h> @@ -171,7 +172,7 @@ int setup_arg_xlat_area(struct vcpu *v, l4_pgentry_t *l4tab) if ( !d->arch.mm_arg_xlat_l3 ) { - pg = alloc_domheap_page(NULL); + pg = alloc_domheap_page(NULL, 0); if ( !pg ) return -ENOMEM; d->arch.mm_arg_xlat_l3 = page_to_virt(pg); @@ -189,7 +190,7 @@ int setup_arg_xlat_area(struct vcpu *v, l4_pgentry_t *l4tab) if ( !l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]) ) { - pg = alloc_domheap_page(NULL); + pg = alloc_domheap_page(NULL, 0); if ( !pg ) return -ENOMEM; clear_page(page_to_virt(pg)); @@ -198,7 +199,7 @@ int setup_arg_xlat_area(struct vcpu *v, l4_pgentry_t *l4tab) l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]); if ( !l2e_get_intpte(l2tab[l2_table_offset(va)]) ) { - pg = alloc_domheap_page(NULL); + pg = alloc_domheap_page(NULL, 0); if ( !pg ) return -ENOMEM; clear_page(page_to_virt(pg)); @@ -206,7 +207,7 @@ int setup_arg_xlat_area(struct vcpu *v, l4_pgentry_t *l4tab) } l1tab = l2e_to_l1e(l2tab[l2_table_offset(va)]); BUG_ON(l1e_get_intpte(l1tab[l1_table_offset(va)])); - pg = alloc_domheap_page(NULL); + pg = alloc_domheap_page(NULL, 0); if ( !pg ) return -ENOMEM; l1tab[l1_table_offset(va)] = l1e_from_page(pg, PAGE_HYPERVISOR); @@ -252,7 +253,7 @@ static void release_arg_xlat_area(struct domain *d) static int setup_compat_l4(struct vcpu *v) { - struct page_info *pg = alloc_domheap_page(NULL); + struct page_info *pg = alloc_domheap_page(NULL, 0); l4_pgentry_t *l4tab; int rc; @@ -477,7 +478,8 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags) #else /* __x86_64__ */ - if ( (pg = alloc_domheap_page(NULL)) == NULL ) + pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d))); + if ( pg == NULL ) goto fail; d->arch.mm_perdomain_l2 = page_to_virt(pg); clear_page(d->arch.mm_perdomain_l2); @@ -486,7 +488,8 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags) l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i, __PAGE_HYPERVISOR); - if ( (pg = alloc_domheap_page(NULL)) == NULL ) + pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d))); + if ( pg == NULL ) goto fail; d->arch.mm_perdomain_l3 = page_to_virt(pg); clear_page(d->arch.mm_perdomain_l3); @@ -500,13 +503,15 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags) HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START; #endif - paging_domain_init(d); + if ( (rc = paging_domain_init(d)) != 0 ) + goto fail; paging_initialised = 1; if ( !is_idle_domain(d) ) { d->arch.ioport_caps = rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex); + rc = -ENOMEM; if ( d->arch.ioport_caps == NULL ) goto fail; @@ -946,9 +951,9 @@ arch_do_vcpu_op( if ( copy_from_guest(&info, arg, 1) ) break; - LOCK_BIGLOCK(d); + domain_lock(d); rc = map_vcpu_info(v, info.mfn, info.offset); - UNLOCK_BIGLOCK(d); + domain_unlock(d); break; } diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c index dc8ee52f07..56106bae2f 100644 --- a/xen/arch/x86/domain_build.c +++ b/xen/arch/x86/domain_build.c 
@@ -630,7 +630,7 @@ int __init construct_dom0( } else { - page = alloc_domheap_page(NULL); + page = alloc_domheap_page(NULL, 0); if ( !page ) panic("Not enough RAM for domain 0 PML4.\n"); l4start = l4tab = page_to_virt(page); @@ -957,6 +957,8 @@ int __init construct_dom0( rc |= ioports_deny_access(dom0, 0x40, 0x43); /* PIT Channel 2 / PC Speaker Control. */ rc |= ioports_deny_access(dom0, 0x61, 0x61); + /* PCI configuration spaces. */ + rc |= ioports_deny_access(dom0, 0xcf8, 0xcff); /* Command-line I/O ranges. */ process_dom0_ioports_disable(); diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c index 57065f7625..d7bf9f3f2f 100644 --- a/xen/arch/x86/hvm/emulate.c +++ b/xen/arch/x86/hvm/emulate.c @@ -20,12 +20,13 @@ #include <asm/hvm/support.h> static int hvmemul_do_io( - int is_mmio, paddr_t addr, unsigned long count, int size, + int is_mmio, paddr_t addr, unsigned long *reps, int size, paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val) { struct vcpu *curr = current; vcpu_iodata_t *vio = get_ioreq(curr); ioreq_t *p = &vio->vp_ioreq; + int rc; switch ( curr->arch.hvm_vcpu.io_state ) { @@ -41,52 +42,72 @@ static int hvmemul_do_io( return X86EMUL_UNHANDLEABLE; } - curr->arch.hvm_vcpu.io_state = - (val == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion; - if ( p->state != STATE_IOREQ_NONE ) + { gdprintk(XENLOG_WARNING, "WARNING: io already pending (%d)?\n", p->state); + return X86EMUL_UNHANDLEABLE; + } + + curr->arch.hvm_vcpu.io_state = + (val == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion; p->dir = dir; p->data_is_ptr = value_is_ptr; p->type = is_mmio ? IOREQ_TYPE_COPY : IOREQ_TYPE_PIO; p->size = size; p->addr = addr; - p->count = count; + p->count = *reps; p->df = df; p->data = value; p->io_count++; - if ( is_mmio - ? (hvm_mmio_intercept(p) || hvm_buffered_io_intercept(p)) - : hvm_portio_intercept(p) ) + if ( is_mmio ) { + rc = hvm_mmio_intercept(p); + if ( rc == X86EMUL_UNHANDLEABLE ) + rc = hvm_buffered_io_intercept(p); + } + else + { + rc = hvm_portio_intercept(p); + } + + switch ( rc ) + { + case X86EMUL_OKAY: + case X86EMUL_RETRY: + *reps = p->count; p->state = STATE_IORESP_READY; hvm_io_assist(); if ( val != NULL ) *val = curr->arch.hvm_vcpu.io_data; curr->arch.hvm_vcpu.io_state = HVMIO_none; - return X86EMUL_OKAY; + break; + case X86EMUL_UNHANDLEABLE: + hvm_send_assist_req(curr); + rc = (val != NULL) ? X86EMUL_RETRY : X86EMUL_OKAY; + break; + default: + BUG(); } - hvm_send_assist_req(curr); - return (val != NULL) ? 
X86EMUL_RETRY : X86EMUL_OKAY; + return rc; } static int hvmemul_do_pio( - unsigned long port, unsigned long count, int size, + unsigned long port, unsigned long *reps, int size, paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val) { - return hvmemul_do_io(0, port, count, size, value, + return hvmemul_do_io(0, port, reps, size, value, dir, df, value_is_ptr, val); } static int hvmemul_do_mmio( - paddr_t gpa, unsigned long count, int size, + paddr_t gpa, unsigned long *reps, int size, paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val) { - return hvmemul_do_io(1, gpa, count, size, value, + return hvmemul_do_io(1, gpa, reps, size, value, dir, df, value_is_ptr, val); } @@ -206,7 +227,7 @@ static int __hvmemul_read( struct hvm_emulate_ctxt *hvmemul_ctxt) { struct vcpu *curr = current; - unsigned long addr; + unsigned long addr, reps = 1; uint32_t pfec = PFEC_page_present; paddr_t gpa; int rc; @@ -226,7 +247,8 @@ static int __hvmemul_read( return X86EMUL_UNHANDLEABLE; gpa = (((paddr_t)curr->arch.hvm_vcpu.mmio_gpfn << PAGE_SHIFT) | off); if ( (off + bytes) <= PAGE_SIZE ) - return hvmemul_do_mmio(gpa, 1, bytes, 0, IOREQ_READ, 0, 0, val); + return hvmemul_do_mmio(gpa, &reps, bytes, 0, + IOREQ_READ, 0, 0, val); } if ( (seg != x86_seg_none) && @@ -251,7 +273,7 @@ static int __hvmemul_read( if ( rc != X86EMUL_OKAY ) return rc; - return hvmemul_do_mmio(gpa, 1, bytes, 0, IOREQ_READ, 0, 0, val); + return hvmemul_do_mmio(gpa, &reps, bytes, 0, IOREQ_READ, 0, 0, val); } return X86EMUL_OKAY; @@ -302,7 +324,7 @@ static int hvmemul_write( struct hvm_emulate_ctxt *hvmemul_ctxt = container_of(ctxt, struct hvm_emulate_ctxt, ctxt); struct vcpu *curr = current; - unsigned long addr; + unsigned long addr, reps = 1; uint32_t pfec = PFEC_page_present | PFEC_write_access; paddr_t gpa; int rc; @@ -318,8 +340,8 @@ static int hvmemul_write( unsigned int off = addr & (PAGE_SIZE - 1); gpa = (((paddr_t)curr->arch.hvm_vcpu.mmio_gpfn << PAGE_SHIFT) | off); if ( (off + bytes) <= PAGE_SIZE ) - return hvmemul_do_mmio(gpa, 1, bytes, val, IOREQ_WRITE, - 0, 0, NULL); + return hvmemul_do_mmio(gpa, &reps, bytes, val, + IOREQ_WRITE, 0, 0, NULL); } if ( (seg != x86_seg_none) && @@ -339,7 +361,8 @@ static int hvmemul_write( if ( rc != X86EMUL_OKAY ) return rc; - return hvmemul_do_mmio(gpa, 1, bytes, val, IOREQ_WRITE, 0, 0, NULL); + return hvmemul_do_mmio(gpa, &reps, bytes, val, + IOREQ_WRITE, 0, 0, NULL); } return X86EMUL_OKAY; @@ -386,7 +409,7 @@ static int hvmemul_rep_ins( if ( rc != X86EMUL_OKAY ) return rc; - return hvmemul_do_pio(src_port, *reps, bytes_per_rep, gpa, IOREQ_READ, + return hvmemul_do_pio(src_port, reps, bytes_per_rep, gpa, IOREQ_READ, !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL); } @@ -419,7 +442,7 @@ static int hvmemul_rep_outs( if ( rc != X86EMUL_OKAY ) return rc; - return hvmemul_do_pio(dst_port, *reps, bytes_per_rep, gpa, IOREQ_WRITE, + return hvmemul_do_pio(dst_port, reps, bytes_per_rep, gpa, IOREQ_WRITE, !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL); } @@ -469,14 +492,14 @@ static int hvmemul_rep_movs( (void)gfn_to_mfn_current(sgpa >> PAGE_SHIFT, &p2mt); if ( !p2m_is_ram(p2mt) ) return hvmemul_do_mmio( - sgpa, *reps, bytes_per_rep, dgpa, IOREQ_READ, + sgpa, reps, bytes_per_rep, dgpa, IOREQ_READ, !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL); (void)gfn_to_mfn_current(dgpa >> PAGE_SHIFT, &p2mt); if ( p2m_is_ram(p2mt) ) return X86EMUL_UNHANDLEABLE; return hvmemul_do_mmio( - dgpa, *reps, bytes_per_rep, sgpa, IOREQ_WRITE, + dgpa, reps, bytes_per_rep, sgpa, IOREQ_WRITE, 
!!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL); } @@ -513,7 +536,8 @@ static int hvmemul_read_io( unsigned long *val, struct x86_emulate_ctxt *ctxt) { - return hvmemul_do_pio(port, 1, bytes, 0, IOREQ_READ, 0, 0, val); + unsigned long reps = 1; + return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_READ, 0, 0, val); } static int hvmemul_write_io( @@ -522,7 +546,8 @@ static int hvmemul_write_io( unsigned long val, struct x86_emulate_ctxt *ctxt) { - return hvmemul_do_pio(port, 1, bytes, val, IOREQ_WRITE, 0, 0, NULL); + unsigned long reps = 1; + return hvmemul_do_pio(port, &reps, bytes, val, IOREQ_WRITE, 0, 0, NULL); } static int hvmemul_read_cr( diff --git a/xen/arch/x86/hvm/hpet.c b/xen/arch/x86/hvm/hpet.c index 49ca998d37..03dfbf3bd8 100644 --- a/xen/arch/x86/hvm/hpet.c +++ b/xen/arch/x86/hvm/hpet.c @@ -150,8 +150,9 @@ static inline uint64_t hpet_read_maincounter(HPETState *h) return h->hpet.mc64; } -static unsigned long hpet_read( - struct vcpu *v, unsigned long addr, unsigned long length) +static int hpet_read( + struct vcpu *v, unsigned long addr, unsigned long length, + unsigned long *pval) { HPETState *h = &v->domain->arch.hvm_domain.pl_time.vhpet; unsigned long result; @@ -160,7 +161,10 @@ static unsigned long hpet_read( addr &= HPET_MMAP_SIZE-1; if ( hpet_check_access_length(addr, length) != 0 ) - return ~0UL; + { + result = ~0ul; + goto out; + } spin_lock(&h->lock); @@ -174,7 +178,9 @@ static unsigned long hpet_read( spin_unlock(&h->lock); - return result; + out: + *pval = result; + return X86EMUL_OKAY; } static void hpet_stop_timer(HPETState *h, unsigned int tn) @@ -234,7 +240,7 @@ static inline uint64_t hpet_fixup_reg( return new; } -static void hpet_write( +static int hpet_write( struct vcpu *v, unsigned long addr, unsigned long length, unsigned long val) { @@ -245,7 +251,7 @@ static void hpet_write( addr &= HPET_MMAP_SIZE-1; if ( hpet_check_access_length(addr, length) != 0 ) - return; + goto out; spin_lock(&h->lock); @@ -349,6 +355,9 @@ static void hpet_write( } spin_unlock(&h->lock); + + out: + return X86EMUL_OKAY; } static int hpet_range(struct vcpu *v, unsigned long addr) diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index 961bfbf354..97a1aaa17c 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -181,7 +181,8 @@ void hvm_do_resume(struct vcpu *v) break; default: gdprintk(XENLOG_ERR, "Weird HVM iorequest state %d.\n", p->state); - domain_crash_synchronous(); + domain_crash(v->domain); + return; /* bail */ } } } @@ -276,7 +277,7 @@ static int hvm_print_line( } spin_unlock(&hd->pbuf_lock); - return 1; + return X86EMUL_OKAY; } int hvm_domain_initialise(struct domain *d) @@ -478,11 +479,11 @@ static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h) vc = &v->arch.guest_context; /* Need to init this vcpu before loading its contents */ - LOCK_BIGLOCK(d); + domain_lock(d); if ( !v->is_initialised ) if ( (rc = boot_vcpu(d, vcpuid, vc)) != 0 ) return rc; - UNLOCK_BIGLOCK(d); + domain_unlock(d); if ( hvm_load_entry(CPU, h, &ctxt) != 0 ) return -EINVAL; @@ -687,47 +688,26 @@ void hvm_vcpu_destroy(struct vcpu *v) /*free_xen_event_channel(v, v->arch.hvm_vcpu.xen_port);*/ } - -void hvm_vcpu_reset(struct vcpu *v) -{ - vcpu_pause(v); - - vlapic_reset(vcpu_vlapic(v)); - - hvm_funcs.vcpu_initialise(v); - - set_bit(_VPF_down, &v->pause_flags); - clear_bit(_VPF_blocked, &v->pause_flags); - v->fpu_initialised = 0; - v->fpu_dirtied = 0; - v->is_initialised = 0; - - vcpu_unpause(v); -} - -static void hvm_vcpu_down(void) +void hvm_vcpu_down(struct vcpu 
*v) { - struct vcpu *v = current; struct domain *d = v->domain; int online_count = 0; - gdprintk(XENLOG_INFO, "VCPU%d: going offline.\n", v->vcpu_id); - /* Doesn't halt us immediately, but we'll never return to guest context. */ set_bit(_VPF_down, &v->pause_flags); vcpu_sleep_nosync(v); /* Any other VCPUs online? ... */ - LOCK_BIGLOCK(d); + domain_lock(d); for_each_vcpu ( d, v ) if ( !test_bit(_VPF_down, &v->pause_flags) ) online_count++; - UNLOCK_BIGLOCK(d); + domain_unlock(d); /* ... Shut down the domain if not. */ if ( online_count == 0 ) { - gdprintk(XENLOG_INFO, "all CPUs offline -- powering off.\n"); + gdprintk(XENLOG_INFO, "All CPUs offline -- powering off.\n"); domain_shutdown(d, SHUTDOWN_poweroff); } } @@ -742,9 +722,10 @@ void hvm_send_assist_req(struct vcpu *v) p = &get_ioreq(v)->vp_ioreq; if ( unlikely(p->state != STATE_IOREQ_NONE) ) { - /* This indicates a bug in the device model. Crash the domain. */ + /* This indicates a bug in the device model. Crash the domain. */ gdprintk(XENLOG_ERR, "Device model set bad IO state %d.\n", p->state); - domain_crash_synchronous(); + domain_crash(v->domain); + return; } prepare_wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port); @@ -765,7 +746,7 @@ void hvm_hlt(unsigned long rflags) * out of this. */ if ( unlikely(!(rflags & X86_EFLAGS_IF)) ) - return hvm_vcpu_down(); + return hvm_vcpu_down(current); do_sched_op_compat(SCHEDOP_block, 0); } @@ -1894,79 +1875,6 @@ void hvm_hypercall_page_initialise(struct domain *d, hvm_funcs.init_hypercall_page(d, hypercall_page); } -int hvm_bringup_ap(int vcpuid, int trampoline_vector) -{ - struct domain *d = current->domain; - struct vcpu *v; - struct vcpu_guest_context *ctxt; - struct segment_register reg; - - ASSERT(is_hvm_domain(d)); - - if ( (v = d->vcpu[vcpuid]) == NULL ) - return -ENOENT; - - v->fpu_initialised = 0; - v->arch.flags |= TF_kernel_mode; - v->is_initialised = 1; - - ctxt = &v->arch.guest_context; - memset(ctxt, 0, sizeof(*ctxt)); - ctxt->flags = VGCF_online; - ctxt->user_regs.eflags = 2; - - v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_ET; - hvm_update_guest_cr(v, 0); - - v->arch.hvm_vcpu.guest_cr[2] = 0; - hvm_update_guest_cr(v, 2); - - v->arch.hvm_vcpu.guest_cr[3] = 0; - hvm_update_guest_cr(v, 3); - - v->arch.hvm_vcpu.guest_cr[4] = 0; - hvm_update_guest_cr(v, 4); - - v->arch.hvm_vcpu.guest_efer = 0; - hvm_update_guest_efer(v); - - reg.sel = trampoline_vector << 8; - reg.base = (uint32_t)reg.sel << 4; - reg.limit = 0xffff; - reg.attr.bytes = 0x89b; - hvm_set_segment_register(v, x86_seg_cs, &reg); - - reg.sel = reg.base = 0; - reg.limit = 0xffff; - reg.attr.bytes = 0x893; - hvm_set_segment_register(v, x86_seg_ds, &reg); - hvm_set_segment_register(v, x86_seg_es, &reg); - hvm_set_segment_register(v, x86_seg_fs, &reg); - hvm_set_segment_register(v, x86_seg_gs, &reg); - hvm_set_segment_register(v, x86_seg_ss, &reg); - - reg.attr.bytes = 0x82; /* LDT */ - hvm_set_segment_register(v, x86_seg_ldtr, &reg); - - reg.attr.bytes = 0x8b; /* 32-bit TSS (busy) */ - hvm_set_segment_register(v, x86_seg_tr, &reg); - - reg.attr.bytes = 0; - hvm_set_segment_register(v, x86_seg_gdtr, &reg); - hvm_set_segment_register(v, x86_seg_idtr, &reg); - - /* Sync AP's TSC with BSP's. 
*/ - v->arch.hvm_vcpu.cache_tsc_offset = - v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset; - hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset); - - if ( test_and_clear_bit(_VPF_down, &v->pause_flags) ) - vcpu_wake(v); - - gdprintk(XENLOG_INFO, "AP %d bringup succeeded.\n", vcpuid); - return 0; -} - static int hvmop_set_pci_intx_level( XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t) uop) { @@ -2185,13 +2093,16 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg) if ( op == HVMOP_set_param ) { + rc = 0; + switch ( a.index ) { case HVM_PARAM_IOREQ_PFN: iorp = &d->arch.hvm_domain.ioreq; - rc = hvm_set_ioreq_page(d, iorp, a.value); + if ( (rc = hvm_set_ioreq_page(d, iorp, a.value)) != 0 ) + break; spin_lock(&iorp->lock); - if ( (rc == 0) && (iorp->va != NULL) ) + if ( iorp->va != NULL ) /* Initialise evtchn port info if VCPUs already created. */ for_each_vcpu ( d, v ) get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port; @@ -2206,13 +2117,72 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg) hvm_latch_shinfo_size(d); break; case HVM_PARAM_TIMER_MODE: - rc = -EINVAL; if ( a.value > HVMPTM_one_missed_tick_pending ) - goto param_fail; + rc = -EINVAL; + break; + case HVM_PARAM_IDENT_PT: + rc = -EPERM; + if ( !IS_PRIV(current->domain) ) + break; + + rc = -EINVAL; + if ( d->arch.hvm_domain.params[a.index] != 0 ) + break; + + rc = 0; + if ( !paging_mode_hap(d) ) + break; + + domain_pause(d); + + /* + * Update GUEST_CR3 in each VMCS to point at identity map. + * All foreign updates to guest state must synchronise on + * the domctl_lock. + */ + spin_lock(&domctl_lock); + d->arch.hvm_domain.params[a.index] = a.value; + for_each_vcpu ( d, v ) + paging_update_cr3(v); + spin_unlock(&domctl_lock); + + domain_unpause(d); + break; + case HVM_PARAM_DM_DOMAIN: + /* Privileged domains only, as we must domain_pause(d). 
*/ + rc = -EPERM; + if ( !IS_PRIV_FOR(current->domain, d) ) + break; + + if ( a.value == DOMID_SELF ) + a.value = current->domain->domain_id; + + rc = 0; + domain_pause(d); /* safe to change per-vcpu xen_port */ + iorp = &d->arch.hvm_domain.ioreq; + for_each_vcpu ( d, v ) + { + int old_port, new_port; + new_port = alloc_unbound_xen_event_channel(v, a.value); + if ( new_port < 0 ) + { + rc = new_port; + break; + } + /* xchg() ensures that only we free_xen_event_channel() */ + old_port = xchg(&v->arch.hvm_vcpu.xen_port, new_port); + free_xen_event_channel(v, old_port); + spin_lock(&iorp->lock); + if ( iorp->va != NULL ) + get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port; + spin_unlock(&iorp->lock); + } + domain_unpause(d); break; } - d->arch.hvm_domain.params[a.index] = a.value; - rc = 0; + + if ( rc == 0 ) + d->arch.hvm_domain.params[a.index] = a.value; } else { diff --git a/xen/arch/x86/hvm/i8254.c b/xen/arch/x86/hvm/i8254.c index 01c78f7799..493b7317b9 100644 --- a/xen/arch/x86/hvm/i8254.c +++ b/xen/arch/x86/hvm/i8254.c @@ -401,50 +401,6 @@ void pit_stop_channel0_irq(PITState *pit) spin_unlock(&pit->lock); } -#ifdef HVM_DEBUG_SUSPEND -static void pit_info(PITState *pit) -{ - struct hvm_hw_pit_channel *s; - struct periodic_time *pt; - int i; - - for ( i = 0; i < 3; i++ ) - { - printk("*****pit channel %d's state:*****\n", i); - s = &pit->hw.channels[i]; - printk("pit 0x%x.\n", s->count); - printk("pit 0x%x.\n", s->latched_count); - printk("pit 0x%x.\n", s->count_latched); - printk("pit 0x%x.\n", s->status_latched); - printk("pit 0x%x.\n", s->status); - printk("pit 0x%x.\n", s->read_state); - printk("pit 0x%x.\n", s->write_state); - printk("pit 0x%x.\n", s->write_latch); - printk("pit 0x%x.\n", s->rw_mode); - printk("pit 0x%x.\n", s->mode); - printk("pit 0x%x.\n", s->bcd); - printk("pit 0x%x.\n", s->gate); - printk("pit %"PRId64"\n", pit->count_load_time[i]); - - } - - pt = &pit->pt0; - printk("pit channel 0 periodic timer:\n", i); - printk("pt %d.\n", pt->enabled); - printk("pt %d.\n", pt->one_shot); - printk("pt %d.\n", pt->irq); - printk("pt %d.\n", pt->first_injected); - printk("pt %d.\n", pt->pending_intr_nr); - printk("pt %d.\n", pt->period); - printk("pt %"PRId64"\n", pt->period_cycles); - printk("pt %"PRId64"\n", pt->last_plt_gtime); -} -#else -static void pit_info(PITState *pit) -{ -} -#endif - static int pit_save(struct domain *d, hvm_domain_context_t *h) { PITState *pit = domain_vpit(d); @@ -452,9 +408,6 @@ static int pit_save(struct domain *d, hvm_domain_context_t *h) spin_lock(&pit->lock); - pit_info(pit); - - /* Save the PIT hardware state */ rc = hvm_save_entry(PIT, 0, h, &pit->hw); spin_unlock(&pit->lock); @@ -469,22 +422,21 @@ static int pit_load(struct domain *d, hvm_domain_context_t *h) spin_lock(&pit->lock); - /* Restore the PIT hardware state */ if ( hvm_load_entry(PIT, h, &pit->hw) ) { spin_unlock(&pit->lock); return 1; } - /* Recreate platform timers from hardware state. There will be some + /* + * Recreate platform timers from hardware state. There will be some * time jitter here, but the wall-clock will have jumped massively, so - * we hope the guest can handle it. */ + * we hope the guest can handle it. 
+ */ pit->pt0.last_plt_gtime = hvm_get_guest_time(d->vcpu[0]); for ( i = 0; i < 3; i++ ) pit_load_count(pit, i, pit->hw.channels[i].count); - pit_info(pit); - spin_unlock(&pit->lock); return 0; @@ -535,7 +487,7 @@ static int handle_pit_io( if ( bytes != 1 ) { gdprintk(XENLOG_WARNING, "PIT bad access\n"); - return 1; + return X86EMUL_OKAY; } if ( dir == IOREQ_WRITE ) @@ -550,7 +502,7 @@ static int handle_pit_io( gdprintk(XENLOG_WARNING, "PIT: read A1:A0=3!\n"); } - return 1; + return X86EMUL_OKAY; } static void speaker_ioport_write( @@ -574,11 +526,7 @@ static int handle_speaker_io( { struct PITState *vpit = vcpu_vpit(current); - if ( bytes != 1 ) - { - gdprintk(XENLOG_WARNING, "PIT_SPEAKER bad access\n"); - return 1; - } + BUG_ON(bytes != 1); spin_lock(&vpit->lock); @@ -589,7 +537,7 @@ static int handle_speaker_io( spin_unlock(&vpit->lock); - return 1; + return X86EMUL_OKAY; } int pv_pit_handler(int port, int data, int write) diff --git a/xen/arch/x86/hvm/intercept.c b/xen/arch/x86/hvm/intercept.c index 04c5da7b6f..0e110e00dc 100644 --- a/xen/arch/x86/hvm/intercept.c +++ b/xen/arch/x86/hvm/intercept.c @@ -45,53 +45,63 @@ static struct hvm_mmio_handler *hvm_mmio_handlers[HVM_MMIO_HANDLER_NR] = &vioapic_mmio_handler }; -static inline void hvm_mmio_access(struct vcpu *v, - ioreq_t *p, - hvm_mmio_read_t read_handler, - hvm_mmio_write_t write_handler) +static int hvm_mmio_access(struct vcpu *v, + ioreq_t *p, + hvm_mmio_read_t read_handler, + hvm_mmio_write_t write_handler) { unsigned long data; + int rc = X86EMUL_OKAY, i, sign = p->df ? -1 : 1; - switch ( p->type ) + if ( !p->data_is_ptr ) { - case IOREQ_TYPE_COPY: - if ( !p->data_is_ptr ) { - if ( p->dir == IOREQ_READ ) - p->data = read_handler(v, p->addr, p->size); - else /* p->dir == IOREQ_WRITE */ - write_handler(v, p->addr, p->size, p->data); - } else { /* p->data_is_ptr */ - int i, sign = (p->df) ? 
-1 : 1; - - if ( p->dir == IOREQ_READ ) { - for ( i = 0; i < p->count; i++ ) { - data = read_handler(v, - p->addr + (sign * i * p->size), - p->size); - (void)hvm_copy_to_guest_phys( - p->data + (sign * i * p->size), - &data, - p->size); - } - } else {/* p->dir == IOREQ_WRITE */ - for ( i = 0; i < p->count; i++ ) { - (void)hvm_copy_from_guest_phys( - &data, - p->data + (sign * i * p->size), - p->size); - write_handler(v, - p->addr + (sign * i * p->size), - p->size, data); - } - } + if ( p->dir == IOREQ_READ ) + { + rc = read_handler(v, p->addr, p->size, &data); + p->data = data; } - break; + else /* p->dir == IOREQ_WRITE */ + rc = write_handler(v, p->addr, p->size, p->data); + return rc; + } - default: - printk("hvm_mmio_access: error ioreq type %x\n", p->type); - domain_crash_synchronous(); - break; + if ( p->dir == IOREQ_READ ) + { + for ( i = 0; i < p->count; i++ ) + { + rc = read_handler( + v, + p->addr + (sign * i * p->size), + p->size, &data); + if ( rc != X86EMUL_OKAY ) + break; + (void)hvm_copy_to_guest_phys( + p->data + (sign * i * p->size), + &data, + p->size); + } + } + else + { + for ( i = 0; i < p->count; i++ ) + { + (void)hvm_copy_from_guest_phys( + &data, + p->data + (sign * i * p->size), + p->size); + rc = write_handler( + v, + p->addr + (sign * i * p->size), + p->size, data); + if ( rc != X86EMUL_OKAY ) + break; + } } + + if ( (p->count = i) != 0 ) + rc = X86EMUL_OKAY; + + return rc; } int hvm_mmio_intercept(ioreq_t *p) @@ -100,60 +110,62 @@ int hvm_mmio_intercept(ioreq_t *p) int i; for ( i = 0; i < HVM_MMIO_HANDLER_NR; i++ ) - { if ( hvm_mmio_handlers[i]->check_handler(v, p->addr) ) - { - hvm_mmio_access(v, p, - hvm_mmio_handlers[i]->read_handler, - hvm_mmio_handlers[i]->write_handler); - return 1; - } - } + return hvm_mmio_access( + v, p, + hvm_mmio_handlers[i]->read_handler, + hvm_mmio_handlers[i]->write_handler); - return 0; + return X86EMUL_UNHANDLEABLE; } static int process_portio_intercept(portio_action_t action, ioreq_t *p) { - int rc = 1, i, sign = p->df ? -1 : 1; + int rc = X86EMUL_OKAY, i, sign = p->df ? 
-1 : 1; uint32_t data; - if ( p->dir == IOREQ_READ ) + if ( !p->data_is_ptr ) { - if ( !p->data_is_ptr ) + if ( p->dir == IOREQ_READ ) { rc = action(IOREQ_READ, p->addr, p->size, &data); p->data = data; } else { - for ( i = 0; i < p->count; i++ ) - { - rc = action(IOREQ_READ, p->addr, p->size, &data); - (void)hvm_copy_to_guest_phys(p->data + sign*i*p->size, - &data, p->size); - } + data = p->data; + rc = action(IOREQ_WRITE, p->addr, p->size, &data); } + return rc; } - else /* p->dir == IOREQ_WRITE */ + + if ( p->dir == IOREQ_READ ) { - if ( !p->data_is_ptr ) + for ( i = 0; i < p->count; i++ ) { - data = p->data; - rc = action(IOREQ_WRITE, p->addr, p->size, &data); + rc = action(IOREQ_READ, p->addr, p->size, &data); + if ( rc != X86EMUL_OKAY ) + break; + (void)hvm_copy_to_guest_phys(p->data + sign*i*p->size, + &data, p->size); } - else + } + else /* p->dir == IOREQ_WRITE */ + { + for ( i = 0; i < p->count; i++ ) { - for ( i = 0; i < p->count; i++ ) - { - data = 0; - (void)hvm_copy_from_guest_phys(&data, p->data + sign*i*p->size, - p->size); - rc = action(IOREQ_WRITE, p->addr, p->size, &data); - } + data = 0; + (void)hvm_copy_from_guest_phys(&data, p->data + sign*i*p->size, + p->size); + rc = action(IOREQ_WRITE, p->addr, p->size, &data); + if ( rc != X86EMUL_OKAY ) + break; } } + if ( (p->count = i) != 0 ) + rc = X86EMUL_OKAY; + return rc; } @@ -170,7 +182,7 @@ int hvm_io_intercept(ioreq_t *p, int type) unsigned long addr, size; if ( (type == HVM_PORTIO) && (dpci_ioport_intercept(p)) ) - return 1; + return X86EMUL_OKAY; for ( i = 0; i < handler->num_slot; i++ ) { @@ -188,10 +200,10 @@ int hvm_io_intercept(ioreq_t *p, int type) } } - return 0; + return X86EMUL_UNHANDLEABLE; } -int register_io_handler( +void register_io_handler( struct domain *d, unsigned long addr, unsigned long size, void *action, int type) { @@ -207,9 +219,8 @@ int register_io_handler( else handler->hdl_list[num].action.mmio = action; handler->num_slot++; - - return 1; } + /* * Local variables: * mode: C diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c index ac1e62782a..6a8e0885c0 100644 --- a/xen/arch/x86/hvm/io.c +++ b/xen/arch/x86/hvm/io.c @@ -148,20 +148,19 @@ void send_timeoffset_req(unsigned long timeoff) void send_invalidate_req(void) { struct vcpu *v = current; - vcpu_iodata_t *vio; + vcpu_iodata_t *vio = get_ioreq(v); ioreq_t *p; - vio = get_ioreq(v); - if ( vio == NULL ) - { - printk("bad shared page: %lx\n", (unsigned long) vio); - domain_crash_synchronous(); - } + BUG_ON(vio == NULL); p = &vio->vp_ioreq; if ( p->state != STATE_IOREQ_NONE ) - printk("WARNING: send invalidate req with something " - "already pending (%d)?\n", p->state); + { + gdprintk(XENLOG_ERR, "WARNING: send invalidate req with something " + "already pending (%d)?\n", p->state); + domain_crash(v->domain); + return; + } p->type = IOREQ_TYPE_INVALIDATE; p->size = 4; @@ -225,12 +224,6 @@ void hvm_io_assist(void) ioreq_t *p = &get_ioreq(curr)->vp_ioreq; enum hvm_io_state io_state; - if ( p->state != STATE_IORESP_READY ) - { - gdprintk(XENLOG_ERR, "Unexpected HVM iorequest state %d.\n", p->state); - domain_crash_synchronous(); - } - rmb(); /* see IORESP_READY /then/ read contents of ioreq */ p->state = STATE_IOREQ_NONE; @@ -253,74 +246,59 @@ void hvm_io_assist(void) void dpci_ioport_read(uint32_t mport, ioreq_t *p) { - uint64_t i; - uint64_t z_data; - uint64_t length = (p->count * p->size); + int i, sign = p->df ? 
-1 : 1; + uint32_t data = 0; - for ( i = 0; i < length; i += p->size ) + for ( i = 0; i < p->count; i++ ) { - z_data = ~0ULL; - switch ( p->size ) { case 1: - z_data = (uint64_t)inb(mport); + data = inb(mport); break; case 2: - z_data = (uint64_t)inw(mport); + data = inw(mport); break; case 4: - z_data = (uint64_t)inl(mport); + data = inl(mport); break; default: - gdprintk(XENLOG_ERR, "Error: unable to handle size: %" - PRId64 "\n", p->size); - return; + BUG(); } - p->data = z_data; - if ( p->data_is_ptr && - hvm_copy_to_guest_phys(p->data + i, (void *)&z_data, - (int)p->size) ) - { - gdprintk(XENLOG_ERR, "Error: couldn't copy to hvm phys\n"); - return; - } + if ( p->data_is_ptr ) + (void)hvm_copy_to_guest_phys( + p->data + (sign * i * p->size), &data, p->size); + else + p->data = data; } } void dpci_ioport_write(uint32_t mport, ioreq_t *p) { - uint64_t i; - uint64_t z_data = 0; - uint64_t length = (p->count * p->size); + int i, sign = p->df ? -1 : 1; + uint32_t data; - for ( i = 0; i < length; i += p->size ) + for ( i = 0; i < p->count; i++ ) { - z_data = p->data; - if ( p->data_is_ptr && - hvm_copy_from_guest_phys((void *)&z_data, - p->data + i, (int)p->size) ) - { - gdprintk(XENLOG_ERR, "Error: couldn't copy from hvm phys\n"); - return; - } + data = p->data; + if ( p->data_is_ptr ) + (void)hvm_copy_from_guest_phys( + &data, p->data + (sign * i * p->size), p->size); switch ( p->size ) { case 1: - outb((uint8_t) z_data, mport); + outb(data, mport); break; case 2: - outw((uint16_t) z_data, mport); + outw(data, mport); break; case 4: - outl((uint32_t) z_data, mport); + outl(data, mport); break; default: - gdprintk(XENLOG_ERR, "Error: unable to handle size: %" - PRId64 "\n", p->size); - break; + BUG(); } } } diff --git a/xen/arch/x86/hvm/mtrr.c b/xen/arch/x86/hvm/mtrr.c index 3bd0dc9d7c..4e50680022 100644 --- a/xen/arch/x86/hvm/mtrr.c +++ b/xen/arch/x86/hvm/mtrr.c @@ -266,7 +266,7 @@ static void setup_var_mtrrs(struct vcpu *v) { if ( e820_table[i].addr == 0x100000 ) { - size = e820_table[i].size + 0x100000 + PAGE_SIZE * 4; + size = e820_table[i].size + 0x100000 + PAGE_SIZE * 5; addr = 0; } else diff --git a/xen/arch/x86/hvm/pmtimer.c b/xen/arch/x86/hvm/pmtimer.c index 8d3fff8f44..4924a80687 100644 --- a/xen/arch/x86/hvm/pmtimer.c +++ b/xen/arch/x86/hvm/pmtimer.c @@ -169,7 +169,7 @@ static int handle_evt_io( spin_unlock(&s->lock); - return 1; + return X86EMUL_OKAY; } @@ -183,7 +183,7 @@ static int handle_pmt_io( if ( bytes != 4 ) { gdprintk(XENLOG_WARNING, "HVM_PMT bad access\n"); - return 1; + return X86EMUL_OKAY; } if ( dir == IOREQ_READ ) @@ -192,10 +192,10 @@ static int handle_pmt_io( spin_lock(&s->lock); pmt_update_time(s); *val = s->pm.tmr_val; spin_unlock(&s->lock); - return 1; + return X86EMUL_OKAY; } - return 0; + return X86EMUL_UNHANDLEABLE; } static int pmtimer_save(struct domain *d, hvm_domain_context_t *h) diff --git a/xen/arch/x86/hvm/rtc.c b/xen/arch/x86/hvm/rtc.c index b9e4b4a241..e196c72866 100644 --- a/xen/arch/x86/hvm/rtc.c +++ b/xen/arch/x86/hvm/rtc.c @@ -403,21 +403,21 @@ static int handle_rtc_io( if ( bytes != 1 ) { gdprintk(XENLOG_WARNING, "HVM_RTC bas access\n"); - return 1; + return X86EMUL_OKAY; } if ( dir == IOREQ_WRITE ) { if ( rtc_ioport_write(vrtc, port, (uint8_t)*val) ) - return 1; + return X86EMUL_OKAY; } else if ( vrtc->hw.cmos_index < RTC_CMOS_SIZE ) { *val = rtc_ioport_read(vrtc, port); - return 1; + return X86EMUL_OKAY; } - return 0; + return X86EMUL_UNHANDLEABLE; } void rtc_migrate_timers(struct vcpu *v) diff --git a/xen/arch/x86/hvm/stdvga.c b/xen/arch/x86/hvm/stdvga.c index 
56260c5c77..25b16bddac 100644 --- a/xen/arch/x86/hvm/stdvga.c +++ b/xen/arch/x86/hvm/stdvga.c @@ -32,6 +32,7 @@ #include <xen/sched.h> #include <xen/domain_page.h> #include <asm/hvm/support.h> +#include <xen/numa.h> #define PAT(x) (x) static const uint32_t mask16[16] = { @@ -166,19 +167,19 @@ static void stdvga_out(uint32_t port, uint32_t bytes, uint32_t val) } } -int stdvga_intercept_pio( +static int stdvga_intercept_pio( int dir, uint32_t port, uint32_t bytes, uint32_t *val) { struct hvm_hw_stdvga *s = &current->domain->arch.hvm_domain.stdvga; - if ( dir == IOREQ_READ ) - return 0; - - spin_lock(&s->lock); - stdvga_out(port, bytes, *val); - spin_unlock(&s->lock); + if ( dir == IOREQ_WRITE ) + { + spin_lock(&s->lock); + stdvga_out(port, bytes, *val); + spin_unlock(&s->lock); + } - return 0; /* propagate to external ioemu */ + return X86EMUL_UNHANDLEABLE; /* propagate to external ioemu */ } #define GET_PLANE(data, p) (((data) >> ((p) * 8)) & 0xff) @@ -458,7 +459,7 @@ static int mmio_move(struct hvm_hw_stdvga *s, ioreq_t *p) return 1; } -int stdvga_intercept_mmio(ioreq_t *p) +static int stdvga_intercept_mmio(ioreq_t *p) { struct domain *d = current->domain; struct hvm_hw_stdvga *s = &d->arch.hvm_domain.stdvga; @@ -467,7 +468,7 @@ int stdvga_intercept_mmio(ioreq_t *p) if ( p->size > 8 ) { gdprintk(XENLOG_WARNING, "invalid mmio size %d\n", (int)p->size); - return 0; + return X86EMUL_UNHANDLEABLE; } spin_lock(&s->lock); @@ -498,7 +499,7 @@ int stdvga_intercept_mmio(ioreq_t *p) spin_unlock(&s->lock); - return rc; + return rc ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE; } void stdvga_init(struct domain *d) @@ -513,7 +514,8 @@ void stdvga_init(struct domain *d) for ( i = 0; i != ARRAY_SIZE(s->vram_page); i++ ) { - if ( (pg = alloc_domheap_page(NULL)) == NULL ) + pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d))); + if ( pg == NULL ) break; s->vram_page[i] = pg; p = map_domain_page(page_to_mfn(pg)); diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index be166a868c..7c10127966 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -255,11 +255,6 @@ static int svm_vmcb_restore(struct vcpu *v, struct hvm_hw_cpu *c) svm_update_guest_cr(v, 2); svm_update_guest_cr(v, 4); -#ifdef HVM_DEBUG_SUSPEND - printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n", - __func__, c->cr3, c->cr0, c->cr4); -#endif - vmcb->sysenter_cs = c->sysenter_cs; vmcb->sysenter_esp = c->sysenter_esp; vmcb->sysenter_eip = c->sysenter_eip; @@ -472,7 +467,7 @@ static void svm_get_segment_register(struct vcpu *v, enum x86_segment seg, { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - ASSERT(v == current); + ASSERT((v == current) || !vcpu_runnable(v)); switch ( seg ) { diff --git a/xen/arch/x86/hvm/vioapic.c b/xen/arch/x86/hvm/vioapic.c index c01618c69f..8ebaa260cf 100644 --- a/xen/arch/x86/hvm/vioapic.c +++ b/xen/arch/x86/hvm/vioapic.c @@ -88,9 +88,9 @@ static unsigned long vioapic_read_indirect(struct hvm_hw_vioapic *vioapic, return result; } -static unsigned long vioapic_read(struct vcpu *v, - unsigned long addr, - unsigned long length) +static int vioapic_read( + struct vcpu *v, unsigned long addr, + unsigned long length, unsigned long *pval) { struct hvm_hw_vioapic *vioapic = domain_vioapic(v->domain); uint32_t result; @@ -114,11 +114,13 @@ static unsigned long vioapic_read(struct vcpu *v, break; } - return result; + *pval = result; + return X86EMUL_OKAY; } static void vioapic_write_redirent( - struct hvm_hw_vioapic *vioapic, unsigned int idx, int top_word, uint32_t val) + struct 
hvm_hw_vioapic *vioapic, unsigned int idx, + int top_word, uint32_t val) { struct domain *d = vioapic_domain(vioapic); struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; @@ -196,10 +198,9 @@ static void vioapic_write_indirect( } } -static void vioapic_write(struct vcpu *v, - unsigned long addr, - unsigned long length, - unsigned long val) +static int vioapic_write( + struct vcpu *v, unsigned long addr, + unsigned long length, unsigned long val) { struct hvm_hw_vioapic *vioapic = domain_vioapic(v->domain); @@ -224,6 +225,8 @@ static void vioapic_write(struct vcpu *v, default: break; } + + return X86EMUL_OKAY; } static int vioapic_range(struct vcpu *v, unsigned long addr) @@ -477,45 +480,16 @@ void vioapic_update_EOI(struct domain *d, int vector) spin_unlock(&d->arch.hvm_domain.irq_lock); } -#ifdef HVM_DEBUG_SUSPEND -static void ioapic_info(struct hvm_hw_vioapic *s) -{ - int i; - printk("*****ioapic state:*****\n"); - printk("ioapic 0x%x.\n", s->ioregsel); - printk("ioapic 0x%x.\n", s->id); - printk("ioapic 0x%lx.\n", s->base_address); - for (i = 0; i < VIOAPIC_NUM_PINS; i++) { - printk("ioapic redirtbl[%d]:0x%"PRIx64"\n", i, s->redirtbl[i].bits); - } - -} -#else -static void ioapic_info(struct hvm_hw_vioapic *s) -{ -} -#endif - - static int ioapic_save(struct domain *d, hvm_domain_context_t *h) { struct hvm_hw_vioapic *s = domain_vioapic(d); - ioapic_info(s); - - /* save io-apic state*/ - return ( hvm_save_entry(IOAPIC, 0, h, s) ); + return hvm_save_entry(IOAPIC, 0, h, s); } static int ioapic_load(struct domain *d, hvm_domain_context_t *h) { struct hvm_hw_vioapic *s = domain_vioapic(d); - - /* restore ioapic state */ - if ( hvm_load_entry(IOAPIC, h, s) != 0 ) - return -EINVAL; - - ioapic_info(s); - return 0; + return hvm_load_entry(IOAPIC, h, s); } HVM_REGISTER_SAVE_RESTORE(IOAPIC, ioapic_save, ioapic_load, 1, HVMSR_PER_DOM); diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c index bf53ba7a1a..9bfc2cc3d1 100644 --- a/xen/arch/x86/hvm/vlapic.c +++ b/xen/arch/x86/hvm/vlapic.c @@ -33,6 +33,7 @@ #include <xen/sched.h> #include <asm/current.h> #include <asm/hvm/vmx/vmx.h> +#include <xen/numa.h> #include <public/hvm/ioreq.h> #include <public/hvm/params.h> @@ -240,12 +241,145 @@ static int vlapic_match_dest(struct vcpu *v, struct vlapic *source, return result; } +static int vlapic_vcpu_pause_async(struct vcpu *v) +{ + vcpu_pause_nosync(v); + + if ( v->is_running ) + { + vcpu_unpause(v); + return 0; + } + + sync_vcpu_execstate(v); + return 1; +} + +static void vlapic_init_action(unsigned long _vcpu) +{ + struct vcpu *v = (struct vcpu *)_vcpu; + struct domain *d = v->domain; + + /* If the VCPU is not on its way down we have nothing to do. */ + if ( !test_bit(_VPF_down, &v->pause_flags) ) + return; + + if ( !vlapic_vcpu_pause_async(v) ) + { + tasklet_schedule(&vcpu_vlapic(v)->init_tasklet); + return; + } + + domain_lock(d); + + /* Paranoia makes us re-assert VPF_down under the domain lock. */ + set_bit(_VPF_down, &v->pause_flags); + v->is_initialised = 0; + clear_bit(_VPF_blocked, &v->pause_flags); + + vlapic_reset(vcpu_vlapic(v)); + + domain_unlock(d); + + vcpu_unpause(v); +} + +static int vlapic_accept_init(struct vcpu *v) +{ + /* Nothing to do if the VCPU is already reset. */ + if ( !v->is_initialised ) + return X86EMUL_OKAY; + + /* Asynchronously take the VCPU down and schedule reset work. 
*/ + hvm_vcpu_down(v); + tasklet_schedule(&vcpu_vlapic(v)->init_tasklet); + return X86EMUL_RETRY; +} + +static int vlapic_accept_sipi(struct vcpu *v, int trampoline_vector) +{ + struct domain *d = current->domain; + struct vcpu_guest_context *ctxt; + struct segment_register reg; + + /* If the VCPU is not on its way down we have nothing to do. */ + if ( !test_bit(_VPF_down, &v->pause_flags) ) + return X86EMUL_OKAY; + + if ( !vlapic_vcpu_pause_async(v) ) + return X86EMUL_RETRY; + + domain_lock(d); + + if ( v->is_initialised ) + goto out; + + ctxt = &v->arch.guest_context; + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->flags = VGCF_online; + ctxt->user_regs.eflags = 2; + + v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_ET; + hvm_update_guest_cr(v, 0); + + v->arch.hvm_vcpu.guest_cr[2] = 0; + hvm_update_guest_cr(v, 2); + + v->arch.hvm_vcpu.guest_cr[3] = 0; + hvm_update_guest_cr(v, 3); + + v->arch.hvm_vcpu.guest_cr[4] = 0; + hvm_update_guest_cr(v, 4); + + v->arch.hvm_vcpu.guest_efer = 0; + hvm_update_guest_efer(v); + + reg.sel = trampoline_vector << 8; + reg.base = (uint32_t)reg.sel << 4; + reg.limit = 0xffff; + reg.attr.bytes = 0x89b; + hvm_set_segment_register(v, x86_seg_cs, &reg); + + reg.sel = reg.base = 0; + reg.limit = 0xffff; + reg.attr.bytes = 0x893; + hvm_set_segment_register(v, x86_seg_ds, &reg); + hvm_set_segment_register(v, x86_seg_es, &reg); + hvm_set_segment_register(v, x86_seg_fs, &reg); + hvm_set_segment_register(v, x86_seg_gs, &reg); + hvm_set_segment_register(v, x86_seg_ss, &reg); + + reg.attr.bytes = 0x82; /* LDT */ + hvm_set_segment_register(v, x86_seg_ldtr, &reg); + + reg.attr.bytes = 0x8b; /* 32-bit TSS (busy) */ + hvm_set_segment_register(v, x86_seg_tr, &reg); + + reg.attr.bytes = 0; + hvm_set_segment_register(v, x86_seg_gdtr, &reg); + hvm_set_segment_register(v, x86_seg_idtr, &reg); + + /* Sync AP's TSC with BSP's. */ + v->arch.hvm_vcpu.cache_tsc_offset = + v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset; + hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset); + + v->arch.flags |= TF_kernel_mode; + v->is_initialised = 1; + clear_bit(_VPF_down, &v->pause_flags); + + out: + domain_unlock(d); + vcpu_unpause(v); + return X86EMUL_OKAY; +} + /* Add a pending IRQ into lapic. */ static int vlapic_accept_irq(struct vcpu *v, int delivery_mode, int vector, int level, int trig_mode) { - int result = 0; struct vlapic *vlapic = vcpu_vlapic(v); + int rc = X86EMUL_OKAY; switch ( delivery_mode ) { @@ -270,8 +404,6 @@ static int vlapic_accept_irq(struct vcpu *v, int delivery_mode, } vcpu_kick(v); - - result = 1; break; case APIC_DM_REMRD: @@ -291,43 +423,20 @@ static int vlapic_accept_irq(struct vcpu *v, int delivery_mode, /* No work on INIT de-assert for P4-type APIC. */ if ( trig_mode && !(level & APIC_INT_ASSERT) ) break; - /* FIXME How to check the situation after vcpu reset? 
*/ - if ( v->is_initialised ) - hvm_vcpu_reset(v); - v->arch.hvm_vcpu.init_sipi_sipi_state = - HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI; - result = 1; + rc = vlapic_accept_init(v); break; case APIC_DM_STARTUP: - if ( v->arch.hvm_vcpu.init_sipi_sipi_state == - HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM ) - break; - - v->arch.hvm_vcpu.init_sipi_sipi_state = - HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM; - - if ( v->is_initialised ) - { - gdprintk(XENLOG_ERR, "SIPI for initialized vcpu %x\n", v->vcpu_id); - goto exit_and_crash; - } - - if ( hvm_bringup_ap(v->vcpu_id, vector) != 0 ) - result = 0; + rc = vlapic_accept_sipi(v, vector); break; default: gdprintk(XENLOG_ERR, "TODO: unsupported delivery mode %x\n", delivery_mode); - goto exit_and_crash; + domain_crash(v->domain); } - return result; - - exit_and_crash: - domain_crash(v->domain); - return 0; + return rc; } /* This function is used by both ioapic and lapic.The bitmap is for vcpu_id. */ @@ -369,11 +478,9 @@ void vlapic_EOI_set(struct vlapic *vlapic) vioapic_update_EOI(vlapic_domain(vlapic), vector); } -static void vlapic_ipi(struct vlapic *vlapic) +static int vlapic_ipi( + struct vlapic *vlapic, uint32_t icr_low, uint32_t icr_high) { - uint32_t icr_low = vlapic_get_reg(vlapic, APIC_ICR); - uint32_t icr_high = vlapic_get_reg(vlapic, APIC_ICR2); - unsigned int dest = GET_APIC_DEST_FIELD(icr_high); unsigned int short_hand = icr_low & APIC_SHORT_MASK; unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG; @@ -385,6 +492,7 @@ static void vlapic_ipi(struct vlapic *vlapic) struct vlapic *target; struct vcpu *v; uint32_t lpr_map = 0; + int rc = X86EMUL_OKAY; HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "icr_high 0x%x, icr_low 0x%x, " "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " @@ -399,18 +507,23 @@ static void vlapic_ipi(struct vlapic *vlapic) if ( delivery_mode == APIC_DM_LOWEST ) __set_bit(v->vcpu_id, &lpr_map); else - vlapic_accept_irq(v, delivery_mode, - vector, level, trig_mode); + rc = vlapic_accept_irq(v, delivery_mode, + vector, level, trig_mode); } + + if ( rc != X86EMUL_OKAY ) + break; } if ( delivery_mode == APIC_DM_LOWEST ) { target = apic_round_robin(vlapic_domain(v), vector, lpr_map); if ( target != NULL ) - vlapic_accept_irq(vlapic_vcpu(target), delivery_mode, - vector, level, trig_mode); + rc = vlapic_accept_irq(vlapic_vcpu(target), delivery_mode, + vector, level, trig_mode); } + + return rc; } static uint32_t vlapic_get_tmcct(struct vlapic *vlapic) @@ -465,17 +578,18 @@ static void vlapic_read_aligned( } } -static unsigned long vlapic_read(struct vcpu *v, unsigned long address, - unsigned long len) +static int vlapic_read( + struct vcpu *v, unsigned long address, + unsigned long len, unsigned long *pval) { unsigned int alignment; unsigned int tmp; - unsigned long result; + unsigned long result = 0; struct vlapic *vlapic = vcpu_vlapic(v); unsigned int offset = address - vlapic_base_address(vlapic); if ( offset > (APIC_TDCR + 0x3) ) - return 0; + goto out; alignment = offset & 0x3; @@ -507,14 +621,16 @@ static unsigned long vlapic_read(struct vcpu *v, unsigned long address, HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "offset 0x%x with length 0x%lx, " "and the result is 0x%lx", offset, len, result); - return result; + out: + *pval = result; + return X86EMUL_OKAY; unaligned_exit_and_crash: gdprintk(XENLOG_ERR, "Unaligned LAPIC read len=0x%lx at offset=0x%x.\n", len, offset); exit_and_crash: domain_crash(v->domain); - return 0; + return X86EMUL_OKAY; } void vlapic_pt_cb(struct vcpu *v, void *data) @@ -522,11 +638,12 @@ void vlapic_pt_cb(struct vcpu *v, void *data) 
*(s_time_t *)data = hvm_get_guest_time(v); } -static void vlapic_write(struct vcpu *v, unsigned long address, - unsigned long len, unsigned long val) +static int vlapic_write(struct vcpu *v, unsigned long address, + unsigned long len, unsigned long val) { struct vlapic *vlapic = vcpu_vlapic(v); unsigned int offset = address - vlapic_base_address(vlapic); + int rc = X86EMUL_OKAY; if ( offset != 0xb0 ) HVM_DBG_LOG(DBG_LEVEL_VLAPIC, @@ -540,13 +657,13 @@ static void vlapic_write(struct vcpu *v, unsigned long address, val = (uint32_t)val; if ( len != 4 ) { - unsigned int tmp; + unsigned long tmp; unsigned char alignment; gdprintk(XENLOG_INFO, "Notice: Local APIC write with len = %lx\n",len); alignment = offset & 0x3; - tmp = vlapic_read(v, offset & ~0x3, 4); + (void)vlapic_read(v, offset & ~0x3, 4, &tmp); switch ( len ) { @@ -617,9 +734,10 @@ static void vlapic_write(struct vcpu *v, unsigned long address, break; case APIC_ICR: - /* No delay here, so we always clear the pending bit*/ - vlapic_set_reg(vlapic, APIC_ICR, val & ~(1 << 12)); - vlapic_ipi(vlapic); + val &= ~(1 << 12); /* always clear the pending bit */ + rc = vlapic_ipi(vlapic, val, vlapic_get_reg(vlapic, APIC_ICR2)); + if ( rc == X86EMUL_OKAY ) + vlapic_set_reg(vlapic, APIC_ICR, val); break; case APIC_ICR2: @@ -669,13 +787,14 @@ static void vlapic_write(struct vcpu *v, unsigned long address, break; } - return; + return rc; unaligned_exit_and_crash: gdprintk(XENLOG_ERR, "Unaligned LAPIC write len=0x%lx at offset=0x%x.\n", len, offset); exit_and_crash: domain_crash(v->domain); + return rc; } static int vlapic_range(struct vcpu *v, unsigned long addr) @@ -788,77 +907,58 @@ void vlapic_reset(struct vlapic *vlapic) vlapic_set_reg(vlapic, APIC_SPIV, 0xff); vlapic->hw.disabled |= VLAPIC_SW_DISABLED; -} -#ifdef HVM_DEBUG_SUSPEND -static void lapic_info(struct vlapic *s) -{ - printk("*****lapic state:*****\n"); - printk("lapic 0x%"PRIx64".\n", s->hw.apic_base_msr); - printk("lapic 0x%x.\n", s->hw.disabled); - printk("lapic 0x%x.\n", s->hw.timer_divisor); -} -#else -static void lapic_info(struct vlapic *s) -{ + destroy_periodic_time(&vlapic->pt); } -#endif /* rearm the actimer if needed, after a HVM restore */ static void lapic_rearm(struct vlapic *s) { - unsigned long tmict; + unsigned long tmict = vlapic_get_reg(s, APIC_TMICT); + uint64_t period; - tmict = vlapic_get_reg(s, APIC_TMICT); - if ( tmict > 0 ) - { - uint64_t period = (uint64_t)APIC_BUS_CYCLE_NS * - (uint32_t)tmict * s->hw.timer_divisor; - uint32_t lvtt = vlapic_get_reg(s, APIC_LVTT); - - s->pt.irq = lvtt & APIC_VECTOR_MASK; - create_periodic_time(vlapic_vcpu(s), &s->pt, period, s->pt.irq, - !vlapic_lvtt_period(s), vlapic_pt_cb, - &s->timer_last_update); - s->timer_last_update = s->pt.last_plt_gtime; - - printk("lapic_load to rearm the actimer:" - "bus cycle is %uns, " - "saved tmict count %lu, period %"PRIu64"ns, irq=%"PRIu8"\n", - APIC_BUS_CYCLE_NS, tmict, period, s->pt.irq); - } + if ( (tmict = vlapic_get_reg(s, APIC_TMICT)) == 0 ) + return; - lapic_info(s); + period = ((uint64_t)APIC_BUS_CYCLE_NS * + (uint32_t)tmict * s->hw.timer_divisor); + s->pt.irq = vlapic_get_reg(s, APIC_LVTT) & APIC_VECTOR_MASK; + create_periodic_time(vlapic_vcpu(s), &s->pt, period, s->pt.irq, + !vlapic_lvtt_period(s), vlapic_pt_cb, + &s->timer_last_update); + s->timer_last_update = s->pt.last_plt_gtime; } static int lapic_save_hidden(struct domain *d, hvm_domain_context_t *h) { struct vcpu *v; struct vlapic *s; + int rc = 0; - for_each_vcpu(d, v) + for_each_vcpu ( d, v ) { s = vcpu_vlapic(v); - 
lapic_info(s); - - if ( hvm_save_entry(LAPIC, v->vcpu_id, h, &s->hw) != 0 ) - return 1; + if ( (rc = hvm_save_entry(LAPIC, v->vcpu_id, h, &s->hw)) != 0 ) + break; } - return 0; + + return rc; } static int lapic_save_regs(struct domain *d, hvm_domain_context_t *h) { struct vcpu *v; struct vlapic *s; + int rc = 0; - for_each_vcpu(d, v) + for_each_vcpu ( d, v ) { s = vcpu_vlapic(v); - if ( hvm_save_entry(LAPIC_REGS, v->vcpu_id, h, s->regs) != 0 ) - return 1; + if ( (rc = hvm_save_entry(LAPIC_REGS, v->vcpu_id, h, s->regs)) != 0 ) + break; } - return 0; + + return rc; } static int lapic_load_hidden(struct domain *d, hvm_domain_context_t *h) @@ -879,8 +979,6 @@ static int lapic_load_hidden(struct domain *d, hvm_domain_context_t *h) if ( hvm_load_entry(LAPIC, h, &s->hw) != 0 ) return -EINVAL; - lapic_info(s); - vmx_vlapic_msr_changed(v); return 0; @@ -916,7 +1014,7 @@ HVM_REGISTER_SAVE_RESTORE(LAPIC_REGS, lapic_save_regs, lapic_load_regs, int vlapic_init(struct vcpu *v) { struct vlapic *vlapic = vcpu_vlapic(v); - unsigned int memflags = 0; + unsigned int memflags = MEMF_node(vcpu_to_node(v)); HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "%d", v->vcpu_id); @@ -925,10 +1023,10 @@ int vlapic_init(struct vcpu *v) #ifdef __i386__ /* 32-bit VMX may be limited to 32-bit physical addresses. */ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) - memflags = MEMF_bits(32); + memflags |= MEMF_bits(32); #endif - vlapic->regs_page = alloc_domheap_pages(NULL, 0, memflags); + vlapic->regs_page = alloc_domheap_page(NULL, memflags); if ( vlapic->regs_page == NULL ) { dprintk(XENLOG_ERR, "alloc vlapic regs error: %d/%d\n", @@ -941,7 +1039,7 @@ int vlapic_init(struct vcpu *v) { dprintk(XENLOG_ERR, "map vlapic regs error: %d/%d\n", v->domain->domain_id, v->vcpu_id); - return -ENOMEM; + return -ENOMEM; } clear_page(vlapic->regs); @@ -953,6 +1051,8 @@ int vlapic_init(struct vcpu *v) if ( v->vcpu_id == 0 ) vlapic->hw.apic_base_msr |= MSR_IA32_APICBASE_BSP; + tasklet_init(&vlapic->init_tasklet, vlapic_init_action, (unsigned long)v); + return 0; } @@ -960,6 +1060,7 @@ void vlapic_destroy(struct vcpu *v) { struct vlapic *vlapic = vcpu_vlapic(v); + tasklet_kill(&vlapic->init_tasklet); destroy_periodic_time(&vlapic->pt); unmap_domain_page_global(vlapic->regs); free_domheap_page(vlapic->regs_page); diff --git a/xen/arch/x86/hvm/vmx/realmode.c b/xen/arch/x86/hvm/vmx/realmode.c index c00e8b1e42..5d13f4e60b 100644 --- a/xen/arch/x86/hvm/vmx/realmode.c +++ b/xen/arch/x86/hvm/vmx/realmode.c @@ -172,7 +172,7 @@ static void realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt) hvmemul_ctxt->insn_buf[0], hvmemul_ctxt->insn_buf[1], hvmemul_ctxt->insn_buf[2], hvmemul_ctxt->insn_buf[3], hvmemul_ctxt->insn_buf[4], hvmemul_ctxt->insn_buf[5]); - domain_crash_synchronous(); + domain_crash(curr->domain); } void vmx_realmode(struct cpu_user_regs *regs) diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c index bee9eb1deb..48506c5b32 100644 --- a/xen/arch/x86/hvm/vmx/vmcs.c +++ b/xen/arch/x86/hvm/vmx/vmcs.c @@ -38,6 +38,9 @@ #include <asm/shadow.h> #include <asm/tboot.h> +static int opt_vpid_enabled = 1; +boolean_param("vpid", opt_vpid_enabled); + /* Dynamic (run-time adjusted) execution control flags. 
*/ u32 vmx_pin_based_exec_control __read_mostly; u32 vmx_cpu_based_exec_control __read_mostly; @@ -84,14 +87,16 @@ static void vmx_init_vmcs_config(void) min = (CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | CPU_BASED_MONITOR_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_ACTIVATE_IO_BITMAP | CPU_BASED_USE_TSC_OFFSETING); - opt = CPU_BASED_ACTIVATE_MSR_BITMAP; - opt |= CPU_BASED_TPR_SHADOW; - opt |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + opt = (CPU_BASED_ACTIVATE_MSR_BITMAP | + CPU_BASED_TPR_SHADOW | + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); _vmx_cpu_based_exec_control = adjust_vmx_controls( min, opt, MSR_IA32_VMX_PROCBASED_CTLS); #ifdef __x86_64__ @@ -107,11 +112,25 @@ static void vmx_init_vmcs_config(void) { min = 0; opt = (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_WBINVD_EXITING); + SECONDARY_EXEC_WBINVD_EXITING | + SECONDARY_EXEC_ENABLE_EPT); + if ( opt_vpid_enabled ) + opt |= SECONDARY_EXEC_ENABLE_VPID; _vmx_secondary_exec_control = adjust_vmx_controls( min, opt, MSR_IA32_VMX_PROCBASED_CTLS2); } + if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT ) + { + /* To use EPT we expect to be able to clear certain intercepts. */ + uint32_t must_be_one, must_be_zero; + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, must_be_one, must_be_zero); + if ( must_be_one & (CPU_BASED_INVLPG_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING) ) + _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + } + #if defined(__i386__) /* If we can't virtualise APIC accesses, the TPR shadow is pointless. */ if ( !(_vmx_secondary_exec_control & @@ -301,6 +320,10 @@ int vmx_cpu_up(void) return 0; } + ept_sync_all(); + + vpid_sync_all(); + return 1; } @@ -439,6 +462,7 @@ void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr) static int construct_vmcs(struct vcpu *v) { + struct domain *d = v->domain; uint16_t sysenter_cs; unsigned long sysenter_eip; @@ -448,10 +472,25 @@ static int construct_vmcs(struct vcpu *v) __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control); __vmwrite(VM_EXIT_CONTROLS, vmx_vmexit_control); __vmwrite(VM_ENTRY_CONTROLS, vmx_vmentry_control); - __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmx_cpu_based_exec_control); + v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control; - if ( vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS ) - __vmwrite(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control); + v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control; + + if ( paging_mode_hap(d) ) + { + v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); + } + else + { + v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + } + + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control); + if ( cpu_has_vmx_secondary_exec_control ) + __vmwrite(SECONDARY_VM_EXEC_CONTROL, + v->arch.hvm_vmx.secondary_exec_control); /* MSR access bitmap. */ if ( cpu_has_vmx_msr_bitmap ) @@ -570,9 +609,10 @@ static int construct_vmcs(struct vcpu *v) __vmwrite(VMCS_LINK_POINTER_HIGH, ~0UL); #endif - __vmwrite(EXCEPTION_BITMAP, (HVM_TRAP_MASK | - (1U << TRAP_page_fault) | - (1U << TRAP_no_device))); + __vmwrite(EXCEPTION_BITMAP, + HVM_TRAP_MASK + | (paging_mode_hap(d) ? 
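adjust_vmx_controls(), called above but not part of this hunk, follows the Intel SDM layout of the VMX capability MSRs: the low 32 bits report the bits that must be 1, the high 32 bits the bits that may be 1. A minimal sketch under that assumption (u32, rdmsr and BUG_ON as in Xen's headers; not this function's exact body):

static u32 adjust_controls_sketch(u32 min, u32 opt, u32 msr)
{
    u32 lo, hi, ctl = min | opt;

    rdmsr(msr, lo, hi);
    ctl &= hi;          /* bit clear in high word => must be zero */
    ctl |= lo;          /* bit set in low word    => must be one  */

    BUG_ON(min & ~ctl); /* a required bit the CPU cannot provide is fatal */
    return ctl;
}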
0 : (1U << TRAP_page_fault)) + | (1U << TRAP_no_device)); v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET; hvm_update_guest_cr(v, 0); @@ -587,6 +627,22 @@ static int construct_vmcs(struct vcpu *v) __vmwrite(TPR_THRESHOLD, 0); } + if ( paging_mode_hap(d) ) + { + __vmwrite(EPT_POINTER, d->arch.hvm_domain.vmx.ept_control.eptp); +#ifdef CONFIG_X86_PAE + __vmwrite(EPT_POINTER_HIGH, + d->arch.hvm_domain.vmx.ept_control.eptp >> 32); +#endif + } + + if ( cpu_has_vmx_vpid ) + { + v->arch.hvm_vmx.vpid = + v->domain->arch.hvm_domain.vmx.vpid_base + v->vcpu_id; + __vmwrite(VIRTUAL_PROCESSOR_ID, v->arch.hvm_vmx.vpid); + } + vmx_vmcs_exit(v); paging_update_paging_modes(v); /* will update HOST & GUEST_CR3 as reqd */ @@ -729,14 +785,14 @@ void vmx_destroy_vmcs(struct vcpu *v) arch_vmx->vmcs = NULL; } -void vm_launch_fail(unsigned long eflags) +void vm_launch_fail(void) { unsigned long error = __vmread(VM_INSTRUCTION_ERROR); printk("<vm_launch_fail> error code %lx\n", error); domain_crash_synchronous(); } -void vm_resume_fail(unsigned long eflags) +void vm_resume_fail(void) { unsigned long error = __vmread(VM_INSTRUCTION_ERROR); printk("<vm_resume_fail> error code %lx\n", error); @@ -780,6 +836,7 @@ void vmx_do_resume(struct vcpu *v) vmx_load_vmcs(v); hvm_migrate_timers(v); vmx_set_host_env(v); + vpid_sync_vcpu_all(v); } debug_state = v->domain->debugger_attached; @@ -932,6 +989,10 @@ void vmcs_dump_vcpu(struct vcpu *v) (uint32_t)vmr(IDT_VECTORING_ERROR_CODE)); printk("TPR Threshold = 0x%02x\n", (uint32_t)vmr(TPR_THRESHOLD)); + printk("EPT pointer = 0x%08x%08x\n", + (uint32_t)vmr(EPT_POINTER_HIGH), (uint32_t)vmr(EPT_POINTER)); + printk("Virtual processor ID = 0x%04x\n", + (uint32_t)vmr(VIRTUAL_PROCESSOR_ID)); vmx_vmcs_exit(v); } diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index 29dcb68503..628cbddfcf 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -57,6 +57,8 @@ static void vmx_ctxt_switch_to(struct vcpu *v); static int vmx_alloc_vlapic_mapping(struct domain *d); static void vmx_free_vlapic_mapping(struct domain *d); +static int vmx_alloc_vpid(struct domain *d); +static void vmx_free_vpid(struct domain *d); static void vmx_install_vlapic_mapping(struct vcpu *v); static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr); static void vmx_update_guest_efer(struct vcpu *v); @@ -71,12 +73,30 @@ static void vmx_invlpg_intercept(unsigned long vaddr); static int vmx_domain_initialise(struct domain *d) { - return vmx_alloc_vlapic_mapping(d); + int rc; + + d->arch.hvm_domain.vmx.ept_control.etmt = EPT_DEFAULT_MT; + d->arch.hvm_domain.vmx.ept_control.gaw = EPT_DEFAULT_GAW; + d->arch.hvm_domain.vmx.ept_control.asr = + pagetable_get_pfn(d->arch.phys_table); + + if ( (rc = vmx_alloc_vpid(d)) != 0 ) + return rc; + + if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 ) + { + vmx_free_vpid(d); + return rc; + } + + return 0; } static void vmx_domain_destroy(struct domain *d) { + ept_sync_domain(d); vmx_free_vlapic_mapping(d); + vmx_free_vpid(d); } static int vmx_vcpu_initialise(struct vcpu *v) @@ -492,20 +512,23 @@ static int vmx_restore_cr0_cr3( unsigned long mfn = 0; p2m_type_t p2mt; - if ( cr0 & X86_CR0_PG ) + if ( paging_mode_shadow(v->domain) ) { - mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt)); - if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) ) + if ( cr0 & X86_CR0_PG ) { - gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3); - return -EINVAL; + mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt)); + if ( 
!p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) ) + { + gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3); + return -EINVAL; + } } - } - if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG ) - put_page(pagetable_get_page(v->arch.guest_table)); + if ( hvm_paging_enabled(v) ) + put_page(pagetable_get_page(v->arch.guest_table)); - v->arch.guest_table = pagetable_from_pfn(mfn); + v->arch.guest_table = pagetable_from_pfn(mfn); + } v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET; v->arch.hvm_vcpu.guest_cr[3] = cr3; @@ -538,11 +561,6 @@ static int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c) vmx_update_guest_cr(v, 2); vmx_update_guest_cr(v, 4); -#ifdef HVM_DEBUG_SUSPEND - printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n", - __func__, c->cr3, c->cr0, c->cr4); -#endif - v->arch.hvm_vcpu.guest_efer = c->msr_efer; vmx_update_guest_efer(v); @@ -573,20 +591,6 @@ static int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c) return 0; } -#if defined(__x86_64__) && defined(HVM_DEBUG_SUSPEND) -static void dump_msr_state(struct vmx_msr_state *m) -{ - int i = 0; - printk("**** msr state ****\n"); - printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags); - for ( i = 0; i < VMX_MSR_COUNT; i++ ) - printk("0x%lx,", m->msrs[i]); - printk("\n"); -} -#else -#define dump_msr_state(m) ((void)0) -#endif - static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data) { #ifdef __x86_64__ @@ -604,8 +608,6 @@ static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data) #endif data->tsc = hvm_get_guest_time(v); - - dump_msr_state(guest_state); } static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data) @@ -624,8 +626,6 @@ static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data) #endif hvm_set_guest_time(v, data->tsc); - - dump_msr_state(guest_state); } @@ -900,6 +900,56 @@ static void vmx_set_interrupt_shadow(struct vcpu *v, unsigned int intr_shadow) __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow); } +static void vmx_load_pdptrs(struct vcpu *v) +{ + unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3], mfn; + uint64_t *guest_pdptrs; + p2m_type_t p2mt; + char *p; + + /* EPT needs to load PDPTRS into VMCS for PAE. */ + if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) ) + return; + + if ( cr3 & 0x1fUL ) + goto crash; + + mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt)); + if ( !p2m_is_ram(p2mt) ) + goto crash; + + p = map_domain_page(mfn); + + guest_pdptrs = (uint64_t *)(p + (cr3 & ~PAGE_MASK)); + + /* + * We do not check the PDPTRs for validity. The CPU will do this during + * vm entry, and we can handle the failure there and crash the guest. + * The only thing we could do better here is #GP instead. 
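The checks above rely on the PAE rule that CR3 bits 4:0 are reserved (hence the cr3 & 0x1f test) and that CR3 points at a 32-byte table of four 64-bit PDPTEs within one page. The extraction in isolation (a sketch; 'page' stands for the mapped guest frame, PAGE_MASK as in Xen):

static int read_pdptes_sketch(const char *page, unsigned long cr3,
                              uint64_t pdpte[4])
{
    const uint64_t *table;
    unsigned int i;

    if ( cr3 & 0x1f )   /* CR3[4:0] must be zero under PAE */
        return -1;

    table = (const uint64_t *)(page + (cr3 & ~PAGE_MASK));
    for ( i = 0; i < 4; i++ )
        pdpte[i] = table[i];

    return 0;
}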
+ */ + + vmx_vmcs_enter(v); + + __vmwrite(GUEST_PDPTR0, guest_pdptrs[0]); + __vmwrite(GUEST_PDPTR1, guest_pdptrs[1]); + __vmwrite(GUEST_PDPTR2, guest_pdptrs[2]); + __vmwrite(GUEST_PDPTR3, guest_pdptrs[3]); +#ifdef CONFIG_X86_PAE + __vmwrite(GUEST_PDPTR0_HIGH, guest_pdptrs[0] >> 32); + __vmwrite(GUEST_PDPTR1_HIGH, guest_pdptrs[1] >> 32); + __vmwrite(GUEST_PDPTR2_HIGH, guest_pdptrs[2] >> 32); + __vmwrite(GUEST_PDPTR3_HIGH, guest_pdptrs[3] >> 32); +#endif + + vmx_vmcs_exit(v); + + unmap_domain_page(p); + return; + + crash: + domain_crash(v->domain); +} + static void vmx_update_host_cr3(struct vcpu *v) { vmx_vmcs_enter(v); @@ -915,7 +965,24 @@ static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr) { case 0: { unsigned long hw_cr0_mask = - X86_CR0_NE | X86_CR0_PG | X86_CR0_WP | X86_CR0_PE; + X86_CR0_NE | X86_CR0_PG | X86_CR0_PE; + + if ( paging_mode_shadow(v->domain) ) + hw_cr0_mask |= X86_CR0_WP; + + if ( paging_mode_hap(v->domain) ) + { + /* We manage GUEST_CR3 when guest CR0.PE is zero. */ + uint32_t cr3_ctls = (CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); + v->arch.hvm_vmx.exec_control &= ~cr3_ctls; + if ( !hvm_paging_enabled(v) ) + v->arch.hvm_vmx.exec_control |= cr3_ctls; + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control); + + /* Changing CR0.PE can change some bits in real CR4. */ + vmx_update_guest_cr(v, 4); + } if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) ) { @@ -939,11 +1006,27 @@ static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr) /* CR2 is updated in exit stub. */ break; case 3: + if ( paging_mode_hap(v->domain) ) + { + if ( !hvm_paging_enabled(v) ) + v->arch.hvm_vcpu.hw_cr[3] = + v->domain->arch.hvm_domain.params[HVM_PARAM_IDENT_PT]; + vmx_load_pdptrs(v); + } + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]); + vpid_sync_vcpu_all(v); break; case 4: - v->arch.hvm_vcpu.hw_cr[4] = - v->arch.hvm_vcpu.guest_cr[4] | HVM_CR4_HOST_MASK; + v->arch.hvm_vcpu.hw_cr[4] = HVM_CR4_HOST_MASK; + if ( paging_mode_hap(v->domain) ) + v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE; + v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4]; + if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) ) + { + v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE; + v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE; + } __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]); __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]); break; @@ -978,12 +1061,29 @@ static void vmx_update_guest_efer(struct vcpu *v) static void vmx_flush_guest_tlbs(void) { - /* No tagged TLB support on VMX yet. The fact that we're in Xen - * at all means any guest will have a clean TLB when it's next run, - * because VMRESUME will flush it for us. */ + /* + * If VPID (i.e. tagged TLB support) is not enabled, the fact that + * we're in Xen at all means any guest will have a clean TLB when + * it's next run, because VMRESUME will flush it for us. + * + * If enabled, we invalidate all translations associated with all + * VPID values. + */ + vpid_sync_all(); } +static void __ept_sync_domain(void *info) +{ + struct domain *d = info; + __invept(1, d->arch.hvm_domain.vmx.ept_control.eptp, 0); +} +void ept_sync_domain(struct domain *d) +{ + /* Only if using EPT and this domain has some VCPUs to dirty. 
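__invept(), used by __ept_sync_domain() above, wraps the INVEPT instruction: the invalidation type goes in a register (1 = single-context, i.e. one EPTP; 2 = all-context) and a 16-byte descriptor of { eptp, reserved } goes in memory. A sketch assuming an assembler that knows the mnemonic (this tree itself emits raw opcode bytes for the sake of older binutils):

static inline void invept_sketch(unsigned long type, uint64_t eptp)
{
    struct {
        uint64_t eptp, reserved;
    } desc = { eptp, 0 };

    /* AT&T operand order: descriptor in memory, type in a register. */
    asm volatile ( "invept %0, %1"
                   : : "m" (desc), "r" (type) : "memory" );
}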
*/ + if ( d->arch.hvm_domain.hap_enabled && d->vcpu[0] ) + on_each_cpu(__ept_sync_domain, d, 1, 1); +} static void __vmx_inject_exception( struct vcpu *v, int trap, int type, int error_code) @@ -1100,6 +1200,9 @@ static struct hvm_function_table vmx_function_table = { .invlpg_intercept = vmx_invlpg_intercept }; +static unsigned long *vpid_bitmap; +#define VPID_BITMAP_SIZE ((1u << VMCS_VPID_WIDTH) / MAX_VIRT_CPUS) + void start_vmx(void) { static int bootstrapped; @@ -1133,6 +1236,25 @@ void start_vmx(void) return; } + if ( cpu_has_vmx_ept ) + { + printk("VMX: EPT is available.\n"); + vmx_function_table.hap_supported = 1; + } + + if ( cpu_has_vmx_vpid ) + { + printk("VMX: VPID is available.\n"); + + vpid_bitmap = xmalloc_array( + unsigned long, BITS_TO_LONGS(VPID_BITMAP_SIZE)); + BUG_ON(vpid_bitmap == NULL); + memset(vpid_bitmap, 0, BITS_TO_LONGS(VPID_BITMAP_SIZE) * sizeof(long)); + + /* VPID 0 is used by VMX root mode (the hypervisor). */ + __set_bit(0, vpid_bitmap); + } + setup_vmcs_dump(); hvm_enable(&vmx_function_table); @@ -1635,18 +1757,47 @@ static int vmx_alloc_vlapic_mapping(struct domain *d) share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable); set_mmio_p2m_entry( d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(virt_to_mfn(apic_va))); - d->arch.hvm_domain.vmx_apic_access_mfn = virt_to_mfn(apic_va); + d->arch.hvm_domain.vmx.apic_access_mfn = virt_to_mfn(apic_va); return 0; } static void vmx_free_vlapic_mapping(struct domain *d) { - unsigned long mfn = d->arch.hvm_domain.vmx_apic_access_mfn; + unsigned long mfn = d->arch.hvm_domain.vmx.apic_access_mfn; if ( mfn != 0 ) free_xenheap_page(mfn_to_virt(mfn)); } +static int vmx_alloc_vpid(struct domain *d) +{ + int idx; + + if ( !cpu_has_vmx_vpid ) + return 0; + + do { + idx = find_first_zero_bit(vpid_bitmap, VPID_BITMAP_SIZE); + if ( idx >= VPID_BITMAP_SIZE ) + { + dprintk(XENLOG_WARNING, "VMX VPID space exhausted.\n"); + return -EBUSY; + } + } + while ( test_and_set_bit(idx, vpid_bitmap) ); + + d->arch.hvm_domain.vmx.vpid_base = idx * MAX_VIRT_CPUS; + return 0; +} + +static void vmx_free_vpid(struct domain *d) +{ + if ( !cpu_has_vmx_vpid ) + return; + + clear_bit(d->arch.hvm_domain.vmx.vpid_base / MAX_VIRT_CPUS, vpid_bitmap); +} + static void vmx_install_vlapic_mapping(struct vcpu *v) { paddr_t virt_page_ma, apic_page_ma; @@ -1655,7 +1806,7 @@ static void vmx_install_vlapic_mapping(struct vcpu *v) return; virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page); - apic_page_ma = v->domain->arch.hvm_domain.vmx_apic_access_mfn; + apic_page_ma = v->domain->arch.hvm_domain.vmx.apic_access_mfn; apic_page_ma <<= PAGE_SHIFT; vmx_vmcs_enter(v); @@ -1900,6 +2051,51 @@ static void vmx_wbinvd_intercept(void) wbinvd(); } +static void ept_handle_violation(unsigned long qualification, paddr_t gpa) +{ + unsigned long gla_validity = qualification & EPT_GLA_VALIDITY_MASK; + struct domain *d = current->domain; + unsigned long gfn = gpa >> PAGE_SHIFT; + mfn_t mfn; + p2m_type_t t; + + if ( unlikely(qualification & EPT_GAW_VIOLATION) ) + { + gdprintk(XENLOG_ERR, "EPT violation: guest physical address %"PRIpaddr + " exceeded its width limit.\n", gpa); + goto crash; + } + + if ( unlikely(gla_validity == EPT_GLA_VALIDITY_RSVD) || + unlikely(gla_validity == EPT_GLA_VALIDITY_PDPTR_LOAD) ) + { + gdprintk(XENLOG_ERR, "EPT violation: reserved bit or " + "pdptr load violation.\n"); + goto crash; + } + + mfn = gfn_to_mfn(d, gfn, &t); + if ( p2m_is_ram(t) && paging_mode_log_dirty(d) ) + { + paging_mark_dirty(d, mfn_x(mfn)); + p2m_change_type(d, gfn, 
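vmx_alloc_vpid() above claims a VPID slot without taking a lock: scan for a zero bit, then atomically test-and-set it, and retry if another CPU won the race. The same pattern in isolation (claim_free_slot is a hypothetical name; bitops as in Xen):

static int claim_free_slot(unsigned long *bitmap, unsigned int size)
{
    unsigned int idx;

    do {
        idx = find_first_zero_bit(bitmap, size);
        if ( idx >= size )
            return -1;  /* space exhausted */
    } while ( test_and_set_bit(idx, bitmap) );  /* raced: scan again */

    return idx;
}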
p2m_ram_logdirty, p2m_ram_rw); + flush_tlb_mask(d->domain_dirty_cpumask); + return; + } + + /* This can only happen in log-dirty mode, writing back A/D bits. */ + if ( unlikely(gla_validity == EPT_GLA_VALIDITY_GPT_WALK) ) + goto crash; + + ASSERT(gla_validity == EPT_GLA_VALIDITY_MATCH); + handle_mmio(); + + return; + + crash: + domain_crash(d); +} + static void vmx_failed_vmentry(unsigned int exit_reason, struct cpu_user_regs *regs) { @@ -1939,6 +2135,10 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs) unsigned long exit_qualification, inst_len = 0; struct vcpu *v = current; + if ( paging_mode_hap(v->domain) && hvm_paging_enabled(v) ) + v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] = + __vmread(GUEST_CR3); + exit_reason = __vmread(VM_EXIT_REASON); hvmtrace_vmexit(v, regs->eip, exit_reason); @@ -2171,6 +2371,17 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs) break; } + case EXIT_REASON_EPT_VIOLATION: + { + paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS); +#ifdef CONFIG_X86_PAE + gpa |= (paddr_t)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32; +#endif + exit_qualification = __vmread(EXIT_QUALIFICATION); + ept_handle_violation(exit_qualification, gpa); + break; + } + default: exit_and_crash: gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason); diff --git a/xen/arch/x86/hvm/vmx/x86_32/exits.S b/xen/arch/x86/hvm/vmx/x86_32/exits.S index 11db8cfc21..eff089a112 100644 --- a/xen/arch/x86/hvm/vmx/x86_32/exits.S +++ b/xen/arch/x86/hvm/vmx/x86_32/exits.S @@ -129,7 +129,6 @@ ENTRY(vmx_asm_do_vmentry) /*vmx_resume:*/ HVM_RESTORE_ALL_NOSEGREGS VMRESUME - pushf call vm_resume_fail ud2 @@ -137,7 +136,6 @@ vmx_launch: movb $1,VCPU_vmx_launched(%ebx) HVM_RESTORE_ALL_NOSEGREGS VMLAUNCH - pushf call vm_launch_fail ud2 diff --git a/xen/arch/x86/hvm/vmx/x86_64/exits.S b/xen/arch/x86/hvm/vmx/x86_64/exits.S index 48da4869bd..56fdb8ad54 100644 --- a/xen/arch/x86/hvm/vmx/x86_64/exits.S +++ b/xen/arch/x86/hvm/vmx/x86_64/exits.S @@ -148,7 +148,6 @@ ENTRY(vmx_asm_do_vmentry) /*vmx_resume:*/ HVM_RESTORE_ALL_NOSEGREGS VMRESUME - pushfq call vm_resume_fail ud2 @@ -156,7 +155,6 @@ vmx_launch: movb $1,VCPU_vmx_launched(%rbx) HVM_RESTORE_ALL_NOSEGREGS VMLAUNCH - pushfq call vm_launch_fail ud2 diff --git a/xen/arch/x86/hvm/vpic.c b/xen/arch/x86/hvm/vpic.c index ce3943eaab..a3d6f2d9ca 100644 --- a/xen/arch/x86/hvm/vpic.c +++ b/xen/arch/x86/hvm/vpic.c @@ -319,7 +319,7 @@ static int vpic_intercept_pic_io( if ( bytes != 1 ) { gdprintk(XENLOG_WARNING, "PIC_IO bad access size %d\n", bytes); - return 1; + return X86EMUL_OKAY; } vpic = ¤t->domain->arch.hvm_domain.vpic[port >> 7]; @@ -329,7 +329,7 @@ static int vpic_intercept_pic_io( else *val = (uint8_t)vpic_ioport_read(vpic, port); - return 1; + return X86EMUL_OKAY; } static int vpic_intercept_elcr_io( @@ -338,11 +338,7 @@ static int vpic_intercept_elcr_io( struct hvm_hw_vpic *vpic; uint32_t data; - if ( bytes != 1 ) - { - gdprintk(XENLOG_WARNING, "PIC_IO bad access size %d\n", bytes); - return 1; - } + BUG_ON(bytes != 1); vpic = ¤t->domain->arch.hvm_domain.vpic[port & 1]; @@ -360,34 +356,8 @@ static int vpic_intercept_elcr_io( *val = vpic->elcr & vpic_elcr_mask(vpic); } - return 1; -} - -#ifdef HVM_DEBUG_SUSPEND -static void vpic_info(struct hvm_hw_vpic *s) -{ - printk("*****pic state:*****\n"); - printk("pic 0x%x.\n", s->irr); - printk("pic 0x%x.\n", s->imr); - printk("pic 0x%x.\n", s->isr); - printk("pic 0x%x.\n", s->irq_base); - printk("pic 0x%x.\n", s->init_state); - printk("pic 0x%x.\n", s->priority_add); - printk("pic 
0x%x.\n", s->readsel_isr); - printk("pic 0x%x.\n", s->poll); - printk("pic 0x%x.\n", s->auto_eoi); - printk("pic 0x%x.\n", s->rotate_on_auto_eoi); - printk("pic 0x%x.\n", s->special_fully_nested_mode); - printk("pic 0x%x.\n", s->special_mask_mode); - printk("pic 0x%x.\n", s->elcr); - printk("pic 0x%x.\n", s->int_output); - printk("pic 0x%x.\n", s->is_master); -} -#else -static void vpic_info(struct hvm_hw_vpic *s) -{ + return X86EMUL_OKAY; } -#endif static int vpic_save(struct domain *d, hvm_domain_context_t *h) { @@ -398,7 +368,6 @@ static int vpic_save(struct domain *d, hvm_domain_context_t *h) for ( i = 0; i < 2 ; i++ ) { s = &d->arch.hvm_domain.vpic[i]; - vpic_info(s); if ( hvm_save_entry(PIC, i, h, s) ) return 1; } @@ -421,7 +390,6 @@ static int vpic_load(struct domain *d, hvm_domain_context_t *h) if ( hvm_load_entry(PIC, h, s) != 0 ) return -EINVAL; - vpic_info(s); return 0; } diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c index 9ccbefd22a..b7e50ae8f1 100644 --- a/xen/arch/x86/io_apic.c +++ b/xen/arch/x86/io_apic.c @@ -1244,7 +1244,11 @@ static void __init setup_ioapic_ids_from_mpc(void) { } */ static int __init timer_irq_works(void) { - unsigned long t1 = jiffies; + extern unsigned long pit0_ticks; + unsigned long t1; + + t1 = pit0_ticks; + mb(); local_irq_enable(); /* Let ten ticks pass... */ @@ -1257,7 +1261,8 @@ static int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. */ - if (jiffies - t1 > 4) + mb(); + if (pit0_ticks - t1 > 4) return 1; return 0; diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index a1220af3b3..15f2cf57eb 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -299,7 +299,7 @@ int memory_is_conventional_ram(paddr_t p) unsigned long domain_get_maximum_gpfn(struct domain *d) { if ( is_hvm_domain(d) ) - return d->arch.p2m.max_mapped_pfn; + return d->arch.p2m->max_mapped_pfn; /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */ return arch_get_max_pfn(d) - 1; } @@ -476,7 +476,7 @@ static void invalidate_shadow_ldt(struct vcpu *v) if ( pfn == 0 ) continue; l1e_write(&v->arch.perdomain_ptes[i], l1e_empty()); page = mfn_to_page(pfn); - ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page); + ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page); ASSERT_PAGE_IS_DOMAIN(page, v->domain); put_page_and_type(page); } @@ -530,7 +530,7 @@ int map_ldt_shadow_page(unsigned int off) if ( unlikely(!mfn_valid(mfn)) ) return 0; - okay = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page); + okay = get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page); if ( unlikely(!okay) ) return 0; @@ -924,7 +924,7 @@ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d) { /* We expect this is rare so we blow the entire shadow LDT. 
*/ if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == - PGT_ldt_page)) && + PGT_seg_desc_page)) && unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) && (d == e) ) { @@ -1748,8 +1748,7 @@ static int alloc_page_type(struct page_info *page, unsigned long type) return alloc_l3_table(page); case PGT_l4_page_table: return alloc_l4_table(page); - case PGT_gdt_page: - case PGT_ldt_page: + case PGT_seg_desc_page: return alloc_segdesc_page(page); default: printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n", @@ -2189,7 +2188,7 @@ int do_mmuext_op( goto out; } - LOCK_BIGLOCK(d); + domain_lock(d); for ( i = 0; i < count; i++ ) { @@ -2438,7 +2437,7 @@ int do_mmuext_op( process_deferred_ops(); - UNLOCK_BIGLOCK(d); + domain_unlock(d); perfc_add(num_mmuext_ops, i); @@ -2493,7 +2492,7 @@ int do_mmu_update( domain_mmap_cache_init(&mapcache); - LOCK_BIGLOCK(d); + domain_lock(d); for ( i = 0; i < count; i++ ) { @@ -2665,7 +2664,7 @@ int do_mmu_update( process_deferred_ops(); - UNLOCK_BIGLOCK(d); + domain_unlock(d); domain_mmap_cache_destroy(&mapcache); @@ -2694,7 +2693,7 @@ static int create_grant_pte_mapping( l1_pgentry_t ol1e; struct domain *d = v->domain; - ASSERT(spin_is_locked(&d->big_lock)); + ASSERT(domain_is_locked(d)); adjust_guest_l1e(nl1e, d); @@ -2817,7 +2816,7 @@ static int create_grant_va_mapping( unsigned long gl1mfn; int okay; - ASSERT(spin_is_locked(&d->big_lock)); + ASSERT(domain_is_locked(d)); adjust_guest_l1e(nl1e, d); @@ -3015,7 +3014,7 @@ int do_update_va_mapping(unsigned long va, u64 val64, if ( rc ) return rc; - LOCK_BIGLOCK(d); + domain_lock(d); pl1e = guest_map_l1e(v, va, &gl1mfn); @@ -3028,7 +3027,7 @@ int do_update_va_mapping(unsigned long va, u64 val64, process_deferred_ops(); - UNLOCK_BIGLOCK(d); + domain_unlock(d); switch ( flags & UVMF_FLUSHTYPE_MASK ) { @@ -3134,7 +3133,7 @@ long set_gdt(struct vcpu *v, { mfn = frames[i] = gmfn_to_mfn(d, frames[i]); if ( !mfn_valid(mfn) || - !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) ) + !get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page) ) goto fail; } @@ -3173,12 +3172,12 @@ long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries) if ( copy_from_guest(frames, frame_list, nr_pages) ) return -EFAULT; - LOCK_BIGLOCK(curr->domain); + domain_lock(curr->domain); if ( (ret = set_gdt(curr, frames, entries)) == 0 ) flush_tlb_local(); - UNLOCK_BIGLOCK(curr->domain); + domain_unlock(curr->domain); return ret; } @@ -3211,12 +3210,8 @@ long do_update_descriptor(u64 pa, u64 desc) /* Check if the given frame is in use in an unsafe context. */ switch ( page->u.inuse.type_info & PGT_type_mask ) { - case PGT_gdt_page: - if ( unlikely(!get_page_type(page, PGT_gdt_page)) ) - goto out; - break; - case PGT_ldt_page: - if ( unlikely(!get_page_type(page, PGT_ldt_page)) ) + case PGT_seg_desc_page: + if ( unlikely(!get_page_type(page, PGT_seg_desc_page)) ) goto out; break; default: @@ -3316,7 +3311,7 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg) return -EINVAL; } - LOCK_BIGLOCK(d); + domain_lock(d); /* Remove previously mapped page if it was present. */ prev_mfn = gmfn_to_mfn(d, xatp.gpfn); @@ -3338,7 +3333,7 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg) /* Map at new location. 
*/ guest_physmap_add_page(d, xatp.gpfn, mfn); - UNLOCK_BIGLOCK(d); + domain_unlock(d); rcu_unlock_domain(d); @@ -3674,7 +3669,7 @@ int ptwr_do_page_fault(struct vcpu *v, unsigned long addr, struct ptwr_emulate_ctxt ptwr_ctxt; int rc; - LOCK_BIGLOCK(d); + domain_lock(d); /* Attempt to read the PTE that maps the VA being accessed. */ guest_get_eff_l1e(v, addr, &pte); @@ -3699,12 +3694,12 @@ int ptwr_do_page_fault(struct vcpu *v, unsigned long addr, if ( rc == X86EMUL_UNHANDLEABLE ) goto bail; - UNLOCK_BIGLOCK(d); + domain_unlock(d); perfc_incr(ptwr_emulations); return EXCRET_fault_fixed; bail: - UNLOCK_BIGLOCK(d); + domain_unlock(d); return 0; } diff --git a/xen/arch/x86/mm/hap/Makefile b/xen/arch/x86/mm/hap/Makefile index 160e5f36bf..64cb72786e 100644 --- a/xen/arch/x86/mm/hap/Makefile +++ b/xen/arch/x86/mm/hap/Makefile @@ -2,6 +2,7 @@ obj-y += hap.o obj-y += guest_walk_2level.o obj-y += guest_walk_3level.o obj-y += guest_walk_4level.o +obj-y += p2m-ept.o guest_levels = $(subst level,,$(filter %level,$(subst ., ,$(subst _, ,$(1))))) guest_walk_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c index 15cdc23c96..e30acf6948 100644 --- a/xen/arch/x86/mm/hap/hap.c +++ b/xen/arch/x86/mm/hap/hap.c @@ -38,6 +38,7 @@ #include <asm/hap.h> #include <asm/paging.h> #include <asm/domain.h> +#include <xen/numa.h> #include "private.h" @@ -61,7 +62,7 @@ int hap_enable_log_dirty(struct domain *d) hap_unlock(d); /* set l1e entries of P2M table to be read-only. */ - p2m_change_type_global(d, p2m_ram_rw, p2m_ram_logdirty); + p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_logdirty); flush_tlb_mask(d->domain_dirty_cpumask); return 0; } @@ -73,14 +74,14 @@ int hap_disable_log_dirty(struct domain *d) hap_unlock(d); /* set l1e entries of P2M table with normal mode */ - p2m_change_type_global(d, p2m_ram_logdirty, p2m_ram_rw); + p2m_change_entry_type_global(d, p2m_ram_logdirty, p2m_ram_rw); return 0; } void hap_clean_dirty_bitmap(struct domain *d) { /* set l1e entries of P2M table to be read-only. */ - p2m_change_type_global(d, p2m_ram_rw, p2m_ram_logdirty); + p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_logdirty); flush_tlb_mask(d->domain_dirty_cpumask); } @@ -135,7 +136,8 @@ static struct page_info *hap_alloc_p2m_page(struct domain *d) && mfn_x(page_to_mfn(pg)) >= (1UL << (32 - PAGE_SHIFT)) ) { free_domheap_page(pg); - pg = alloc_domheap_pages(NULL, 0, MEMF_bits(32)); + pg = alloc_domheap_page( + NULL, MEMF_bits(32) | MEMF_node(domain_to_node(d))); if ( likely(pg != NULL) ) { void *p = hap_map_domain_page(page_to_mfn(pg)); @@ -199,7 +201,7 @@ hap_set_allocation(struct domain *d, unsigned int pages, int *preempted) if ( d->arch.paging.hap.total_pages < pages ) { /* Need to allocate more memory from domheap */ - pg = alloc_domheap_page(NULL); + pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d))); if ( pg == NULL ) { HAP_PRINTK("failed to allocate hap pages.\n"); diff --git a/xen/arch/x86/mm/hap/p2m-ept.c b/xen/arch/x86/mm/hap/p2m-ept.c new file mode 100644 index 0000000000..697ca4d697 --- /dev/null +++ b/xen/arch/x86/mm/hap/p2m-ept.c @@ -0,0 +1,257 @@ +/* + * ept-p2m.c: use the EPT page table as p2m + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + */ + +#include <xen/config.h> +#include <xen/domain_page.h> +#include <xen/sched.h> +#include <asm/current.h> +#include <asm/types.h> +#include <asm/domain.h> +#include <asm/p2m.h> +#include <asm/hvm/vmx/vmx.h> +#include <xen/iommu.h> + +static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type) +{ + switch(type) + { + case p2m_invalid: + case p2m_mmio_dm: + default: + return; + case p2m_ram_rw: + case p2m_mmio_direct: + entry->r = entry->w = entry->x = 1; + return; + case p2m_ram_logdirty: + case p2m_ram_ro: + entry->r = entry->x = 1; + entry->w = 0; + return; + } +} + +static int ept_next_level(struct domain *d, bool_t read_only, + ept_entry_t **table, unsigned long *gfn_remainder, + u32 shift) +{ + ept_entry_t *ept_entry, *next; + u32 index; + + index = *gfn_remainder >> shift; + *gfn_remainder &= (1UL << shift) - 1; + + ept_entry = (*table) + index; + + if ( !(ept_entry->epte & 0x7) ) + { + struct page_info *pg; + + if ( read_only ) + return 0; + + pg = d->arch.p2m->alloc_page(d); + if ( pg == NULL ) + return 0; + + pg->count_info = 1; + pg->u.inuse.type_info = 1 | PGT_validated; + list_add_tail(&pg->list, &d->arch.p2m->pages); + + ept_entry->emt = 0; + ept_entry->sp_avail = 0; + ept_entry->avail1 = 0; + ept_entry->mfn = page_to_mfn(pg); + ept_entry->rsvd = 0; + ept_entry->avail2 = 0; + /* last step */ + ept_entry->r = ept_entry->w = ept_entry->x = 1; + } + + next = map_domain_page(ept_entry->mfn); + unmap_domain_page(*table); + *table = next; + + return 1; +} + +static int +ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt) +{ + ept_entry_t *table = + map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); + unsigned long gfn_remainder = gfn; + ept_entry_t *ept_entry = NULL; + u32 index; + int i, rv = 0; + + /* Should check if gfn obeys GAW here */ + + for ( i = EPT_DEFAULT_GAW; i > 0; i-- ) + if ( !ept_next_level(d, 0, &table, &gfn_remainder, + i * EPT_TABLE_ORDER) ) + goto out; + + index = gfn_remainder; + ept_entry = table + index; + + if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) ) + { + /* Track the highest gfn for which we have ever had a valid mapping */ + if ( gfn > d->arch.p2m->max_mapped_pfn ) + d->arch.p2m->max_mapped_pfn = gfn; + + ept_entry->emt = EPT_DEFAULT_MT; + ept_entry->sp_avail = 0; + ept_entry->avail1 = p2mt; + ept_entry->mfn = mfn_x(mfn); + ept_entry->rsvd = 0; + ept_entry->avail2 = 0; + /* last step */ + ept_entry->r = ept_entry->w = ept_entry->x = 1; + ept_p2m_type_to_flags(ept_entry, p2mt); + } + else + ept_entry->epte = 0; + + /* Success */ + rv = 1; + + out: + unmap_domain_page(table); + + ept_sync_domain(d); + + /* If p2m table is shared with vtd page-table. 
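ept_next_level() above peels EPT_TABLE_ORDER (9) bits off the gfn per level, exactly like a long-mode page-table walk. A standalone worked example of the index arithmetic for gfn 0x12345 (illustrative value):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const unsigned int order = 9;       /* EPT_TABLE_ORDER */
    uint64_t remainder = 0x12345;       /* gfn */
    int level;

    for ( level = 3; level >= 0; level-- )  /* EPT_DEFAULT_GAW == 3 */
    {
        unsigned int index = remainder >> (level * order);
        remainder &= (1ULL << (level * order)) - 1;
        printf("level %d: index %u\n", level, index);
    }
    /* Prints indices 0, 0, 145, 325 for the four levels. */
    return 0;
}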
*/ + if ( iommu_enabled && is_hvm_domain(d) && (p2mt == p2m_mmio_direct) ) + iommu_flush(d, gfn, (u64*)ept_entry); + + return rv; +} + +/* Read ept p2m entries */ +static mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t) +{ + ept_entry_t *table = + map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); + unsigned long gfn_remainder = gfn; + ept_entry_t *ept_entry; + u32 index; + int i; + mfn_t mfn = _mfn(INVALID_MFN); + + *t = p2m_mmio_dm; + + /* This pfn is higher than the highest the p2m map currently holds */ + if ( gfn > d->arch.p2m->max_mapped_pfn ) + goto out; + + /* Should check if gfn obeys GAW here. */ + + for ( i = EPT_DEFAULT_GAW; i > 0; i-- ) + if ( !ept_next_level(d, 1, &table, &gfn_remainder, + i * EPT_TABLE_ORDER) ) + goto out; + + index = gfn_remainder; + ept_entry = table + index; + + if ( ept_entry->avail1 != p2m_invalid ) + { + *t = ept_entry->avail1; + mfn = _mfn(ept_entry->mfn); + } + + out: + unmap_domain_page(table); + return mfn; +} + +static mfn_t ept_get_entry_current(unsigned long gfn, p2m_type_t *t) +{ + return ept_get_entry(current->domain, gfn, t); +} + +/* Walk the whole p2m table, changing any entries of the old type + * to the new type. This is used in hardware-assisted paging to + * quickly enable or disable log-dirty tracking */ + +static void ept_change_entry_type_global(struct domain *d, + p2m_type_t ot, p2m_type_t nt) +{ + ept_entry_t *l4e, *l3e, *l2e, *l1e; + int i4, i3, i2, i1; + + if ( pagetable_get_pfn(d->arch.phys_table) == 0 ) + return; + + BUG_ON(EPT_DEFAULT_GAW != 3); + + l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); + for ( i4 = 0; i4 < EPT_PAGETABLE_ENTRIES; i4++ ) + { + if ( !l4e[i4].epte || l4e[i4].sp_avail ) + continue; + l3e = map_domain_page(l4e[i4].mfn); + for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ ) + { + if ( !l3e[i3].epte || l3e[i3].sp_avail ) + continue; + l2e = map_domain_page(l3e[i3].mfn); + for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ ) + { + if ( !l2e[i2].epte || l2e[i2].sp_avail ) + continue; + l1e = map_domain_page(l2e[i2].mfn); + for ( i1 = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ ) + { + if ( !l1e[i1].epte ) + continue; + if ( l1e[i1].avail1 != ot ) + continue; + l1e[i1].avail1 = nt; + ept_p2m_type_to_flags(l1e+i1, nt); + } + unmap_domain_page(l1e); + } + unmap_domain_page(l2e); + } + unmap_domain_page(l3e); + } + unmap_domain_page(l4e); + + ept_sync_domain(d); +} + +void ept_p2m_init(struct domain *d) +{ + d->arch.p2m->set_entry = ept_set_entry; + d->arch.p2m->get_entry = ept_get_entry; + d->arch.p2m->get_entry_current = ept_get_entry_current; + d->arch.p2m->change_entry_type_global = ept_change_entry_type_global; +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c index e8298fb3bd..faee13955e 100644 --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -27,6 +27,7 @@ #include <asm/page.h> #include <asm/paging.h> #include <asm/p2m.h> +#include <asm/hvm/vmx/vmx.h> /* ept_p2m_init() */ #include <xen/iommu.h> /* Debugging and auditing of the P2M code? 
*/ @@ -41,36 +42,37 @@ * Locking discipline: always acquire this lock before the shadow or HAP one */ -#define p2m_lock_init(_d) \ - do { \ - spin_lock_init(&(_d)->arch.p2m.lock); \ - (_d)->arch.p2m.locker = -1; \ - (_d)->arch.p2m.locker_function = "nobody"; \ +#define p2m_lock_init(_p2m) \ + do { \ + spin_lock_init(&(_p2m)->lock); \ + (_p2m)->locker = -1; \ + (_p2m)->locker_function = "nobody"; \ } while (0) -#define p2m_lock(_d) \ - do { \ - if ( unlikely((_d)->arch.p2m.locker == current->processor) )\ - { \ - printk("Error: p2m lock held by %s\n", \ - (_d)->arch.p2m.locker_function); \ - BUG(); \ - } \ - spin_lock(&(_d)->arch.p2m.lock); \ - ASSERT((_d)->arch.p2m.locker == -1); \ - (_d)->arch.p2m.locker = current->processor; \ - (_d)->arch.p2m.locker_function = __func__; \ +#define p2m_lock(_p2m) \ + do { \ + if ( unlikely((_p2m)->locker == current->processor) ) \ + { \ + printk("Error: p2m lock held by %s\n", \ + (_p2m)->locker_function); \ + BUG(); \ + } \ + spin_lock(&(_p2m)->lock); \ + ASSERT((_p2m)->locker == -1); \ + (_p2m)->locker = current->processor; \ + (_p2m)->locker_function = __func__; \ } while (0) -#define p2m_unlock(_d) \ - do { \ - ASSERT((_d)->arch.p2m.locker == current->processor); \ - (_d)->arch.p2m.locker = -1; \ - (_d)->arch.p2m.locker_function = "nobody"; \ - spin_unlock(&(_d)->arch.p2m.lock); \ +#define p2m_unlock(_p2m) \ + do { \ + ASSERT((_p2m)->locker == current->processor); \ + (_p2m)->locker = -1; \ + (_p2m)->locker_function = "nobody"; \ + spin_unlock(&(_p2m)->lock); \ } while (0) - +#define p2m_locked_by_me(_p2m) \ + (current->processor == (_p2m)->locker) /* Printouts */ #define P2M_PRINTK(_f, _a...) \ @@ -152,7 +154,7 @@ p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, l1_pgentry_t *p2m_entry; l1_pgentry_t new_entry; void *next; - ASSERT(d->arch.p2m.alloc_page); + ASSERT(d->arch.p2m->alloc_page); if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn, shift, max)) ) @@ -160,10 +162,10 @@ p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) ) { - struct page_info *pg = d->arch.p2m.alloc_page(d); + struct page_info *pg = d->arch.p2m->alloc_page(d); if ( pg == NULL ) return 0; - list_add_tail(&pg->list, &d->arch.p2m.pages); + list_add_tail(&pg->list, &d->arch.p2m->pages); pg->u.inuse.type_info = type | 1 | PGT_validated; pg->count_info = 1; @@ -202,7 +204,7 @@ p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, // Returns 0 on error (out of memory) static int -set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt) +p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt) { // XXX -- this might be able to be faster iff current->domain == d mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table); @@ -244,8 +246,8 @@ set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt) ASSERT(p2m_entry); /* Track the highest gfn for which we have ever had a valid mapping */ - if ( mfn_valid(mfn) && (gfn > d->arch.p2m.max_mapped_pfn) ) - d->arch.p2m.max_mapped_pfn = gfn; + if ( mfn_valid(mfn) && (gfn > d->arch.p2m->max_mapped_pfn) ) + d->arch.p2m->max_mapped_pfn = gfn; if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) ) entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt)); @@ -279,14 +281,170 @@ set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt) return rv; } +static mfn_t +p2m_gfn_to_mfn(struct domain *d, unsigned long gfn, p2m_type_t *t) +{ + mfn_t mfn; + paddr_t addr = 
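The reworked lock macros above implement a spinlock that records its owner, so re-acquisition on the same CPU BUGs out immediately instead of deadlocking silently. The same discipline as plain functions (a sketch, not the macros' exact expansion; spinlock_t, BUG_ON and ASSERT as in Xen):

struct owner_lock {
    spinlock_t lock;
    int owner;                       /* processor id, -1 when free */
};

static void owner_lock_acquire(struct owner_lock *l, int cpu)
{
    BUG_ON(l->owner == cpu);         /* recursion would self-deadlock */
    spin_lock(&l->lock);
    ASSERT(l->owner == -1);
    l->owner = cpu;
}

static void owner_lock_release(struct owner_lock *l, int cpu)
{
    ASSERT(l->owner == cpu);
    l->owner = -1;
    spin_unlock(&l->lock);
}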
((paddr_t)gfn) << PAGE_SHIFT; + l2_pgentry_t *l2e; + l1_pgentry_t *l1e; + + ASSERT(paging_mode_translate(d)); + + /* XXX This is for compatibility with the old model, where anything not + * XXX marked as RAM was considered to be emulated MMIO space. + * XXX Once we start explicitly registering MMIO regions in the p2m + * XXX we will return p2m_invalid for unmapped gfns */ + *t = p2m_mmio_dm; + + mfn = pagetable_get_mfn(d->arch.phys_table); + + if ( gfn > d->arch.p2m->max_mapped_pfn ) + /* This pfn is higher than the highest the p2m map currently holds */ + return _mfn(INVALID_MFN); + +#if CONFIG_PAGING_LEVELS >= 4 + { + l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn)); + l4e += l4_table_offset(addr); + if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 ) + { + unmap_domain_page(l4e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l4e_get_pfn(*l4e)); + unmap_domain_page(l4e); + } +#endif +#if CONFIG_PAGING_LEVELS >= 3 + { + l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn)); +#if CONFIG_PAGING_LEVELS == 3 + /* On PAE hosts the p2m has eight l3 entries, not four (see + * shadow_set_p2m_entry()) so we can't use l3_table_offset. + * Instead, just count the number of l3es from zero. It's safe + * to do this because we already checked that the gfn is within + * the bounds of the p2m. */ + l3e += (addr >> L3_PAGETABLE_SHIFT); +#else + l3e += l3_table_offset(addr); +#endif + if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 ) + { + unmap_domain_page(l3e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l3e_get_pfn(*l3e)); + unmap_domain_page(l3e); + } +#endif + + l2e = map_domain_page(mfn_x(mfn)); + l2e += l2_table_offset(addr); + if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 ) + { + unmap_domain_page(l2e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l2e_get_pfn(*l2e)); + unmap_domain_page(l2e); + + l1e = map_domain_page(mfn_x(mfn)); + l1e += l1_table_offset(addr); + if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 ) + { + unmap_domain_page(l1e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l1e_get_pfn(*l1e)); + *t = p2m_flags_to_type(l1e_get_flags(*l1e)); + unmap_domain_page(l1e); + + ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t)); + return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN); +} + +/* Read the current domain's p2m table (through the linear mapping). */ +static mfn_t p2m_gfn_to_mfn_current(unsigned long gfn, p2m_type_t *t) +{ + mfn_t mfn = _mfn(INVALID_MFN); + p2m_type_t p2mt = p2m_mmio_dm; + /* XXX This is for compatibility with the old model, where anything not + * XXX marked as RAM was considered to be emulated MMIO space. 
+ * XXX Once we start explicitly registering MMIO regions in the p2m + * XXX we will return p2m_invalid for unmapped gfns */ + + if ( gfn <= current->domain->arch.p2m->max_mapped_pfn ) + { + l1_pgentry_t l1e = l1e_empty(); + int ret; + + ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) + / sizeof(l1_pgentry_t)); + + /* Need to __copy_from_user because the p2m is sparse and this + * part might not exist */ + ret = __copy_from_user(&l1e, + &phys_to_machine_mapping[gfn], + sizeof(l1e)); + + if ( ret == 0 ) { + p2mt = p2m_flags_to_type(l1e_get_flags(l1e)); + ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt)); + if ( p2m_is_valid(p2mt) ) + mfn = _mfn(l1e_get_pfn(l1e)); + else + /* XXX see above */ + p2mt = p2m_mmio_dm; + } + } + + *t = p2mt; + return mfn; +} /* Init the datastructures for later use by the p2m code */ -void p2m_init(struct domain *d) +int p2m_init(struct domain *d) { - p2m_lock_init(d); - INIT_LIST_HEAD(&d->arch.p2m.pages); + struct p2m_domain *p2m; + + p2m = xmalloc(struct p2m_domain); + if ( p2m == NULL ) + return -ENOMEM; + + d->arch.p2m = p2m; + + memset(p2m, 0, sizeof(*p2m)); + p2m_lock_init(p2m); + INIT_LIST_HEAD(&p2m->pages); + + p2m->set_entry = p2m_set_entry; + p2m->get_entry = p2m_gfn_to_mfn; + p2m->get_entry_current = p2m_gfn_to_mfn_current; + p2m->change_entry_type_global = p2m_change_type_global; + + if ( is_hvm_domain(d) && d->arch.hvm_domain.hap_enabled && + (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) ) + ept_p2m_init(d); + + return 0; } +void p2m_change_entry_type_global(struct domain *d, + p2m_type_t ot, p2m_type_t nt) +{ + struct p2m_domain *p2m = d->arch.p2m; + + p2m_lock(p2m); + p2m->change_entry_type_global(d, ot, nt); + p2m_unlock(p2m); +} + +static inline +int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt) +{ + return d->arch.p2m->set_entry(d, gfn, mfn, p2mt); +} // Allocate a new p2m table for a domain. 
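With p2m_init() installing per-domain method pointers, callers stop hard-coding the table format: EPT domains get the ept_* methods while everything else keeps the defaults installed above. The dispatch shape, sketched (gfn_to_mfn_sketch is a hypothetical name):

static inline mfn_t gfn_to_mfn_sketch(struct domain *d,
                                      unsigned long gfn, p2m_type_t *t)
{
    return d->arch.p2m->get_entry(d, gfn, t);
}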
// @@ -308,28 +466,29 @@ int p2m_alloc_table(struct domain *d, struct page_info *page, *p2m_top; unsigned int page_count = 0; unsigned long gfn = -1UL; + struct p2m_domain *p2m = d->arch.p2m; - p2m_lock(d); + p2m_lock(p2m); if ( pagetable_get_pfn(d->arch.phys_table) != 0 ) { P2M_ERROR("p2m already allocated for this domain\n"); - p2m_unlock(d); + p2m_unlock(p2m); return -EINVAL; } P2M_PRINTK("allocating p2m table\n"); - d->arch.p2m.alloc_page = alloc_page; - d->arch.p2m.free_page = free_page; + p2m->alloc_page = alloc_page; + p2m->free_page = free_page; - p2m_top = d->arch.p2m.alloc_page(d); + p2m_top = p2m->alloc_page(d); if ( p2m_top == NULL ) { - p2m_unlock(d); + p2m_unlock(p2m); return -ENOMEM; } - list_add_tail(&p2m_top->list, &d->arch.p2m.pages); + list_add_tail(&p2m_top->list, &p2m->pages); p2m_top->count_info = 1; p2m_top->u.inuse.type_info = @@ -376,13 +535,13 @@ int p2m_alloc_table(struct domain *d, #endif P2M_PRINTK("p2m table initialised (%u pages)\n", page_count); - p2m_unlock(d); + p2m_unlock(p2m); return 0; error: P2M_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" PRI_mfn "\n", gfn, mfn_x(mfn)); - p2m_unlock(d); + p2m_unlock(p2m); return -ENOMEM; } @@ -392,101 +551,24 @@ void p2m_teardown(struct domain *d) { struct list_head *entry, *n; struct page_info *pg; + struct p2m_domain *p2m = d->arch.p2m; - p2m_lock(d); + p2m_lock(p2m); d->arch.phys_table = pagetable_null(); - list_for_each_safe(entry, n, &d->arch.p2m.pages) + list_for_each_safe(entry, n, &p2m->pages) { pg = list_entry(entry, struct page_info, list); list_del(entry); - d->arch.p2m.free_page(d, pg); + p2m->free_page(d, pg); } - p2m_unlock(d); + p2m_unlock(p2m); } -mfn_t -gfn_to_mfn_foreign(struct domain *d, unsigned long gfn, p2m_type_t *t) -/* Read another domain's p2m entries */ +void p2m_final_teardown(struct domain *d) { - mfn_t mfn; - paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT; - l2_pgentry_t *l2e; - l1_pgentry_t *l1e; - - ASSERT(paging_mode_translate(d)); - - /* XXX This is for compatibility with the old model, where anything not - * XXX marked as RAM was considered to be emulated MMIO space. - * XXX Once we start explicitly registering MMIO regions in the p2m - * XXX we will return p2m_invalid for unmapped gfns */ - *t = p2m_mmio_dm; - - mfn = pagetable_get_mfn(d->arch.phys_table); - - if ( gfn > d->arch.p2m.max_mapped_pfn ) - /* This pfn is higher than the highest the p2m map currently holds */ - return _mfn(INVALID_MFN); - -#if CONFIG_PAGING_LEVELS >= 4 - { - l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn)); - l4e += l4_table_offset(addr); - if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 ) - { - unmap_domain_page(l4e); - return _mfn(INVALID_MFN); - } - mfn = _mfn(l4e_get_pfn(*l4e)); - unmap_domain_page(l4e); - } -#endif -#if CONFIG_PAGING_LEVELS >= 3 - { - l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn)); -#if CONFIG_PAGING_LEVELS == 3 - /* On PAE hosts the p2m has eight l3 entries, not four (see - * shadow_set_p2m_entry()) so we can't use l3_table_offset. - * Instead, just count the number of l3es from zero. It's safe - * to do this because we already checked that the gfn is within - * the bounds of the p2m. 
*/ - l3e += (addr >> L3_PAGETABLE_SHIFT); -#else - l3e += l3_table_offset(addr); -#endif - if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 ) - { - unmap_domain_page(l3e); - return _mfn(INVALID_MFN); - } - mfn = _mfn(l3e_get_pfn(*l3e)); - unmap_domain_page(l3e); - } -#endif - - l2e = map_domain_page(mfn_x(mfn)); - l2e += l2_table_offset(addr); - if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 ) - { - unmap_domain_page(l2e); - return _mfn(INVALID_MFN); - } - mfn = _mfn(l2e_get_pfn(*l2e)); - unmap_domain_page(l2e); - - l1e = map_domain_page(mfn_x(mfn)); - l1e += l1_table_offset(addr); - if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 ) - { - unmap_domain_page(l1e); - return _mfn(INVALID_MFN); - } - mfn = _mfn(l1e_get_pfn(*l1e)); - *t = p2m_flags_to_type(l1e_get_flags(*l1e)); - unmap_domain_page(l1e); - - ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t)); - return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN); + xfree(d->arch.p2m); + d->arch.p2m = NULL; } #if P2M_AUDIT @@ -564,7 +646,7 @@ static void audit_p2m(struct domain *d) set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); } - if ( test_linear && (gfn <= d->arch.p2m.max_mapped_pfn) ) + if ( test_linear && (gfn <= d->arch.p2m->max_mapped_pfn) ) { lp2mfn = mfn_x(gfn_to_mfn_current(gfn, &type)); if ( lp2mfn != mfn_x(p2mfn) ) @@ -695,11 +777,11 @@ void guest_physmap_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn) { - p2m_lock(d); + p2m_lock(d->arch.p2m); audit_p2m(d); p2m_remove_page(d, gfn, mfn); audit_p2m(d); - p2m_unlock(d); + p2m_unlock(d->arch.p2m); } int @@ -722,7 +804,7 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn, */ if ( paging_mode_hap(d) && (gfn > 0xfffffUL) ) { - if ( !test_and_set_bool(d->arch.hvm_domain.amd_npt_4gb_warning) ) + if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) ) dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond" " 4GB: specify 'hap=0' domain config option.\n", d->domain_id); @@ -730,7 +812,7 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn, } #endif - p2m_lock(d); + p2m_lock(d->arch.p2m); audit_p2m(d); P2M_DEBUG("adding gfn=%#lx mfn=%#lx\n", gfn, mfn); @@ -781,7 +863,7 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn, } audit_p2m(d); - p2m_unlock(d); + p2m_unlock(d->arch.p2m); return rc; } @@ -812,7 +894,7 @@ void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt) if ( pagetable_get_pfn(d->arch.phys_table) == 0 ) return; - p2m_lock(d); + ASSERT(p2m_locked_by_me(d->arch.p2m)); #if CONFIG_PAGING_LEVELS == 4 l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); @@ -860,7 +942,7 @@ void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt) mfn = l1e_get_pfn(l1e[i1]); gfn = get_gpfn_from_mfn(mfn); /* create a new 1le entry with the new type */ - flags = p2m_flags_to_type(nt); + flags = p2m_type_to_flags(nt); l1e_content = l1e_from_pfn(mfn, flags); paging_write_p2m_entry(d, gfn, &l1e[i1], l1mfn, l1e_content, 1); @@ -884,7 +966,6 @@ void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt) unmap_domain_page(l2e); #endif - p2m_unlock(d); } /* Modify the p2m type of a single gfn from ot to nt, returning the @@ -895,13 +976,13 @@ p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn, p2m_type_t pt; mfn_t mfn; - p2m_lock(d); + p2m_lock(d->arch.p2m); mfn = gfn_to_mfn(d, gfn, &pt); if ( pt == ot ) set_p2m_entry(d, gfn, mfn, nt); - p2m_unlock(d); + p2m_unlock(d->arch.p2m); return pt; } diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c index 
e6c3cbb9e6..2247d8dd68 100644 --- a/xen/arch/x86/mm/paging.c +++ b/xen/arch/x86/mm/paging.c @@ -26,6 +26,7 @@ #include <asm/p2m.h> #include <asm/hap.h> #include <asm/guest_access.h> +#include <xen/numa.h> #include <xsm/xsm.h> #define hap_enabled(d) (is_hvm_domain(d) && (d)->arch.hvm_domain.hap_enabled) @@ -99,8 +100,9 @@ static mfn_t paging_new_log_dirty_page(struct domain *d, void **mapping_p) { mfn_t mfn; - struct page_info *page = alloc_domheap_page(NULL); + struct page_info *page; + page = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d))); if ( unlikely(page == NULL) ) { d->arch.paging.log_dirty.failed_allocs++; @@ -482,9 +484,12 @@ void paging_log_dirty_teardown(struct domain*d) /* CODE FOR PAGING SUPPORT */ /************************************************/ /* Domain paging struct initialization. */ -void paging_domain_init(struct domain *d) +int paging_domain_init(struct domain *d) { - p2m_init(d); + int rc; + + if ( (rc = p2m_init(d)) != 0 ) + return rc; /* The order of the *_init calls below is important, as the later * ones may rewrite some common fields. Shadow pagetables are the @@ -494,6 +499,8 @@ void paging_domain_init(struct domain *d) /* ... but we will use hardware assistance if it's available. */ if ( hap_enabled(d) ) hap_domain_init(d); + + return 0; } /* vcpu paging struct initialization goes here */ @@ -587,6 +594,8 @@ void paging_final_teardown(struct domain *d) hap_final_teardown(d); else shadow_final_teardown(d); + + p2m_final_teardown(d); } /* Enable an arbitrary paging-assistance mode. Call once at domain diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c index e4a04bb456..d7239cde77 100644 --- a/xen/arch/x86/mm/shadow/common.c +++ b/xen/arch/x86/mm/shadow/common.c @@ -36,6 +36,7 @@ #include <asm/current.h> #include <asm/flushtlb.h> #include <asm/shadow.h> +#include <xen/numa.h> #include "private.h" @@ -1249,7 +1250,7 @@ static unsigned int sh_set_allocation(struct domain *d, { /* Need to allocate more memory from domheap */ sp = (struct shadow_page_info *) - alloc_domheap_pages(NULL, order, 0); + alloc_domheap_pages(NULL, order, MEMF_node(domain_to_node(d))); if ( sp == NULL ) { SHADOW_PRINTK("failed to allocate shadow pages.\n"); @@ -2171,13 +2172,12 @@ void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all) #undef DO_UNSHADOW /* If that didn't catch the shadows, something is wrong */ - if ( !fast && (pg->count_info & PGC_page_table) ) + if ( !fast && all && (pg->count_info & PGC_page_table) ) { SHADOW_ERROR("can't find all shadows of mfn %05lx " "(shadow_flags=%08lx)\n", mfn_x(gmfn), pg->shadow_flags); - if ( all ) - domain_crash(v->domain); + domain_crash(v->domain); } /* Need to flush TLBs now, so that linear maps are safe next time we diff --git a/xen/arch/x86/pci.c b/xen/arch/x86/pci.c new file mode 100644 index 0000000000..341457b4bc --- /dev/null +++ b/xen/arch/x86/pci.c @@ -0,0 +1,118 @@ +/****************************************************************************** + * pci.c + * + * PCI access functions. 
+ */ + +#include <xen/config.h> +#include <xen/pci.h> +#include <xen/spinlock.h> +#include <asm/io.h> + +#define PCI_CONF_ADDRESS(bus, dev, func, reg) \ + (0x80000000 | (bus << 16) | (dev << 11) | (func << 8) | (reg & ~3)) + +static DEFINE_SPINLOCK(pci_config_lock); + +uint32_t pci_conf_read(uint32_t cf8, uint8_t offset, uint8_t bytes) +{ + unsigned long flags; + uint32_t value; + + BUG_ON((offset + bytes) > 4); + + spin_lock_irqsave(&pci_config_lock, flags); + + outl(cf8, 0xcf8); + + switch ( bytes ) + { + case 1: + value = inb(0xcfc + offset); + break; + case 2: + value = inw(0xcfc + offset); + break; + case 4: + value = inl(0xcfc + offset); + break; + default: + value = 0; + BUG(); + } + + spin_unlock_irqrestore(&pci_config_lock, flags); + + return value; +} + +void pci_conf_write(uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data) +{ + unsigned long flags; + + BUG_ON((offset + bytes) > 4); + + spin_lock_irqsave(&pci_config_lock, flags); + + outl(cf8, 0xcf8); + + switch ( bytes ) + { + case 1: + outb((uint8_t)data, 0xcfc + offset); + break; + case 2: + outw((uint16_t)data, 0xcfc + offset); + break; + case 4: + outl(data, 0xcfc + offset); + break; + } + + spin_unlock_irqrestore(&pci_config_lock, flags); +} + +uint8_t pci_conf_read8( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + return pci_conf_read(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 3, 1); +} + +uint16_t pci_conf_read16( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + return pci_conf_read(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 2, 2); +} + +uint32_t pci_conf_read32( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + return pci_conf_read(PCI_CONF_ADDRESS(bus, dev, func, reg), 0, 4); +} + +void pci_conf_write8( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, + uint8_t data) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 3, 1, data); +} + +void pci_conf_write16( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, + uint16_t data) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 2, 2, data); +} + +void pci_conf_write32( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, + uint32_t data) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), 0, 4, data); +} diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index 383a868225..9b025b51b1 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -861,6 +861,8 @@ void __init __start_xen(unsigned long mbi_p) early_boot = 0; + softirq_init(); + early_cpu_init(); paging_init(); diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c index 57135940bf..ccefc50cf2 100644 --- a/xen/arch/x86/time.c +++ b/xen/arch/x86/time.c @@ -40,7 +40,7 @@ string_param("clocksource", opt_clocksource); unsigned long cpu_khz; /* CPU clock frequency in kHz. */ unsigned long hpet_address; DEFINE_SPINLOCK(rtc_lock); -volatile unsigned long jiffies; +unsigned long pit0_ticks; static u32 wc_sec, wc_nsec; /* UTC time at last 'time update'. 
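PCI_CONF_ADDRESS() above is configuration mechanism #1: the enable bit (bit 31) plus bus/device/function/register packed below it, with the register's low two bits selecting the byte lane on the 0xcfc data port. A standalone worked example:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    unsigned int bus = 0, dev = 0x18, func = 3, reg = 0x87;
    uint32_t cf8 = 0x80000000u | (bus << 16) | (dev << 11)
                   | (func << 8) | (reg & ~3u);

    /* Prints CF8 = 0x8000c384, data port = 0xcff. */
    printf("CF8 = 0x%08x, data port = 0x%x\n", cf8, 0xcfc + (reg & 3));
    return 0;
}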
*/ static DEFINE_SPINLOCK(wc_lock); @@ -67,19 +67,16 @@ struct platform_timesource { static DEFINE_PER_CPU(struct cpu_time, cpu_time); /* - * Protected by platform_timer_lock, which must be acquired with interrupts - * disabled because plt_overflow() is called from PIT ch0 interrupt context. - */ -static s_time_t stime_platform_stamp; -static u64 platform_timer_stamp; -static DEFINE_SPINLOCK(platform_timer_lock); - -/* - * Folding platform timer into 64-bit software counter is a really critical - * operation! We therefore do it directly in PIT ch0 interrupt handler. + * We simulate a 32-bit platform timer from the 16-bit PIT ch2 counter. + * Otherwise overflow happens too quickly (~50ms) for us to guarantee that + * softirq handling will happen in time. + * + * The pit_lock protects the 16- and 32-bit stamp fields as well as the hardware counter reads. */ -static u32 plt_overflow_jiffies; -static void plt_overflow(void); +static DEFINE_SPINLOCK(pit_lock); +static u16 pit_stamp16; +static u32 pit_stamp32; +static int using_pit; /* * 32-bit division of integer dividend and integer divisor yielding @@ -146,22 +143,36 @@ static inline u64 scale_delta(u64 delta, struct time_scale *scale) return product; } -void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs) +static void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs) { ASSERT(local_irq_is_enabled()); - /* Update jiffies counter. */ - (*(volatile unsigned long *)&jiffies)++; + /* Only for start-of-day interrupt tests in io_apic.c. */ + (*(volatile unsigned long *)&pit0_ticks)++; /* Rough hack to allow accurate timers to sort-of-work with no APIC. */ if ( !cpu_has_apic ) raise_softirq(TIMER_SOFTIRQ); - if ( --plt_overflow_jiffies == 0 ) - plt_overflow(); + /* Emulate a 32-bit PIT counter. */ + if ( using_pit ) + { + u16 count; + + spin_lock_irq(&pit_lock); + + outb(0x80, PIT_MODE); + count = inb(PIT_CH2); + count |= inb(PIT_CH2) << 8; + + pit_stamp32 += (u16)(pit_stamp16 - count); + pit_stamp16 = count; + + spin_unlock_irq(&pit_lock); + } } -static struct irqaction irq0 = { timer_interrupt, "timer", NULL}; +static struct irqaction irq0 = { timer_interrupt, "timer", NULL }; /* ------ Calibrate the TSC ------- * Return processor ticks per second / CALIBRATE_FRAC. 
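The stamp arithmetic above is the standard trick for widening a narrow down-counter: PIT channel 2 counts down, so the ticks elapsed since the last sample are (u16)(pit_stamp16 - count), and accumulating that difference into pit_stamp32 yields a free-running 32-bit counter as long as timer_interrupt() samples the hardware at least once per 16-bit wrap (about 55ms at the PIT's 1.193182 MHz, matching the "~50ms" note above). A minimal self-contained sketch of the same arithmetic, with illustrative names rather than Xen's:

    #include <assert.h>
    #include <stdint.h>

    static uint16_t stamp16;   /* last latched hardware value */
    static uint32_t stamp32;   /* simulated wide counter */

    static void fold(uint16_t hw)
    {
        /* Down-counter: elapsed ticks = old - new, modulo 2^16. */
        stamp32 += (uint16_t)(stamp16 - hw);
        stamp16 = hw;
    }

    int main(void)
    {
        stamp16 = 0x0010;
        fold(0xfff0);            /* counter ran through zero and wrapped */
        assert(stamp32 == 0x20); /* 0x10 ticks to zero, 0x10 more after */
        return 0;
    }

read_pit_count() below applies the same arithmetic on demand (without advancing the stamps), which is why init_pit() can advertise 32 counter_bits to the platform-timer code even though the hardware latch is only 16 bits wide.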
@@ -295,12 +306,21 @@ static char *freq_string(u64 freq) static u32 read_pit_count(void) { - u16 count; - ASSERT(spin_is_locked(&platform_timer_lock)); + u16 count16; + u32 count32; + unsigned long flags; + + spin_lock_irqsave(&pit_lock, flags); + outb(0x80, PIT_MODE); - count = inb(PIT_CH2); - count |= inb(PIT_CH2) << 8; - return ~count; + count16 = inb(PIT_CH2); + count16 |= inb(PIT_CH2) << 8; + + count32 = pit_stamp32 + (u16)(pit_stamp16 - count16); + + spin_unlock_irqrestore(&pit_lock, flags); + + return count32; } static void init_pit(struct platform_timesource *pts) @@ -308,7 +328,8 @@ static void init_pit(struct platform_timesource *pts) pts->name = "PIT"; pts->frequency = CLOCK_TICK_RATE; pts->read_counter = read_pit_count; - pts->counter_bits = 16; + pts->counter_bits = 32; + using_pit = 1; } /************************************************************ @@ -466,24 +487,28 @@ static int init_pmtimer(struct platform_timesource *pts) static struct platform_timesource plt_src; /* details of chosen timesource */ static u32 plt_mask; /* hardware-width mask */ -static u32 plt_overflow_period; /* jiffies between calls to plt_overflow() */ +static u64 plt_overflow_period; /* ns between calls to plt_overflow() */ static struct time_scale plt_scale; /* scale: platform counter -> nanosecs */ /* Protected by platform_timer_lock. */ -static u64 plt_count64; /* 64-bit platform counter stamp */ -static u32 plt_count; /* hardware-width platform counter stamp */ +static DEFINE_SPINLOCK(platform_timer_lock); +static s_time_t stime_platform_stamp; /* System time at below platform time */ +static u64 platform_timer_stamp; /* Platform time at above system time */ +static u64 plt_stamp64; /* 64-bit platform counter stamp */ +static u32 plt_stamp; /* hardware-width platform counter stamp */ +static struct timer plt_overflow_timer; -static void plt_overflow(void) +static void plt_overflow(void *unused) { u32 count; - unsigned long flags; - spin_lock_irqsave(&platform_timer_lock, flags); + spin_lock(&platform_timer_lock); count = plt_src.read_counter(); - plt_count64 += (count - plt_count) & plt_mask; - plt_count = count; - plt_overflow_jiffies = plt_overflow_period; - spin_unlock_irqrestore(&platform_timer_lock, flags); + plt_stamp64 += (count - plt_stamp) & plt_mask; + plt_stamp = count; + spin_unlock(&platform_timer_lock); + + set_timer(&plt_overflow_timer, NOW() + plt_overflow_period); } static s_time_t __read_platform_stime(u64 platform_time) @@ -497,12 +522,11 @@ static s_time_t read_platform_stime(void) { u64 count; s_time_t stime; - unsigned long flags; - spin_lock_irqsave(&platform_timer_lock, flags); - count = plt_count64 + ((plt_src.read_counter() - plt_count) & plt_mask); + spin_lock(&platform_timer_lock); + count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask); stime = __read_platform_stime(count); - spin_unlock_irqrestore(&platform_timer_lock, flags); + spin_unlock(&platform_timer_lock); return stime; } @@ -511,27 +535,25 @@ static void platform_time_calibration(void) { u64 count; s_time_t stamp; - unsigned long flags; - spin_lock_irqsave(&platform_timer_lock, flags); - count = plt_count64 + ((plt_src.read_counter() - plt_count) & plt_mask); + spin_lock(&platform_timer_lock); + count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask); stamp = __read_platform_stime(count); stime_platform_stamp = stamp; platform_timer_stamp = count; - spin_unlock_irqrestore(&platform_timer_lock, flags); + spin_unlock(&platform_timer_lock); } static void 
resume_platform_timer(void) { /* No change in platform_stime across suspend/resume. */ - platform_timer_stamp = plt_count64; - plt_count = plt_src.read_counter(); + platform_timer_stamp = plt_stamp64; + plt_stamp = plt_src.read_counter(); } static void init_platform_timer(void) { struct platform_timesource *pts = &plt_src; - u64 overflow_period; int rc = -1; if ( opt_clocksource[0] != '\0' ) @@ -561,13 +583,12 @@ static void init_platform_timer(void) set_time_scale(&plt_scale, pts->frequency); - overflow_period = scale_delta(1ull << (pts->counter_bits-1), &plt_scale); - do_div(overflow_period, MILLISECS(1000/HZ)); - plt_overflow_period = overflow_period; - plt_overflow(); - printk("Platform timer overflows in %d jiffies.\n", plt_overflow_period); + plt_overflow_period = scale_delta( + 1ull << (pts->counter_bits-1), &plt_scale); + init_timer(&plt_overflow_timer, plt_overflow, NULL, 0); + plt_overflow(NULL); - platform_timer_stamp = plt_count64; + platform_timer_stamp = plt_stamp64; printk("Platform timer is %s %s\n", freq_string(pts->frequency), pts->name); @@ -969,6 +990,19 @@ void __init early_time_init(void) setup_irq(0, &irq0); } +static int __init disable_pit_irq(void) +{ + if ( !using_pit && cpu_has_apic ) + { + /* Disable PIT CH0 timer interrupt. */ + outb_p(0x30, PIT_MODE); + outb_p(0, PIT_CH0); + outb_p(0, PIT_CH0); + } + return 0; +} +__initcall(disable_pit_irq); + void send_timer_event(struct vcpu *v) { send_guest_vcpu_virq(v, VIRQ_TIMER); @@ -1002,6 +1036,8 @@ int time_resume(void) { u64 tmp = init_pit_and_calibrate_tsc(); + disable_pit_irq(); + set_time_scale(&this_cpu(cpu_time).tsc_scale, tmp); resume_platform_timer(); @@ -1019,7 +1055,7 @@ int time_resume(void) int dom0_pit_access(struct ioreq *ioreq) { /* Is Xen using Channel 2? Then disallow direct dom0 access. */ - if ( plt_src.read_counter == read_pit_count ) + if ( using_pit ) return 0; switch ( ioreq->addr ) diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index 019e3e56cf..5e39c9b417 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -1305,23 +1305,24 @@ static int read_gate_descriptor(unsigned int gate_sel, const struct desc_struct *pdesc; - pdesc = (const struct desc_struct *)(!(gate_sel & 4) ? - GDT_VIRT_START(v) : - LDT_VIRT_START(v)) - + (gate_sel >> 3); - if ( gate_sel < 4 || - (gate_sel >= FIRST_RESERVED_GDT_BYTE && !(gate_sel & 4)) || + pdesc = (const struct desc_struct *) + (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v)) + + (gate_sel >> 3); + if ( (gate_sel < 4) || + ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) || __get_user(desc, pdesc) ) return 0; *sel = (desc.a >> 16) & 0x0000fffc; *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000); *ar = desc.b & 0x0000ffff; + /* * check_descriptor() clears the DPL field and stores the * guest requested DPL in the selector's RPL field. */ - ASSERT(!(*ar & _SEGMENT_DPL)); + if ( *ar & _SEGMENT_DPL ) + return 0; *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL; if ( !is_pv_32bit_vcpu(v) ) @@ -1352,7 +1353,7 @@ static int read_gate_descriptor(unsigned int gate_sel, #endif /* Has the guest requested sufficient permission for this I/O access? */ -static inline int guest_io_okay( +static int guest_io_okay( unsigned int port, unsigned int bytes, struct vcpu *v, struct cpu_user_regs *regs) { @@ -1394,19 +1395,130 @@ static inline int guest_io_okay( } /* Has the administrator granted sufficient permission for this I/O access? 
*/ -static inline int admin_io_okay( +static int admin_io_okay( unsigned int port, unsigned int bytes, struct vcpu *v, struct cpu_user_regs *regs) { return ioports_access_permitted(v->domain, port, port + bytes - 1); } -#define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r) -#define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r) -#define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r) -#define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r) -#define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r) -#define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r) +static uint32_t guest_io_read( + unsigned int port, unsigned int bytes, + struct vcpu *v, struct cpu_user_regs *regs) +{ + extern uint32_t pci_conf_read( + uint32_t cf8, uint8_t offset, uint8_t bytes); + + uint32_t data = 0; + unsigned int shift = 0; + + if ( admin_io_okay(port, bytes, v, regs) ) + { + switch ( bytes ) + { + case 1: return inb(port); + case 2: return inw(port); + case 4: return inl(port); + } + } + + while ( bytes != 0 ) + { + unsigned int size = 1; + uint32_t sub_data = 0xff; + + if ( (port == 0x42) || (port == 0x43) || (port == 0x61) ) + { + sub_data = pv_pit_handler(port, 0, 0); + } + else if ( (port & 0xfffc) == 0xcf8 ) + { + size = min(bytes, 4 - (port & 3)); + sub_data = v->domain->arch.pci_cf8 >> ((port & 3) * 8); + } + else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) ) + { + size = min(bytes, 4 - (port & 3)); + if ( size == 3 ) + size = 2; + sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size); + } + + if ( size == 4 ) + return sub_data; + + data |= (sub_data & ((1u << (size * 8)) - 1)) << shift; + shift += size * 8; + port += size; + bytes -= size; + } + + return data; +} + +static void guest_io_write( + unsigned int port, unsigned int bytes, uint32_t data, + struct vcpu *v, struct cpu_user_regs *regs) +{ + extern void pci_conf_write( + uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data); + + if ( admin_io_okay(port, bytes, v, regs) ) + { + switch ( bytes ) { + case 1: + outb((uint8_t)data, port); + if ( pv_post_outb_hook ) + pv_post_outb_hook(port, (uint8_t)data); + break; + case 2: + outw((uint16_t)data, port); + break; + case 4: + outl(data, port); + break; + } + return; + } + + while ( bytes != 0 ) + { + unsigned int size = 1; + + if ( (port == 0x42) || (port == 0x43) || (port == 0x61) ) + { + pv_pit_handler(port, (uint8_t)data, 1); + } + else if ( (port & 0xfffc) == 0xcf8 ) + { + size = min(bytes, 4 - (port & 3)); + if ( size == 4 ) + { + v->domain->arch.pci_cf8 = data; + } + else + { + uint32_t mask = ((1u << (size * 8)) - 1) << ((port & 3) * 8); + v->domain->arch.pci_cf8 &= ~mask; + v->domain->arch.pci_cf8 |= (data << ((port & 3) * 8)) & mask; + } + } + else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) ) + { + size = min(bytes, 4 - (port & 3)); + if ( size == 3 ) + size = 2; + pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data); + } + + if ( size == 4 ) + return; + + port += size; + bytes -= size; + data >>= size * 8; + } +} /* I/O emulation support. Helper routines for, and type of, the stack stub.*/ void host_to_guest_gpr_switch(struct cpu_user_regs *) @@ -1525,7 +1637,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) /* REX prefix. */ if ( rex & 8 ) /* REX.W */ - op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */ + op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */ modrm_reg = (rex & 4) << 1; /* REX.R */ /* REX.X does not need to be decoded. 
*/ modrm_rm = (rex & 1) << 3; /* REX.B */ @@ -1554,7 +1666,8 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) { if ( !read_descriptor(data_sel, v, regs, &data_base, &data_limit, &ar, - _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) ) + _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL| + _SEGMENT_P) ) goto fail; if ( !(ar & _SEGMENT_S) || !(ar & _SEGMENT_P) || @@ -1601,69 +1714,39 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) case 0x6c: /* INSB */ op_bytes = 1; case 0x6d: /* INSW/INSL */ - if ( data_limit < op_bytes - 1 || - rd_ad(edi) > data_limit - (op_bytes - 1) || + if ( (data_limit < (op_bytes - 1)) || + (rd_ad(edi) > (data_limit - (op_bytes - 1))) || !guest_io_okay(port, op_bytes, v, regs) ) goto fail; - switch ( op_bytes ) - { - case 1: - /* emulate PIT counter 2 */ - data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) : - ((port == 0x42 || port == 0x43 || port == 0x61) ? - pv_pit_handler(port, 0, 0) : ~0)); - break; - case 2: - data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0); - break; - case 4: - data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0); - break; - } - if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 ) + data = guest_io_read(port, op_bytes, v, regs); + if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), + &data, op_bytes)) != 0 ) { propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc, PFEC_write_access); return EXCRET_fault_fixed; } - wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes)); + wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) + ? -op_bytes : op_bytes)); break; case 0x6e: /* OUTSB */ op_bytes = 1; case 0x6f: /* OUTSW/OUTSL */ - if ( data_limit < op_bytes - 1 || - rd_ad(esi) > data_limit - (op_bytes - 1) || - !guest_io_okay(port, op_bytes, v, regs) ) + if ( (data_limit < (op_bytes - 1)) || + (rd_ad(esi) > (data_limit - (op_bytes - 1))) || + !guest_io_okay(port, op_bytes, v, regs) ) goto fail; - rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes); - if ( rc != 0 ) + if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), + op_bytes)) != 0 ) { - propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0); + propagate_page_fault(data_base + rd_ad(esi) + + op_bytes - rc, 0); return EXCRET_fault_fixed; } - switch ( op_bytes ) - { - case 1: - if ( guest_outb_okay(port, v, regs) ) - { - outb((u8)data, port); - if ( pv_post_outb_hook ) - pv_post_outb_hook(port, data); - } - else if ( port == 0x42 || port == 0x43 || port == 0x61 ) - pv_pit_handler(port, data, 1); - break; - case 2: - if ( guest_outw_okay(port, v, regs) ) - outw((u16)data, port); - break; - case 4: - if ( guest_outl_okay(port, v, regs) ) - outl((u32)data, port); - break; - } - wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes)); + guest_io_write(port, op_bytes, data, v, regs); + wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) + ? 
-op_bytes : op_bytes)); break; } @@ -1727,31 +1810,17 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) exec_in: if ( !guest_io_okay(port, op_bytes, v, regs) ) goto fail; - switch ( op_bytes ) + if ( admin_io_okay(port, op_bytes, v, regs) ) { - case 1: - if ( guest_inb_okay(port, v, regs) ) - io_emul(regs); - else if ( port == 0x42 || port == 0x43 || port == 0x61 ) - { - regs->eax &= ~0xffUL; - regs->eax |= pv_pit_handler(port, 0, 0); - } - else - regs->eax |= (u8)~0; - break; - case 2: - if ( guest_inw_okay(port, v, regs) ) - io_emul(regs); - else - regs->eax |= (u16)~0; - break; - case 4: - if ( guest_inl_okay(port, v, regs) ) - io_emul(regs); + io_emul(regs); + } + else + { + if ( op_bytes == 4 ) + regs->eax = 0; else - regs->eax = (u32)~0; - break; + regs->eax &= ~((1u << (op_bytes * 8)) - 1); + regs->eax |= guest_io_read(port, op_bytes, v, regs); } bpmatch = check_guest_io_breakpoint(v, port, op_bytes); goto done; @@ -1770,26 +1839,15 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) exec_out: if ( !guest_io_okay(port, op_bytes, v, regs) ) goto fail; - switch ( op_bytes ) + if ( admin_io_okay(port, op_bytes, v, regs) ) { - case 1: - if ( guest_outb_okay(port, v, regs) ) - { - io_emul(regs); - if ( pv_post_outb_hook ) - pv_post_outb_hook(port, regs->eax); - } - else if ( port == 0x42 || port == 0x43 || port == 0x61 ) - pv_pit_handler(port, regs->eax, 1); - break; - case 2: - if ( guest_outw_okay(port, v, regs) ) - io_emul(regs); - break; - case 4: - if ( guest_outl_okay(port, v, regs) ) - io_emul(regs); - break; + io_emul(regs); + if ( (op_bytes == 1) && pv_post_outb_hook ) + pv_post_outb_hook(port, regs->eax); + } + else + { + guest_io_write(port, op_bytes, regs->eax, v, regs); } bpmatch = check_guest_io_breakpoint(v, port, op_bytes); goto done; @@ -1921,14 +1979,14 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) break; case 3: /* Write CR3 */ - LOCK_BIGLOCK(v->domain); + domain_lock(v->domain); if ( !is_pv_32on64_vcpu(v) ) rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg))); #ifdef CONFIG_COMPAT else rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg))); #endif - UNLOCK_BIGLOCK(v->domain); + domain_unlock(v->domain); if ( rc == 0 ) /* not okay */ goto fail; break; @@ -2137,8 +2195,8 @@ static void emulate_gate_op(struct cpu_user_regs *regs) /* Check whether this fault is due to the use of a call gate. 
*/ if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) || - ((ar >> 13) & 3) < (regs->cs & 3) || - (ar & _SEGMENT_TYPE) != 0xc00 ) + (((ar >> 13) & 3) < (regs->cs & 3)) || + ((ar & _SEGMENT_TYPE) != 0xc00) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; @@ -2232,15 +2290,18 @@ static void emulate_gate_op(struct cpu_user_regs *regs) { if ( (modrm & 7) == 4 ) { - unsigned int sib = insn_fetch(u8, base, eip, limit); + unsigned int sib; + sib = insn_fetch(u8, base, eip, limit); modrm = (modrm & ~7) | (sib & 7); if ( (sib >>= 3) != 4 ) - opnd_off = *(unsigned long *)decode_register(sib & 7, regs, 0); + opnd_off = *(unsigned long *) + decode_register(sib & 7, regs, 0); opnd_off <<= sib >> 3; } if ( (modrm & 7) != 5 || (modrm & 0xc0) ) - opnd_off += *(unsigned long *)decode_register(modrm & 7, regs, 0); + opnd_off += *(unsigned long *) + decode_register(modrm & 7, regs, 0); else modrm |= 0x87; if ( !opnd_sel ) @@ -2576,12 +2637,14 @@ asmlinkage void do_general_protection(struct cpu_user_regs *regs) panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code); } -static void nmi_softirq(void) +static void nmi_action(unsigned long unused) { /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */ vcpu_kick(dom0->vcpu[0]); } +static DECLARE_TASKLET(nmi_tasklet, nmi_action, 0); + static void nmi_dom0_report(unsigned int reason_idx) { struct domain *d; @@ -2593,7 +2656,7 @@ static void nmi_dom0_report(unsigned int reason_idx) set_bit(reason_idx, nmi_reason(d)); if ( !test_and_set_bool(v->nmi_pending) ) - raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */ + tasklet_schedule(&nmi_tasklet); /* not safe to wake a vcpu here */ } asmlinkage void mem_parity_error(struct cpu_user_regs *regs) @@ -2871,8 +2934,6 @@ void __init trap_init(void) percpu_traps_init(); cpu_init(); - - open_softirq(NMI_SOFTIRQ, nmi_softirq); } long register_guest_nmi_callback(unsigned long address) diff --git a/xen/arch/x86/x86_64/compat/mm.c b/xen/arch/x86/x86_64/compat/mm.c index 256f7a5ac8..a1de1bab27 100644 --- a/xen/arch/x86/x86_64/compat/mm.c +++ b/xen/arch/x86/x86_64/compat/mm.c @@ -28,12 +28,12 @@ int compat_set_gdt(XEN_GUEST_HANDLE(uint) frame_list, unsigned int entries) guest_handle_add_offset(frame_list, 1); } - LOCK_BIGLOCK(current->domain); + domain_lock(current->domain); if ( (ret = set_gdt(current, frames, entries)) == 0 ) flush_tlb_local(); - UNLOCK_BIGLOCK(current->domain); + domain_unlock(current->domain); return ret; } diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index f9f33e0a88..3d79657989 100644 --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -59,7 +59,7 @@ void *alloc_xen_pagetable(void) if ( !early_boot ) { - struct page_info *pg = alloc_domheap_page(NULL); + struct page_info *pg = alloc_domheap_page(NULL, 0); BUG_ON(pg == NULL); return page_to_virt(pg); } @@ -108,7 +108,7 @@ void __init paging_init(void) struct page_info *l1_pg, *l2_pg, *l3_pg; /* Create user-accessible L2 directory to map the MPT for guests. 
*/ - if ( (l3_pg = alloc_domheap_page(NULL)) == NULL ) + if ( (l3_pg = alloc_domheap_page(NULL, 0)) == NULL ) goto nomem; l3_ro_mpt = page_to_virt(l3_pg); clear_page(l3_ro_mpt); @@ -134,7 +134,7 @@ void __init paging_init(void) 1UL << L2_PAGETABLE_SHIFT); if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) ) { - if ( (l2_pg = alloc_domheap_page(NULL)) == NULL ) + if ( (l2_pg = alloc_domheap_page(NULL, 0)) == NULL ) goto nomem; va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT); l2_ro_mpt = page_to_virt(l2_pg); @@ -154,7 +154,7 @@ void __init paging_init(void) l4_table_offset(HIRO_COMPAT_MPT_VIRT_START)); l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset( HIRO_COMPAT_MPT_VIRT_START)]); - if ( (l2_pg = alloc_domheap_page(NULL)) == NULL ) + if ( (l2_pg = alloc_domheap_page(NULL, 0)) == NULL ) goto nomem; compat_idle_pg_table_l2 = l2_ro_mpt = page_to_virt(l2_pg); clear_page(l2_ro_mpt);
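Taken together, the new pci.c and the guest_io_read()/guest_io_write() hooks funnel every config-space access through one spinlocked accessor pair: writes to 0xcf8 are merely latched in arch.pci_cf8, and data-port accesses to 0xcfc-0xcff from a privileged domain are cracked into at most one dword-contained cycle (size = min(bytes, 4 - (port & 3)), with a size of 3 rounded down to 2) before being replayed via pci_conf_read()/pci_conf_write(). A hypothetical caller of the new helpers (illustrative only, not part of this patch) could probe bus 0 like this:

    #include <xen/lib.h>
    #include <xen/pci.h>

    /* Illustrative only: scan function 0 of every device on bus 0.
     * An all-ones vendor ID (0xffff) means no device answered. */
    static void __init probe_bus0(void)
    {
        unsigned int dev;

        for ( dev = 0; dev < 32; dev++ )
        {
            uint16_t vendor = pci_conf_read16(0, dev, 0, 0x00);

            if ( vendor == 0xffff )
                continue; /* empty slot */

            printk("00:%02x.0 vendor %04x device %04x\n",
                   dev, vendor, pci_conf_read16(0, dev, 0, 0x02));
        }
    }

Because each pci_conf_read16() call holds pci_config_lock across the 0xcf8 address write and the 0xcfc data read, concurrent callers (including the replayed dom0 accesses) can no longer interleave address and data cycles.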