Diffstat (limited to 'xen/arch/x86')
38 files changed, 1814 insertions, 1016 deletions
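Several of the changes below share one theme: 0xcf8/0xcfc PCI config-space accesses move out of per-file helpers (the pci_read_byte()/pci_write_byte() pair deleted from cpu/amd.c) into common pci_conf_read8()/pci_conf_write8() routines backed by the new pci.o, and construct_dom0() stops dom0 from touching ports 0xcf8-0xcff directly. As a minimal sketch of the mechanism-1 config cycle those helpers implement -- standalone and illustrative only; the conf_read8() name and the sys/io.h/iopl() scaffolding are user-space test conveniences, not part of this patch:

#include <stdint.h>
#include <stdio.h>
#include <sys/io.h>   /* iopl(), outl(), inb() -- Linux/x86 user space */

static uint8_t conf_read8(uint32_t bus, uint32_t dev, uint32_t fn, uint32_t reg)
{
    /* CONFIG_ADDRESS (0xcf8): bit 31 enables the cycle; bus/dev/fn/reg
     * select the target dword, so the low two bits of reg are masked off. */
    outl((1U << 31) | (bus << 16) | (dev << 11) | (fn << 8) | (reg & ~3), 0xcf8);
    /* CONFIG_DATA (0xcfc-0xcff): the low two bits of reg pick the byte lane. */
    return inb(0xcfc + (reg & 3));
}

int main(void)
{
    if ( iopl(3) != 0 )   /* raw port access needs root */
        return 1;
    /* Low byte of the vendor ID of device 00:00.0. */
    printf("%02x\n", conf_read8(0, 0, 0, 0));
    return 0;
}

The address/data pairing is also why dom0 loses these ports: a guest write to CONFIG_ADDRESS (0xcf8) landing between Xen's own outl() and the matching inb() from CONFIG_DATA would redirect the access.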
diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile index 086a7b530f..334a996eb6 100644 --- a/xen/arch/x86/Makefile +++ b/xen/arch/x86/Makefile @@ -31,6 +31,7 @@ obj-y += mm.o obj-y += mpparse.o obj-y += nmi.o obj-y += numa.o +obj-y += pci.o obj-y += physdev.o obj-y += rwlock.o obj-y += setup.o diff --git a/xen/arch/x86/acpi/boot.c b/xen/arch/x86/acpi/boot.c index cfe87671e9..9a17d61e3b 100644 --- a/xen/arch/x86/acpi/boot.c +++ b/xen/arch/x86/acpi/boot.c @@ -374,6 +374,18 @@ extern u32 pmtmr_ioport; #endif #ifdef CONFIG_ACPI_SLEEP +#define acpi_fadt_copy_address(dst, src, len) do { \ + if (fadt->header.revision >= FADT2_REVISION_ID) \ + acpi_sinfo.dst##_blk = fadt->x##src##_block; \ + if (!acpi_sinfo.dst##_blk.address) { \ + acpi_sinfo.dst##_blk.address = fadt->src##_block; \ + acpi_sinfo.dst##_blk.space_id = ACPI_ADR_SPACE_SYSTEM_IO; \ + acpi_sinfo.dst##_blk.bit_width = fadt->len##_length << 3; \ + acpi_sinfo.dst##_blk.bit_offset = 0; \ + acpi_sinfo.dst##_blk.access_width = 0; \ + } \ +} while (0) + /* Get pm1x_cnt and pm1x_evt information for ACPI sleep */ static void __init acpi_fadt_parse_sleep_info(struct acpi_table_fadt *fadt) @@ -388,37 +400,18 @@ acpi_fadt_parse_sleep_info(struct acpi_table_fadt *fadt) goto bad; rsdp = __va(rsdp_phys); - if (fadt->header.revision >= FADT2_REVISION_ID) { - memcpy(&acpi_sinfo.pm1a_cnt_blk, &fadt->xpm1a_control_block, - sizeof(struct acpi_generic_address)); - memcpy(&acpi_sinfo.pm1b_cnt_blk, &fadt->xpm1b_control_block, - sizeof(struct acpi_generic_address)); - memcpy(&acpi_sinfo.pm1a_evt_blk, &fadt->xpm1a_event_block, - sizeof(struct acpi_generic_address)); - memcpy(&acpi_sinfo.pm1b_evt_blk, &fadt->xpm1b_event_block, - sizeof(struct acpi_generic_address)); - } else { - acpi_sinfo.pm1a_cnt_blk.address = fadt->pm1a_control_block; - acpi_sinfo.pm1b_cnt_blk.address = fadt->pm1b_control_block; - acpi_sinfo.pm1a_evt_blk.address = fadt->pm1a_event_block; - acpi_sinfo.pm1b_evt_blk.address = fadt->pm1b_event_block; - acpi_sinfo.pm1a_cnt_blk.space_id = ACPI_ADR_SPACE_SYSTEM_IO; - acpi_sinfo.pm1b_cnt_blk.space_id = ACPI_ADR_SPACE_SYSTEM_IO; - acpi_sinfo.pm1a_evt_blk.space_id = ACPI_ADR_SPACE_SYSTEM_IO; - acpi_sinfo.pm1b_evt_blk.space_id = ACPI_ADR_SPACE_SYSTEM_IO; - acpi_sinfo.pm1a_cnt_blk.bit_width = 16; - acpi_sinfo.pm1b_cnt_blk.bit_width = 16; - acpi_sinfo.pm1a_evt_blk.bit_width = 16; - acpi_sinfo.pm1b_evt_blk.bit_width = 16; - acpi_sinfo.pm1a_cnt_blk.bit_offset = 0; - acpi_sinfo.pm1b_cnt_blk.bit_offset = 0; - acpi_sinfo.pm1a_evt_blk.bit_offset = 0; - acpi_sinfo.pm1b_evt_blk.bit_offset = 0; - acpi_sinfo.pm1a_cnt_blk.access_width = 0; - acpi_sinfo.pm1b_cnt_blk.access_width = 0; - acpi_sinfo.pm1a_evt_blk.access_width = 0; - acpi_sinfo.pm1b_evt_blk.access_width = 0; - } + acpi_fadt_copy_address(pm1a_cnt, pm1a_control, pm1_control); + acpi_fadt_copy_address(pm1b_cnt, pm1b_control, pm1_control); + acpi_fadt_copy_address(pm1a_evt, pm1a_event, pm1_event); + acpi_fadt_copy_address(pm1b_evt, pm1b_event, pm1_event); + + printk(KERN_INFO PREFIX + "ACPI SLEEP INFO: pm1x_cnt[%"PRIx64",%"PRIx64"], " + "pm1x_evt[%"PRIx64",%"PRIx64"]\n", + acpi_sinfo.pm1a_cnt_blk.address, + acpi_sinfo.pm1b_cnt_blk.address, + acpi_sinfo.pm1a_evt_blk.address, + acpi_sinfo.pm1b_evt_blk.address); /* Now FACS... 
*/ if (fadt->header.revision >= FADT2_REVISION_ID) @@ -461,13 +454,6 @@ acpi_fadt_parse_sleep_info(struct acpi_table_fadt *fadt) } printk(KERN_INFO PREFIX - "ACPI SLEEP INFO: pm1x_cnt[%"PRIx64",%"PRIx64"], " - "pm1x_evt[%"PRIx64",%"PRIx64"]\n", - acpi_sinfo.pm1a_cnt_blk.address, - acpi_sinfo.pm1b_cnt_blk.address, - acpi_sinfo.pm1a_evt_blk.address, - acpi_sinfo.pm1b_evt_blk.address); - printk(KERN_INFO PREFIX " wakeup_vec[%"PRIx64"], vec_size[%x]\n", acpi_sinfo.wakeup_vector, acpi_sinfo.vector_width); return; diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c index 909a73f3fa..f0253152bc 100644 --- a/xen/arch/x86/cpu/amd.c +++ b/xen/arch/x86/cpu/amd.c @@ -3,6 +3,7 @@ #include <xen/bitops.h> #include <xen/mm.h> #include <xen/smp.h> +#include <xen/pci.h> #include <asm/io.h> #include <asm/msr.h> #include <asm/processor.h> @@ -66,19 +67,6 @@ static int c1_ramping_may_cause_clock_drift(struct cpuinfo_x86 *c) return 1; } -/* PCI access functions. Should be safe to use 0xcf8/0xcfc port accesses here. */ -static u8 pci_read_byte(u32 bus, u32 dev, u32 fn, u32 reg) -{ - outl((1U<<31) | (bus << 16) | (dev << 11) | (fn << 8) | (reg & ~3), 0xcf8); - return inb(0xcfc + (reg & 3)); -} - -static void pci_write_byte(u32 bus, u32 dev, u32 fn, u32 reg, u8 val) -{ - outl((1U<<31) | (bus << 16) | (dev << 11) | (fn << 8) | (reg & ~3), 0xcf8); - outb(val, 0xcfc + (reg & 3)); -} - /* * Disable C1-Clock ramping if enabled in PMM7.CpuLowPwrEnh on 8th-generation * cores only. Assume BIOS has setup all Northbridges equivalently. @@ -90,12 +78,12 @@ static void disable_c1_ramping(void) for (node=0; node < NR_CPUS; node++) { /* PMM7: bus=0, dev=0x18+node, function=0x3, register=0x87. */ - pmm7 = pci_read_byte(0, 0x18+node, 0x3, 0x87); + pmm7 = pci_conf_read8(0, 0x18+node, 0x3, 0x87); /* Invalid read means we've updated every Northbridge. 
*/ if (pmm7 == 0xFF) break; pmm7 &= 0xFC; /* clear pmm7[1:0] */ - pci_write_byte(0, 0x18+node, 0x3, 0x87, pmm7); + pci_conf_write8(0, 0x18+node, 0x3, 0x87, pmm7); printk ("AMD: Disabling C1 Clock Ramping Node #%x\n", node); } } diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index c56db37b37..4418c51ff9 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -46,6 +46,7 @@ #include <asm/debugreg.h> #include <asm/msr.h> #include <asm/nmi.h> +#include <xen/numa.h> #include <xen/iommu.h> #ifdef CONFIG_COMPAT #include <compat/vcpu.h> @@ -171,7 +172,7 @@ int setup_arg_xlat_area(struct vcpu *v, l4_pgentry_t *l4tab) if ( !d->arch.mm_arg_xlat_l3 ) { - pg = alloc_domheap_page(NULL); + pg = alloc_domheap_page(NULL, 0); if ( !pg ) return -ENOMEM; d->arch.mm_arg_xlat_l3 = page_to_virt(pg); @@ -189,7 +190,7 @@ int setup_arg_xlat_area(struct vcpu *v, l4_pgentry_t *l4tab) if ( !l3e_get_intpte(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]) ) { - pg = alloc_domheap_page(NULL); + pg = alloc_domheap_page(NULL, 0); if ( !pg ) return -ENOMEM; clear_page(page_to_virt(pg)); @@ -198,7 +199,7 @@ int setup_arg_xlat_area(struct vcpu *v, l4_pgentry_t *l4tab) l2tab = l3e_to_l2e(d->arch.mm_arg_xlat_l3[l3_table_offset(va)]); if ( !l2e_get_intpte(l2tab[l2_table_offset(va)]) ) { - pg = alloc_domheap_page(NULL); + pg = alloc_domheap_page(NULL, 0); if ( !pg ) return -ENOMEM; clear_page(page_to_virt(pg)); @@ -206,7 +207,7 @@ int setup_arg_xlat_area(struct vcpu *v, l4_pgentry_t *l4tab) } l1tab = l2e_to_l1e(l2tab[l2_table_offset(va)]); BUG_ON(l1e_get_intpte(l1tab[l1_table_offset(va)])); - pg = alloc_domheap_page(NULL); + pg = alloc_domheap_page(NULL, 0); if ( !pg ) return -ENOMEM; l1tab[l1_table_offset(va)] = l1e_from_page(pg, PAGE_HYPERVISOR); @@ -252,7 +253,7 @@ static void release_arg_xlat_area(struct domain *d) static int setup_compat_l4(struct vcpu *v) { - struct page_info *pg = alloc_domheap_page(NULL); + struct page_info *pg = alloc_domheap_page(NULL, 0); l4_pgentry_t *l4tab; int rc; @@ -477,7 +478,8 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags) #else /* __x86_64__ */ - if ( (pg = alloc_domheap_page(NULL)) == NULL ) + pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d))); + if ( pg == NULL ) goto fail; d->arch.mm_perdomain_l2 = page_to_virt(pg); clear_page(d->arch.mm_perdomain_l2); @@ -486,7 +488,8 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags) l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i, __PAGE_HYPERVISOR); - if ( (pg = alloc_domheap_page(NULL)) == NULL ) + pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d))); + if ( pg == NULL ) goto fail; d->arch.mm_perdomain_l3 = page_to_virt(pg); clear_page(d->arch.mm_perdomain_l3); @@ -500,13 +503,15 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags) HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START; #endif - paging_domain_init(d); + if ( (rc = paging_domain_init(d)) != 0 ) + goto fail; paging_initialised = 1; if ( !is_idle_domain(d) ) { d->arch.ioport_caps = rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex); + rc = -ENOMEM; if ( d->arch.ioport_caps == NULL ) goto fail; @@ -946,9 +951,9 @@ arch_do_vcpu_op( if ( copy_from_guest(&info, arg, 1) ) break; - LOCK_BIGLOCK(d); + domain_lock(d); rc = map_vcpu_info(v, info.mfn, info.offset); - UNLOCK_BIGLOCK(d); + domain_unlock(d); break; } diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c index dc8ee52f07..56106bae2f 100644 --- a/xen/arch/x86/domain_build.c +++ b/xen/arch/x86/domain_build.c 
@@ -630,7 +630,7 @@ int __init construct_dom0( } else { - page = alloc_domheap_page(NULL); + page = alloc_domheap_page(NULL, 0); if ( !page ) panic("Not enough RAM for domain 0 PML4.\n"); l4start = l4tab = page_to_virt(page); @@ -957,6 +957,8 @@ int __init construct_dom0( rc |= ioports_deny_access(dom0, 0x40, 0x43); /* PIT Channel 2 / PC Speaker Control. */ rc |= ioports_deny_access(dom0, 0x61, 0x61); + /* PCI configuration spaces. */ + rc |= ioports_deny_access(dom0, 0xcf8, 0xcff); /* Command-line I/O ranges. */ process_dom0_ioports_disable(); diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c index 57065f7625..d7bf9f3f2f 100644 --- a/xen/arch/x86/hvm/emulate.c +++ b/xen/arch/x86/hvm/emulate.c @@ -20,12 +20,13 @@ #include <asm/hvm/support.h> static int hvmemul_do_io( - int is_mmio, paddr_t addr, unsigned long count, int size, + int is_mmio, paddr_t addr, unsigned long *reps, int size, paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val) { struct vcpu *curr = current; vcpu_iodata_t *vio = get_ioreq(curr); ioreq_t *p = &vio->vp_ioreq; + int rc; switch ( curr->arch.hvm_vcpu.io_state ) { @@ -41,52 +42,72 @@ static int hvmemul_do_io( return X86EMUL_UNHANDLEABLE; } - curr->arch.hvm_vcpu.io_state = - (val == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion; - if ( p->state != STATE_IOREQ_NONE ) + { gdprintk(XENLOG_WARNING, "WARNING: io already pending (%d)?\n", p->state); + return X86EMUL_UNHANDLEABLE; + } + + curr->arch.hvm_vcpu.io_state = + (val == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion; p->dir = dir; p->data_is_ptr = value_is_ptr; p->type = is_mmio ? IOREQ_TYPE_COPY : IOREQ_TYPE_PIO; p->size = size; p->addr = addr; - p->count = count; + p->count = *reps; p->df = df; p->data = value; p->io_count++; - if ( is_mmio - ? (hvm_mmio_intercept(p) || hvm_buffered_io_intercept(p)) - : hvm_portio_intercept(p) ) + if ( is_mmio ) { + rc = hvm_mmio_intercept(p); + if ( rc == X86EMUL_UNHANDLEABLE ) + rc = hvm_buffered_io_intercept(p); + } + else + { + rc = hvm_portio_intercept(p); + } + + switch ( rc ) + { + case X86EMUL_OKAY: + case X86EMUL_RETRY: + *reps = p->count; p->state = STATE_IORESP_READY; hvm_io_assist(); if ( val != NULL ) *val = curr->arch.hvm_vcpu.io_data; curr->arch.hvm_vcpu.io_state = HVMIO_none; - return X86EMUL_OKAY; + break; + case X86EMUL_UNHANDLEABLE: + hvm_send_assist_req(curr); + rc = (val != NULL) ? X86EMUL_RETRY : X86EMUL_OKAY; + break; + default: + BUG(); } - hvm_send_assist_req(curr); - return (val != NULL) ? 
X86EMUL_RETRY : X86EMUL_OKAY; + return rc; } static int hvmemul_do_pio( - unsigned long port, unsigned long count, int size, + unsigned long port, unsigned long *reps, int size, paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val) { - return hvmemul_do_io(0, port, count, size, value, + return hvmemul_do_io(0, port, reps, size, value, dir, df, value_is_ptr, val); } static int hvmemul_do_mmio( - paddr_t gpa, unsigned long count, int size, + paddr_t gpa, unsigned long *reps, int size, paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val) { - return hvmemul_do_io(1, gpa, count, size, value, + return hvmemul_do_io(1, gpa, reps, size, value, dir, df, value_is_ptr, val); } @@ -206,7 +227,7 @@ static int __hvmemul_read( struct hvm_emulate_ctxt *hvmemul_ctxt) { struct vcpu *curr = current; - unsigned long addr; + unsigned long addr, reps = 1; uint32_t pfec = PFEC_page_present; paddr_t gpa; int rc; @@ -226,7 +247,8 @@ static int __hvmemul_read( return X86EMUL_UNHANDLEABLE; gpa = (((paddr_t)curr->arch.hvm_vcpu.mmio_gpfn << PAGE_SHIFT) | off); if ( (off + bytes) <= PAGE_SIZE ) - return hvmemul_do_mmio(gpa, 1, bytes, 0, IOREQ_READ, 0, 0, val); + return hvmemul_do_mmio(gpa, &reps, bytes, 0, + IOREQ_READ, 0, 0, val); } if ( (seg != x86_seg_none) && @@ -251,7 +273,7 @@ static int __hvmemul_read( if ( rc != X86EMUL_OKAY ) return rc; - return hvmemul_do_mmio(gpa, 1, bytes, 0, IOREQ_READ, 0, 0, val); + return hvmemul_do_mmio(gpa, &reps, bytes, 0, IOREQ_READ, 0, 0, val); } return X86EMUL_OKAY; @@ -302,7 +324,7 @@ static int hvmemul_write( struct hvm_emulate_ctxt *hvmemul_ctxt = container_of(ctxt, struct hvm_emulate_ctxt, ctxt); struct vcpu *curr = current; - unsigned long addr; + unsigned long addr, reps = 1; uint32_t pfec = PFEC_page_present | PFEC_write_access; paddr_t gpa; int rc; @@ -318,8 +340,8 @@ static int hvmemul_write( unsigned int off = addr & (PAGE_SIZE - 1); gpa = (((paddr_t)curr->arch.hvm_vcpu.mmio_gpfn << PAGE_SHIFT) | off); if ( (off + bytes) <= PAGE_SIZE ) - return hvmemul_do_mmio(gpa, 1, bytes, val, IOREQ_WRITE, - 0, 0, NULL); + return hvmemul_do_mmio(gpa, &reps, bytes, val, + IOREQ_WRITE, 0, 0, NULL); } if ( (seg != x86_seg_none) && @@ -339,7 +361,8 @@ static int hvmemul_write( if ( rc != X86EMUL_OKAY ) return rc; - return hvmemul_do_mmio(gpa, 1, bytes, val, IOREQ_WRITE, 0, 0, NULL); + return hvmemul_do_mmio(gpa, &reps, bytes, val, + IOREQ_WRITE, 0, 0, NULL); } return X86EMUL_OKAY; @@ -386,7 +409,7 @@ static int hvmemul_rep_ins( if ( rc != X86EMUL_OKAY ) return rc; - return hvmemul_do_pio(src_port, *reps, bytes_per_rep, gpa, IOREQ_READ, + return hvmemul_do_pio(src_port, reps, bytes_per_rep, gpa, IOREQ_READ, !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL); } @@ -419,7 +442,7 @@ static int hvmemul_rep_outs( if ( rc != X86EMUL_OKAY ) return rc; - return hvmemul_do_pio(dst_port, *reps, bytes_per_rep, gpa, IOREQ_WRITE, + return hvmemul_do_pio(dst_port, reps, bytes_per_rep, gpa, IOREQ_WRITE, !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL); } @@ -469,14 +492,14 @@ static int hvmemul_rep_movs( (void)gfn_to_mfn_current(sgpa >> PAGE_SHIFT, &p2mt); if ( !p2m_is_ram(p2mt) ) return hvmemul_do_mmio( - sgpa, *reps, bytes_per_rep, dgpa, IOREQ_READ, + sgpa, reps, bytes_per_rep, dgpa, IOREQ_READ, !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL); (void)gfn_to_mfn_current(dgpa >> PAGE_SHIFT, &p2mt); if ( p2m_is_ram(p2mt) ) return X86EMUL_UNHANDLEABLE; return hvmemul_do_mmio( - dgpa, *reps, bytes_per_rep, sgpa, IOREQ_WRITE, + dgpa, reps, bytes_per_rep, sgpa, IOREQ_WRITE, 
!!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL); } @@ -513,7 +536,8 @@ static int hvmemul_read_io( unsigned long *val, struct x86_emulate_ctxt *ctxt) { - return hvmemul_do_pio(port, 1, bytes, 0, IOREQ_READ, 0, 0, val); + unsigned long reps = 1; + return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_READ, 0, 0, val); } static int hvmemul_write_io( @@ -522,7 +546,8 @@ static int hvmemul_write_io( unsigned long val, struct x86_emulate_ctxt *ctxt) { - return hvmemul_do_pio(port, 1, bytes, val, IOREQ_WRITE, 0, 0, NULL); + unsigned long reps = 1; + return hvmemul_do_pio(port, &reps, bytes, val, IOREQ_WRITE, 0, 0, NULL); } static int hvmemul_read_cr( diff --git a/xen/arch/x86/hvm/hpet.c b/xen/arch/x86/hvm/hpet.c index 49ca998d37..03dfbf3bd8 100644 --- a/xen/arch/x86/hvm/hpet.c +++ b/xen/arch/x86/hvm/hpet.c @@ -150,8 +150,9 @@ static inline uint64_t hpet_read_maincounter(HPETState *h) return h->hpet.mc64; } -static unsigned long hpet_read( - struct vcpu *v, unsigned long addr, unsigned long length) +static int hpet_read( + struct vcpu *v, unsigned long addr, unsigned long length, + unsigned long *pval) { HPETState *h = &v->domain->arch.hvm_domain.pl_time.vhpet; unsigned long result; @@ -160,7 +161,10 @@ static unsigned long hpet_read( addr &= HPET_MMAP_SIZE-1; if ( hpet_check_access_length(addr, length) != 0 ) - return ~0UL; + { + result = ~0ul; + goto out; + } spin_lock(&h->lock); @@ -174,7 +178,9 @@ static unsigned long hpet_read( spin_unlock(&h->lock); - return result; + out: + *pval = result; + return X86EMUL_OKAY; } static void hpet_stop_timer(HPETState *h, unsigned int tn) @@ -234,7 +240,7 @@ static inline uint64_t hpet_fixup_reg( return new; } -static void hpet_write( +static int hpet_write( struct vcpu *v, unsigned long addr, unsigned long length, unsigned long val) { @@ -245,7 +251,7 @@ static void hpet_write( addr &= HPET_MMAP_SIZE-1; if ( hpet_check_access_length(addr, length) != 0 ) - return; + goto out; spin_lock(&h->lock); @@ -349,6 +355,9 @@ static void hpet_write( } spin_unlock(&h->lock); + + out: + return X86EMUL_OKAY; } static int hpet_range(struct vcpu *v, unsigned long addr) diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index 961bfbf354..97a1aaa17c 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -181,7 +181,8 @@ void hvm_do_resume(struct vcpu *v) break; default: gdprintk(XENLOG_ERR, "Weird HVM iorequest state %d.\n", p->state); - domain_crash_synchronous(); + domain_crash(v->domain); + return; /* bail */ } } } @@ -276,7 +277,7 @@ static int hvm_print_line( } spin_unlock(&hd->pbuf_lock); - return 1; + return X86EMUL_OKAY; } int hvm_domain_initialise(struct domain *d) @@ -478,11 +479,11 @@ static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h) vc = &v->arch.guest_context; /* Need to init this vcpu before loading its contents */ - LOCK_BIGLOCK(d); + domain_lock(d); if ( !v->is_initialised ) if ( (rc = boot_vcpu(d, vcpuid, vc)) != 0 ) return rc; - UNLOCK_BIGLOCK(d); + domain_unlock(d); if ( hvm_load_entry(CPU, h, &ctxt) != 0 ) return -EINVAL; @@ -687,47 +688,26 @@ void hvm_vcpu_destroy(struct vcpu *v) /*free_xen_event_channel(v, v->arch.hvm_vcpu.xen_port);*/ } - -void hvm_vcpu_reset(struct vcpu *v) -{ - vcpu_pause(v); - - vlapic_reset(vcpu_vlapic(v)); - - hvm_funcs.vcpu_initialise(v); - - set_bit(_VPF_down, &v->pause_flags); - clear_bit(_VPF_blocked, &v->pause_flags); - v->fpu_initialised = 0; - v->fpu_dirtied = 0; - v->is_initialised = 0; - - vcpu_unpause(v); -} - -static void hvm_vcpu_down(void) +void hvm_vcpu_down(struct vcpu 
*v) { - struct vcpu *v = current; struct domain *d = v->domain; int online_count = 0; - gdprintk(XENLOG_INFO, "VCPU%d: going offline.\n", v->vcpu_id); - /* Doesn't halt us immediately, but we'll never return to guest context. */ set_bit(_VPF_down, &v->pause_flags); vcpu_sleep_nosync(v); /* Any other VCPUs online? ... */ - LOCK_BIGLOCK(d); + domain_lock(d); for_each_vcpu ( d, v ) if ( !test_bit(_VPF_down, &v->pause_flags) ) online_count++; - UNLOCK_BIGLOCK(d); + domain_unlock(d); /* ... Shut down the domain if not. */ if ( online_count == 0 ) { - gdprintk(XENLOG_INFO, "all CPUs offline -- powering off.\n"); + gdprintk(XENLOG_INFO, "All CPUs offline -- powering off.\n"); domain_shutdown(d, SHUTDOWN_poweroff); } } @@ -742,9 +722,10 @@ void hvm_send_assist_req(struct vcpu *v) p = &get_ioreq(v)->vp_ioreq; if ( unlikely(p->state != STATE_IOREQ_NONE) ) { - /* This indicates a bug in the device model. Crash the domain. */ + /* This indicates a bug in the device model. Crash the domain. */ gdprintk(XENLOG_ERR, "Device model set bad IO state %d.\n", p->state); - domain_crash_synchronous(); + domain_crash(v->domain); + return; } prepare_wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port); @@ -765,7 +746,7 @@ void hvm_hlt(unsigned long rflags) * out of this. */ if ( unlikely(!(rflags & X86_EFLAGS_IF)) ) - return hvm_vcpu_down(); + return hvm_vcpu_down(current); do_sched_op_compat(SCHEDOP_block, 0); } @@ -1894,79 +1875,6 @@ void hvm_hypercall_page_initialise(struct domain *d, hvm_funcs.init_hypercall_page(d, hypercall_page); } -int hvm_bringup_ap(int vcpuid, int trampoline_vector) -{ - struct domain *d = current->domain; - struct vcpu *v; - struct vcpu_guest_context *ctxt; - struct segment_register reg; - - ASSERT(is_hvm_domain(d)); - - if ( (v = d->vcpu[vcpuid]) == NULL ) - return -ENOENT; - - v->fpu_initialised = 0; - v->arch.flags |= TF_kernel_mode; - v->is_initialised = 1; - - ctxt = &v->arch.guest_context; - memset(ctxt, 0, sizeof(*ctxt)); - ctxt->flags = VGCF_online; - ctxt->user_regs.eflags = 2; - - v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_ET; - hvm_update_guest_cr(v, 0); - - v->arch.hvm_vcpu.guest_cr[2] = 0; - hvm_update_guest_cr(v, 2); - - v->arch.hvm_vcpu.guest_cr[3] = 0; - hvm_update_guest_cr(v, 3); - - v->arch.hvm_vcpu.guest_cr[4] = 0; - hvm_update_guest_cr(v, 4); - - v->arch.hvm_vcpu.guest_efer = 0; - hvm_update_guest_efer(v); - - reg.sel = trampoline_vector << 8; - reg.base = (uint32_t)reg.sel << 4; - reg.limit = 0xffff; - reg.attr.bytes = 0x89b; - hvm_set_segment_register(v, x86_seg_cs, &reg); - - reg.sel = reg.base = 0; - reg.limit = 0xffff; - reg.attr.bytes = 0x893; - hvm_set_segment_register(v, x86_seg_ds, &reg); - hvm_set_segment_register(v, x86_seg_es, &reg); - hvm_set_segment_register(v, x86_seg_fs, &reg); - hvm_set_segment_register(v, x86_seg_gs, &reg); - hvm_set_segment_register(v, x86_seg_ss, &reg); - - reg.attr.bytes = 0x82; /* LDT */ - hvm_set_segment_register(v, x86_seg_ldtr, &reg); - - reg.attr.bytes = 0x8b; /* 32-bit TSS (busy) */ - hvm_set_segment_register(v, x86_seg_tr, &reg); - - reg.attr.bytes = 0; - hvm_set_segment_register(v, x86_seg_gdtr, &reg); - hvm_set_segment_register(v, x86_seg_idtr, &reg); - - /* Sync AP's TSC with BSP's. 
*/ - v->arch.hvm_vcpu.cache_tsc_offset = - v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset; - hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset); - - if ( test_and_clear_bit(_VPF_down, &v->pause_flags) ) - vcpu_wake(v); - - gdprintk(XENLOG_INFO, "AP %d bringup succeeded.\n", vcpuid); - return 0; -} - static int hvmop_set_pci_intx_level( XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t) uop) { @@ -2185,13 +2093,16 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg) if ( op == HVMOP_set_param ) { + rc = 0; + switch ( a.index ) { case HVM_PARAM_IOREQ_PFN: iorp = &d->arch.hvm_domain.ioreq; - rc = hvm_set_ioreq_page(d, iorp, a.value); + if ( (rc = hvm_set_ioreq_page(d, iorp, a.value)) != 0 ) + break; spin_lock(&iorp->lock); - if ( (rc == 0) && (iorp->va != NULL) ) + if ( iorp->va != NULL ) /* Initialise evtchn port info if VCPUs already created. */ for_each_vcpu ( d, v ) get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port; @@ -2206,13 +2117,72 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg) hvm_latch_shinfo_size(d); break; case HVM_PARAM_TIMER_MODE: - rc = -EINVAL; if ( a.value > HVMPTM_one_missed_tick_pending ) - goto param_fail; + rc = -EINVAL; + break; + case HVM_PARAM_IDENT_PT: + rc = -EPERM; + if ( !IS_PRIV(current->domain) ) + break; + + rc = -EINVAL; + if ( d->arch.hvm_domain.params[a.index] != 0 ) + break; + + rc = 0; + if ( !paging_mode_hap(d) ) + break; + + domain_pause(d); + + /* + * Update GUEST_CR3 in each VMCS to point at identity map. + * All foreign updates to guest state must synchronise on + * the domctl_lock. + */ + spin_lock(&domctl_lock); + d->arch.hvm_domain.params[a.index] = a.value; + for_each_vcpu ( d, v ) + paging_update_cr3(v); + spin_unlock(&domctl_lock); + + domain_unpause(d); + break; + case HVM_PARAM_DM_DOMAIN: + /* Privileged domains only, as we must domain_pause(d). 
*/ + rc = -EPERM; + if ( !IS_PRIV_FOR(current->domain, d) ) + break; + + if ( a.value == DOMID_SELF ) + a.value = current->domain->domain_id; + + rc = 0; + domain_pause(d); /* safe to change per-vcpu xen_port */ + iorp = &d->arch.hvm_domain.ioreq; + for_each_vcpu ( d, v ) + { + int old_port, new_port; + new_port = alloc_unbound_xen_event_channel(v, a.value); + if ( new_port < 0 ) + { + rc = new_port; + break; + } + /* xchg() ensures that only we free_xen_event_channel() */ + old_port = xchg(&v->arch.hvm_vcpu.xen_port, new_port); + free_xen_event_channel(v, old_port); + spin_lock(&iorp->lock); + if ( iorp->va != NULL ) + get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port; + spin_unlock(&iorp->lock); + } + domain_unpause(d); break; } - d->arch.hvm_domain.params[a.index] = a.value; - rc = 0; + + if ( rc == 0 ) + d->arch.hvm_domain.params[a.index] = a.value; } else { diff --git a/xen/arch/x86/hvm/i8254.c b/xen/arch/x86/hvm/i8254.c index 01c78f7799..493b7317b9 100644 --- a/xen/arch/x86/hvm/i8254.c +++ b/xen/arch/x86/hvm/i8254.c @@ -401,50 +401,6 @@ void pit_stop_channel0_irq(PITState *pit) spin_unlock(&pit->lock); } -#ifdef HVM_DEBUG_SUSPEND -static void pit_info(PITState *pit) -{ - struct hvm_hw_pit_channel *s; - struct periodic_time *pt; - int i; - - for ( i = 0; i < 3; i++ ) - { - printk("*****pit channel %d's state:*****\n", i); - s = &pit->hw.channels[i]; - printk("pit 0x%x.\n", s->count); - printk("pit 0x%x.\n", s->latched_count); - printk("pit 0x%x.\n", s->count_latched); - printk("pit 0x%x.\n", s->status_latched); - printk("pit 0x%x.\n", s->status); - printk("pit 0x%x.\n", s->read_state); - printk("pit 0x%x.\n", s->write_state); - printk("pit 0x%x.\n", s->write_latch); - printk("pit 0x%x.\n", s->rw_mode); - printk("pit 0x%x.\n", s->mode); - printk("pit 0x%x.\n", s->bcd); - printk("pit 0x%x.\n", s->gate); - printk("pit %"PRId64"\n", pit->count_load_time[i]); - - } - - pt = &pit->pt0; - printk("pit channel 0 periodic timer:\n", i); - printk("pt %d.\n", pt->enabled); - printk("pt %d.\n", pt->one_shot); - printk("pt %d.\n", pt->irq); - printk("pt %d.\n", pt->first_injected); - printk("pt %d.\n", pt->pending_intr_nr); - printk("pt %d.\n", pt->period); - printk("pt %"PRId64"\n", pt->period_cycles); - printk("pt %"PRId64"\n", pt->last_plt_gtime); -} -#else -static void pit_info(PITState *pit) -{ -} -#endif - static int pit_save(struct domain *d, hvm_domain_context_t *h) { PITState *pit = domain_vpit(d); @@ -452,9 +408,6 @@ static int pit_save(struct domain *d, hvm_domain_context_t *h) spin_lock(&pit->lock); - pit_info(pit); - - /* Save the PIT hardware state */ rc = hvm_save_entry(PIT, 0, h, &pit->hw); spin_unlock(&pit->lock); @@ -469,22 +422,21 @@ static int pit_load(struct domain *d, hvm_domain_context_t *h) spin_lock(&pit->lock); - /* Restore the PIT hardware state */ if ( hvm_load_entry(PIT, h, &pit->hw) ) { spin_unlock(&pit->lock); return 1; } - /* Recreate platform timers from hardware state. There will be some + /* + * Recreate platform timers from hardware state. There will be some * time jitter here, but the wall-clock will have jumped massively, so - * we hope the guest can handle it. */ + * we hope the guest can handle it. 
+ */ pit->pt0.last_plt_gtime = hvm_get_guest_time(d->vcpu[0]); for ( i = 0; i < 3; i++ ) pit_load_count(pit, i, pit->hw.channels[i].count); - pit_info(pit); - spin_unlock(&pit->lock); return 0; @@ -535,7 +487,7 @@ static int handle_pit_io( if ( bytes != 1 ) { gdprintk(XENLOG_WARNING, "PIT bad access\n"); - return 1; + return X86EMUL_OKAY; } if ( dir == IOREQ_WRITE ) @@ -550,7 +502,7 @@ static int handle_pit_io( gdprintk(XENLOG_WARNING, "PIT: read A1:A0=3!\n"); } - return 1; + return X86EMUL_OKAY; } static void speaker_ioport_write( @@ -574,11 +526,7 @@ static int handle_speaker_io( { struct PITState *vpit = vcpu_vpit(current); - if ( bytes != 1 ) - { - gdprintk(XENLOG_WARNING, "PIT_SPEAKER bad access\n"); - return 1; - } + BUG_ON(bytes != 1); spin_lock(&vpit->lock); @@ -589,7 +537,7 @@ static int handle_speaker_io( spin_unlock(&vpit->lock); - return 1; + return X86EMUL_OKAY; } int pv_pit_handler(int port, int data, int write) diff --git a/xen/arch/x86/hvm/intercept.c b/xen/arch/x86/hvm/intercept.c index 04c5da7b6f..0e110e00dc 100644 --- a/xen/arch/x86/hvm/intercept.c +++ b/xen/arch/x86/hvm/intercept.c @@ -45,53 +45,63 @@ static struct hvm_mmio_handler *hvm_mmio_handlers[HVM_MMIO_HANDLER_NR] = &vioapic_mmio_handler }; -static inline void hvm_mmio_access(struct vcpu *v, - ioreq_t *p, - hvm_mmio_read_t read_handler, - hvm_mmio_write_t write_handler) +static int hvm_mmio_access(struct vcpu *v, + ioreq_t *p, + hvm_mmio_read_t read_handler, + hvm_mmio_write_t write_handler) { unsigned long data; + int rc = X86EMUL_OKAY, i, sign = p->df ? -1 : 1; - switch ( p->type ) + if ( !p->data_is_ptr ) { - case IOREQ_TYPE_COPY: - if ( !p->data_is_ptr ) { - if ( p->dir == IOREQ_READ ) - p->data = read_handler(v, p->addr, p->size); - else /* p->dir == IOREQ_WRITE */ - write_handler(v, p->addr, p->size, p->data); - } else { /* p->data_is_ptr */ - int i, sign = (p->df) ? 
-1 : 1; - - if ( p->dir == IOREQ_READ ) { - for ( i = 0; i < p->count; i++ ) { - data = read_handler(v, - p->addr + (sign * i * p->size), - p->size); - (void)hvm_copy_to_guest_phys( - p->data + (sign * i * p->size), - &data, - p->size); - } - } else {/* p->dir == IOREQ_WRITE */ - for ( i = 0; i < p->count; i++ ) { - (void)hvm_copy_from_guest_phys( - &data, - p->data + (sign * i * p->size), - p->size); - write_handler(v, - p->addr + (sign * i * p->size), - p->size, data); - } - } + if ( p->dir == IOREQ_READ ) + { + rc = read_handler(v, p->addr, p->size, &data); + p->data = data; } - break; + else /* p->dir == IOREQ_WRITE */ + rc = write_handler(v, p->addr, p->size, p->data); + return rc; + } - default: - printk("hvm_mmio_access: error ioreq type %x\n", p->type); - domain_crash_synchronous(); - break; + if ( p->dir == IOREQ_READ ) + { + for ( i = 0; i < p->count; i++ ) + { + rc = read_handler( + v, + p->addr + (sign * i * p->size), + p->size, &data); + if ( rc != X86EMUL_OKAY ) + break; + (void)hvm_copy_to_guest_phys( + p->data + (sign * i * p->size), + &data, + p->size); + } + } + else + { + for ( i = 0; i < p->count; i++ ) + { + (void)hvm_copy_from_guest_phys( + &data, + p->data + (sign * i * p->size), + p->size); + rc = write_handler( + v, + p->addr + (sign * i * p->size), + p->size, data); + if ( rc != X86EMUL_OKAY ) + break; + } } + + if ( (p->count = i) != 0 ) + rc = X86EMUL_OKAY; + + return rc; } int hvm_mmio_intercept(ioreq_t *p) @@ -100,60 +110,62 @@ int hvm_mmio_intercept(ioreq_t *p) int i; for ( i = 0; i < HVM_MMIO_HANDLER_NR; i++ ) - { if ( hvm_mmio_handlers[i]->check_handler(v, p->addr) ) - { - hvm_mmio_access(v, p, - hvm_mmio_handlers[i]->read_handler, - hvm_mmio_handlers[i]->write_handler); - return 1; - } - } + return hvm_mmio_access( + v, p, + hvm_mmio_handlers[i]->read_handler, + hvm_mmio_handlers[i]->write_handler); - return 0; + return X86EMUL_UNHANDLEABLE; } static int process_portio_intercept(portio_action_t action, ioreq_t *p) { - int rc = 1, i, sign = p->df ? -1 : 1; + int rc = X86EMUL_OKAY, i, sign = p->df ? 
-1 : 1; uint32_t data; - if ( p->dir == IOREQ_READ ) + if ( !p->data_is_ptr ) { - if ( !p->data_is_ptr ) + if ( p->dir == IOREQ_READ ) { rc = action(IOREQ_READ, p->addr, p->size, &data); p->data = data; } else { - for ( i = 0; i < p->count; i++ ) - { - rc = action(IOREQ_READ, p->addr, p->size, &data); - (void)hvm_copy_to_guest_phys(p->data + sign*i*p->size, - &data, p->size); - } + data = p->data; + rc = action(IOREQ_WRITE, p->addr, p->size, &data); } + return rc; } - else /* p->dir == IOREQ_WRITE */ + + if ( p->dir == IOREQ_READ ) { - if ( !p->data_is_ptr ) + for ( i = 0; i < p->count; i++ ) { - data = p->data; - rc = action(IOREQ_WRITE, p->addr, p->size, &data); + rc = action(IOREQ_READ, p->addr, p->size, &data); + if ( rc != X86EMUL_OKAY ) + break; + (void)hvm_copy_to_guest_phys(p->data + sign*i*p->size, + &data, p->size); } - else + } + else /* p->dir == IOREQ_WRITE */ + { + for ( i = 0; i < p->count; i++ ) { - for ( i = 0; i < p->count; i++ ) - { - data = 0; - (void)hvm_copy_from_guest_phys(&data, p->data + sign*i*p->size, - p->size); - rc = action(IOREQ_WRITE, p->addr, p->size, &data); - } + data = 0; + (void)hvm_copy_from_guest_phys(&data, p->data + sign*i*p->size, + p->size); + rc = action(IOREQ_WRITE, p->addr, p->size, &data); + if ( rc != X86EMUL_OKAY ) + break; } } + if ( (p->count = i) != 0 ) + rc = X86EMUL_OKAY; + return rc; } @@ -170,7 +182,7 @@ int hvm_io_intercept(ioreq_t *p, int type) unsigned long addr, size; if ( (type == HVM_PORTIO) && (dpci_ioport_intercept(p)) ) - return 1; + return X86EMUL_OKAY; for ( i = 0; i < handler->num_slot; i++ ) { @@ -188,10 +200,10 @@ int hvm_io_intercept(ioreq_t *p, int type) } } - return 0; + return X86EMUL_UNHANDLEABLE; } -int register_io_handler( +void register_io_handler( struct domain *d, unsigned long addr, unsigned long size, void *action, int type) { @@ -207,9 +219,8 @@ int register_io_handler( else handler->hdl_list[num].action.mmio = action; handler->num_slot++; - - return 1; } + /* * Local variables: * mode: C diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c index ac1e62782a..6a8e0885c0 100644 --- a/xen/arch/x86/hvm/io.c +++ b/xen/arch/x86/hvm/io.c @@ -148,20 +148,19 @@ void send_timeoffset_req(unsigned long timeoff) void send_invalidate_req(void) { struct vcpu *v = current; - vcpu_iodata_t *vio; + vcpu_iodata_t *vio = get_ioreq(v); ioreq_t *p; - vio = get_ioreq(v); - if ( vio == NULL ) - { - printk("bad shared page: %lx\n", (unsigned long) vio); - domain_crash_synchronous(); - } + BUG_ON(vio == NULL); p = &vio->vp_ioreq; if ( p->state != STATE_IOREQ_NONE ) - printk("WARNING: send invalidate req with something " - "already pending (%d)?\n", p->state); + { + gdprintk(XENLOG_ERR, "WARNING: send invalidate req with something " + "already pending (%d)?\n", p->state); + domain_crash(v->domain); + return; + } p->type = IOREQ_TYPE_INVALIDATE; p->size = 4; @@ -225,12 +224,6 @@ void hvm_io_assist(void) ioreq_t *p = &get_ioreq(curr)->vp_ioreq; enum hvm_io_state io_state; - if ( p->state != STATE_IORESP_READY ) - { - gdprintk(XENLOG_ERR, "Unexpected HVM iorequest state %d.\n", p->state); - domain_crash_synchronous(); - } - rmb(); /* see IORESP_READY /then/ read contents of ioreq */ p->state = STATE_IOREQ_NONE; @@ -253,74 +246,59 @@ void hvm_io_assist(void) void dpci_ioport_read(uint32_t mport, ioreq_t *p) { - uint64_t i; - uint64_t z_data; - uint64_t length = (p->count * p->size); + int i, sign = p->df ? 
-1 : 1; + uint32_t data = 0; - for ( i = 0; i < length; i += p->size ) + for ( i = 0; i < p->count; i++ ) { - z_data = ~0ULL; - switch ( p->size ) { case 1: - z_data = (uint64_t)inb(mport); + data = inb(mport); break; case 2: - z_data = (uint64_t)inw(mport); + data = inw(mport); break; case 4: - z_data = (uint64_t)inl(mport); + data = inl(mport); break; default: - gdprintk(XENLOG_ERR, "Error: unable to handle size: %" - PRId64 "\n", p->size); - return; + BUG(); } - p->data = z_data; - if ( p->data_is_ptr && - hvm_copy_to_guest_phys(p->data + i, (void *)&z_data, - (int)p->size) ) - { - gdprintk(XENLOG_ERR, "Error: couldn't copy to hvm phys\n"); - return; - } + if ( p->data_is_ptr ) + (void)hvm_copy_to_guest_phys( + p->data + (sign * i * p->size), &data, p->size); + else + p->data = data; } } void dpci_ioport_write(uint32_t mport, ioreq_t *p) { - uint64_t i; - uint64_t z_data = 0; - uint64_t length = (p->count * p->size); + int i, sign = p->df ? -1 : 1; + uint32_t data; - for ( i = 0; i < length; i += p->size ) + for ( i = 0; i < p->count; i++ ) { - z_data = p->data; - if ( p->data_is_ptr && - hvm_copy_from_guest_phys((void *)&z_data, - p->data + i, (int)p->size) ) - { - gdprintk(XENLOG_ERR, "Error: couldn't copy from hvm phys\n"); - return; - } + data = p->data; + if ( p->data_is_ptr ) + (void)hvm_copy_from_guest_phys( + &data, p->data + (sign * i * p->size), p->size); switch ( p->size ) { case 1: - outb((uint8_t) z_data, mport); + outb(data, mport); break; case 2: - outw((uint16_t) z_data, mport); + outw(data, mport); break; case 4: - outl((uint32_t) z_data, mport); + outl(data, mport); break; default: - gdprintk(XENLOG_ERR, "Error: unable to handle size: %" - PRId64 "\n", p->size); - break; + BUG(); } } } diff --git a/xen/arch/x86/hvm/mtrr.c b/xen/arch/x86/hvm/mtrr.c index 3bd0dc9d7c..4e50680022 100644 --- a/xen/arch/x86/hvm/mtrr.c +++ b/xen/arch/x86/hvm/mtrr.c @@ -266,7 +266,7 @@ static void setup_var_mtrrs(struct vcpu *v) { if ( e820_table[i].addr == 0x100000 ) { - size = e820_table[i].size + 0x100000 + PAGE_SIZE * 4; + size = e820_table[i].size + 0x100000 + PAGE_SIZE * 5; addr = 0; } else diff --git a/xen/arch/x86/hvm/pmtimer.c b/xen/arch/x86/hvm/pmtimer.c index 8d3fff8f44..4924a80687 100644 --- a/xen/arch/x86/hvm/pmtimer.c +++ b/xen/arch/x86/hvm/pmtimer.c @@ -169,7 +169,7 @@ static int handle_evt_io( spin_unlock(&s->lock); - return 1; + return X86EMUL_OKAY; } @@ -183,7 +183,7 @@ static int handle_pmt_io( if ( bytes != 4 ) { gdprintk(XENLOG_WARNING, "HVM_PMT bad access\n"); - return 1; + return X86EMUL_OKAY; } if ( dir == IOREQ_READ ) @@ -192,10 +192,10 @@ static int handle_pmt_io( spin_lock(&s->lock); pmt_update_time(s); *val = s->pm.tmr_val; spin_unlock(&s->lock); - return 1; + return X86EMUL_OKAY; } - return 0; + return X86EMUL_UNHANDLEABLE; } static int pmtimer_save(struct domain *d, hvm_domain_context_t *h) diff --git a/xen/arch/x86/hvm/rtc.c b/xen/arch/x86/hvm/rtc.c index b9e4b4a241..e196c72866 100644 --- a/xen/arch/x86/hvm/rtc.c +++ b/xen/arch/x86/hvm/rtc.c @@ -403,21 +403,21 @@ static int handle_rtc_io( if ( bytes != 1 ) { gdprintk(XENLOG_WARNING, "HVM_RTC bas access\n"); - return 1; + return X86EMUL_OKAY; } if ( dir == IOREQ_WRITE ) { if ( rtc_ioport_write(vrtc, port, (uint8_t)*val) ) - return 1; + return X86EMUL_OKAY; } else if ( vrtc->hw.cmos_index < RTC_CMOS_SIZE ) { *val = rtc_ioport_read(vrtc, port); - return 1; + return X86EMUL_OKAY; } - return 0; + return X86EMUL_UNHANDLEABLE; } void rtc_migrate_timers(struct vcpu *v) diff --git a/xen/arch/x86/hvm/stdvga.c b/xen/arch/x86/hvm/stdvga.c index 
56260c5c77..25b16bddac 100644 --- a/xen/arch/x86/hvm/stdvga.c +++ b/xen/arch/x86/hvm/stdvga.c @@ -32,6 +32,7 @@ #include <xen/sched.h> #include <xen/domain_page.h> #include <asm/hvm/support.h> +#include <xen/numa.h> #define PAT(x) (x) static const uint32_t mask16[16] = { @@ -166,19 +167,19 @@ static void stdvga_out(uint32_t port, uint32_t bytes, uint32_t val) } } -int stdvga_intercept_pio( +static int stdvga_intercept_pio( int dir, uint32_t port, uint32_t bytes, uint32_t *val) { struct hvm_hw_stdvga *s = &current->domain->arch.hvm_domain.stdvga; - if ( dir == IOREQ_READ ) - return 0; - - spin_lock(&s->lock); - stdvga_out(port, bytes, *val); - spin_unlock(&s->lock); + if ( dir == IOREQ_WRITE ) + { + spin_lock(&s->lock); + stdvga_out(port, bytes, *val); + spin_unlock(&s->lock); + } - return 0; /* propagate to external ioemu */ + return X86EMUL_UNHANDLEABLE; /* propagate to external ioemu */ } #define GET_PLANE(data, p) (((data) >> ((p) * 8)) & 0xff) @@ -458,7 +459,7 @@ static int mmio_move(struct hvm_hw_stdvga *s, ioreq_t *p) return 1; } -int stdvga_intercept_mmio(ioreq_t *p) +static int stdvga_intercept_mmio(ioreq_t *p) { struct domain *d = current->domain; struct hvm_hw_stdvga *s = &d->arch.hvm_domain.stdvga; @@ -467,7 +468,7 @@ int stdvga_intercept_mmio(ioreq_t *p) if ( p->size > 8 ) { gdprintk(XENLOG_WARNING, "invalid mmio size %d\n", (int)p->size); - return 0; + return X86EMUL_UNHANDLEABLE; } spin_lock(&s->lock); @@ -498,7 +499,7 @@ int stdvga_intercept_mmio(ioreq_t *p) spin_unlock(&s->lock); - return rc; + return rc ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE; } void stdvga_init(struct domain *d) @@ -513,7 +514,8 @@ void stdvga_init(struct domain *d) for ( i = 0; i != ARRAY_SIZE(s->vram_page); i++ ) { - if ( (pg = alloc_domheap_page(NULL)) == NULL ) + pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d))); + if ( pg == NULL ) break; s->vram_page[i] = pg; p = map_domain_page(page_to_mfn(pg)); diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index be166a868c..7c10127966 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -255,11 +255,6 @@ static int svm_vmcb_restore(struct vcpu *v, struct hvm_hw_cpu *c) svm_update_guest_cr(v, 2); svm_update_guest_cr(v, 4); -#ifdef HVM_DEBUG_SUSPEND - printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n", - __func__, c->cr3, c->cr0, c->cr4); -#endif - vmcb->sysenter_cs = c->sysenter_cs; vmcb->sysenter_esp = c->sysenter_esp; vmcb->sysenter_eip = c->sysenter_eip; @@ -472,7 +467,7 @@ static void svm_get_segment_register(struct vcpu *v, enum x86_segment seg, { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - ASSERT(v == current); + ASSERT((v == current) || !vcpu_runnable(v)); switch ( seg ) { diff --git a/xen/arch/x86/hvm/vioapic.c b/xen/arch/x86/hvm/vioapic.c index c01618c69f..8ebaa260cf 100644 --- a/xen/arch/x86/hvm/vioapic.c +++ b/xen/arch/x86/hvm/vioapic.c @@ -88,9 +88,9 @@ static unsigned long vioapic_read_indirect(struct hvm_hw_vioapic *vioapic, return result; } -static unsigned long vioapic_read(struct vcpu *v, - unsigned long addr, - unsigned long length) +static int vioapic_read( + struct vcpu *v, unsigned long addr, + unsigned long length, unsigned long *pval) { struct hvm_hw_vioapic *vioapic = domain_vioapic(v->domain); uint32_t result; @@ -114,11 +114,13 @@ static unsigned long vioapic_read(struct vcpu *v, break; } - return result; + *pval = result; + return X86EMUL_OKAY; } static void vioapic_write_redirent( - struct hvm_hw_vioapic *vioapic, unsigned int idx, int top_word, uint32_t val) + struct 
hvm_hw_vioapic *vioapic, unsigned int idx, + int top_word, uint32_t val) { struct domain *d = vioapic_domain(vioapic); struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; @@ -196,10 +198,9 @@ static void vioapic_write_indirect( } } -static void vioapic_write(struct vcpu *v, - unsigned long addr, - unsigned long length, - unsigned long val) +static int vioapic_write( + struct vcpu *v, unsigned long addr, + unsigned long length, unsigned long val) { struct hvm_hw_vioapic *vioapic = domain_vioapic(v->domain); @@ -224,6 +225,8 @@ static void vioapic_write(struct vcpu *v, default: break; } + + return X86EMUL_OKAY; } static int vioapic_range(struct vcpu *v, unsigned long addr) @@ -477,45 +480,16 @@ void vioapic_update_EOI(struct domain *d, int vector) spin_unlock(&d->arch.hvm_domain.irq_lock); } -#ifdef HVM_DEBUG_SUSPEND -static void ioapic_info(struct hvm_hw_vioapic *s) -{ - int i; - printk("*****ioapic state:*****\n"); - printk("ioapic 0x%x.\n", s->ioregsel); - printk("ioapic 0x%x.\n", s->id); - printk("ioapic 0x%lx.\n", s->base_address); - for (i = 0; i < VIOAPIC_NUM_PINS; i++) { - printk("ioapic redirtbl[%d]:0x%"PRIx64"\n", i, s->redirtbl[i].bits); - } - -} -#else -static void ioapic_info(struct hvm_hw_vioapic *s) -{ -} -#endif - - static int ioapic_save(struct domain *d, hvm_domain_context_t *h) { struct hvm_hw_vioapic *s = domain_vioapic(d); - ioapic_info(s); - - /* save io-apic state*/ - return ( hvm_save_entry(IOAPIC, 0, h, s) ); + return hvm_save_entry(IOAPIC, 0, h, s); } static int ioapic_load(struct domain *d, hvm_domain_context_t *h) { struct hvm_hw_vioapic *s = domain_vioapic(d); - - /* restore ioapic state */ - if ( hvm_load_entry(IOAPIC, h, s) != 0 ) - return -EINVAL; - - ioapic_info(s); - return 0; + return hvm_load_entry(IOAPIC, h, s); } HVM_REGISTER_SAVE_RESTORE(IOAPIC, ioapic_save, ioapic_load, 1, HVMSR_PER_DOM); diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c index bf53ba7a1a..9bfc2cc3d1 100644 --- a/xen/arch/x86/hvm/vlapic.c +++ b/xen/arch/x86/hvm/vlapic.c @@ -33,6 +33,7 @@ #include <xen/sched.h> #include <asm/current.h> #include <asm/hvm/vmx/vmx.h> +#include <xen/numa.h> #include <public/hvm/ioreq.h> #include <public/hvm/params.h> @@ -240,12 +241,145 @@ static int vlapic_match_dest(struct vcpu *v, struct vlapic *source, return result; } +static int vlapic_vcpu_pause_async(struct vcpu *v) +{ + vcpu_pause_nosync(v); + + if ( v->is_running ) + { + vcpu_unpause(v); + return 0; + } + + sync_vcpu_execstate(v); + return 1; +} + +static void vlapic_init_action(unsigned long _vcpu) +{ + struct vcpu *v = (struct vcpu *)_vcpu; + struct domain *d = v->domain; + + /* If the VCPU is not on its way down we have nothing to do. */ + if ( !test_bit(_VPF_down, &v->pause_flags) ) + return; + + if ( !vlapic_vcpu_pause_async(v) ) + { + tasklet_schedule(&vcpu_vlapic(v)->init_tasklet); + return; + } + + domain_lock(d); + + /* Paranoia makes us re-assert VPF_down under the domain lock. */ + set_bit(_VPF_down, &v->pause_flags); + v->is_initialised = 0; + clear_bit(_VPF_blocked, &v->pause_flags); + + vlapic_reset(vcpu_vlapic(v)); + + domain_unlock(d); + + vcpu_unpause(v); +} + +static int vlapic_accept_init(struct vcpu *v) +{ + /* Nothing to do if the VCPU is already reset. */ + if ( !v->is_initialised ) + return X86EMUL_OKAY; + + /* Asynchronously take the VCPU down and schedule reset work. 
*/ + hvm_vcpu_down(v); + tasklet_schedule(&vcpu_vlapic(v)->init_tasklet); + return X86EMUL_RETRY; +} + +static int vlapic_accept_sipi(struct vcpu *v, int trampoline_vector) +{ + struct domain *d = current->domain; + struct vcpu_guest_context *ctxt; + struct segment_register reg; + + /* If the VCPU is not on its way down we have nothing to do. */ + if ( !test_bit(_VPF_down, &v->pause_flags) ) + return X86EMUL_OKAY; + + if ( !vlapic_vcpu_pause_async(v) ) + return X86EMUL_RETRY; + + domain_lock(d); + + if ( v->is_initialised ) + goto out; + + ctxt = &v->arch.guest_context; + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->flags = VGCF_online; + ctxt->user_regs.eflags = 2; + + v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_ET; + hvm_update_guest_cr(v, 0); + + v->arch.hvm_vcpu.guest_cr[2] = 0; + hvm_update_guest_cr(v, 2); + + v->arch.hvm_vcpu.guest_cr[3] = 0; + hvm_update_guest_cr(v, 3); + + v->arch.hvm_vcpu.guest_cr[4] = 0; + hvm_update_guest_cr(v, 4); + + v->arch.hvm_vcpu.guest_efer = 0; + hvm_update_guest_efer(v); + + reg.sel = trampoline_vector << 8; + reg.base = (uint32_t)reg.sel << 4; + reg.limit = 0xffff; + reg.attr.bytes = 0x89b; + hvm_set_segment_register(v, x86_seg_cs, &reg); + + reg.sel = reg.base = 0; + reg.limit = 0xffff; + reg.attr.bytes = 0x893; + hvm_set_segment_register(v, x86_seg_ds, &reg); + hvm_set_segment_register(v, x86_seg_es, &reg); + hvm_set_segment_register(v, x86_seg_fs, &reg); + hvm_set_segment_register(v, x86_seg_gs, &reg); + hvm_set_segment_register(v, x86_seg_ss, &reg); + + reg.attr.bytes = 0x82; /* LDT */ + hvm_set_segment_register(v, x86_seg_ldtr, &reg); + + reg.attr.bytes = 0x8b; /* 32-bit TSS (busy) */ + hvm_set_segment_register(v, x86_seg_tr, &reg); + + reg.attr.bytes = 0; + hvm_set_segment_register(v, x86_seg_gdtr, &reg); + hvm_set_segment_register(v, x86_seg_idtr, &reg); + + /* Sync AP's TSC with BSP's. */ + v->arch.hvm_vcpu.cache_tsc_offset = + v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset; + hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset); + + v->arch.flags |= TF_kernel_mode; + v->is_initialised = 1; + clear_bit(_VPF_down, &v->pause_flags); + + out: + domain_unlock(d); + vcpu_unpause(v); + return X86EMUL_OKAY; +} + /* Add a pending IRQ into lapic. */ static int vlapic_accept_irq(struct vcpu *v, int delivery_mode, int vector, int level, int trig_mode) { - int result = 0; struct vlapic *vlapic = vcpu_vlapic(v); + int rc = X86EMUL_OKAY; switch ( delivery_mode ) { @@ -270,8 +404,6 @@ static int vlapic_accept_irq(struct vcpu *v, int delivery_mode, } vcpu_kick(v); - - result = 1; break; case APIC_DM_REMRD: @@ -291,43 +423,20 @@ static int vlapic_accept_irq(struct vcpu *v, int delivery_mode, /* No work on INIT de-assert for P4-type APIC. */ if ( trig_mode && !(level & APIC_INT_ASSERT) ) break; - /* FIXME How to check the situation after vcpu reset? 
*/ - if ( v->is_initialised ) - hvm_vcpu_reset(v); - v->arch.hvm_vcpu.init_sipi_sipi_state = - HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI; - result = 1; + rc = vlapic_accept_init(v); break; case APIC_DM_STARTUP: - if ( v->arch.hvm_vcpu.init_sipi_sipi_state == - HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM ) - break; - - v->arch.hvm_vcpu.init_sipi_sipi_state = - HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM; - - if ( v->is_initialised ) - { - gdprintk(XENLOG_ERR, "SIPI for initialized vcpu %x\n", v->vcpu_id); - goto exit_and_crash; - } - - if ( hvm_bringup_ap(v->vcpu_id, vector) != 0 ) - result = 0; + rc = vlapic_accept_sipi(v, vector); break; default: gdprintk(XENLOG_ERR, "TODO: unsupported delivery mode %x\n", delivery_mode); - goto exit_and_crash; + domain_crash(v->domain); } - return result; - - exit_and_crash: - domain_crash(v->domain); - return 0; + return rc; } /* This function is used by both ioapic and lapic.The bitmap is for vcpu_id. */ @@ -369,11 +478,9 @@ void vlapic_EOI_set(struct vlapic *vlapic) vioapic_update_EOI(vlapic_domain(vlapic), vector); } -static void vlapic_ipi(struct vlapic *vlapic) +static int vlapic_ipi( + struct vlapic *vlapic, uint32_t icr_low, uint32_t icr_high) { - uint32_t icr_low = vlapic_get_reg(vlapic, APIC_ICR); - uint32_t icr_high = vlapic_get_reg(vlapic, APIC_ICR2); - unsigned int dest = GET_APIC_DEST_FIELD(icr_high); unsigned int short_hand = icr_low & APIC_SHORT_MASK; unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG; @@ -385,6 +492,7 @@ static void vlapic_ipi(struct vlapic *vlapic) struct vlapic *target; struct vcpu *v; uint32_t lpr_map = 0; + int rc = X86EMUL_OKAY; HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "icr_high 0x%x, icr_low 0x%x, " "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " @@ -399,18 +507,23 @@ static void vlapic_ipi(struct vlapic *vlapic) if ( delivery_mode == APIC_DM_LOWEST ) __set_bit(v->vcpu_id, &lpr_map); else - vlapic_accept_irq(v, delivery_mode, - vector, level, trig_mode); + rc = vlapic_accept_irq(v, delivery_mode, + vector, level, trig_mode); } + + if ( rc != X86EMUL_OKAY ) + break; } if ( delivery_mode == APIC_DM_LOWEST ) { target = apic_round_robin(vlapic_domain(v), vector, lpr_map); if ( target != NULL ) - vlapic_accept_irq(vlapic_vcpu(target), delivery_mode, - vector, level, trig_mode); + rc = vlapic_accept_irq(vlapic_vcpu(target), delivery_mode, + vector, level, trig_mode); } + + return rc; } static uint32_t vlapic_get_tmcct(struct vlapic *vlapic) @@ -465,17 +578,18 @@ static void vlapic_read_aligned( } } -static unsigned long vlapic_read(struct vcpu *v, unsigned long address, - unsigned long len) +static int vlapic_read( + struct vcpu *v, unsigned long address, + unsigned long len, unsigned long *pval) { unsigned int alignment; unsigned int tmp; - unsigned long result; + unsigned long result = 0; struct vlapic *vlapic = vcpu_vlapic(v); unsigned int offset = address - vlapic_base_address(vlapic); if ( offset > (APIC_TDCR + 0x3) ) - return 0; + goto out; alignment = offset & 0x3; @@ -507,14 +621,16 @@ static unsigned long vlapic_read(struct vcpu *v, unsigned long address, HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "offset 0x%x with length 0x%lx, " "and the result is 0x%lx", offset, len, result); - return result; + out: + *pval = result; + return X86EMUL_OKAY; unaligned_exit_and_crash: gdprintk(XENLOG_ERR, "Unaligned LAPIC read len=0x%lx at offset=0x%x.\n", len, offset); exit_and_crash: domain_crash(v->domain); - return 0; + return X86EMUL_OKAY; } void vlapic_pt_cb(struct vcpu *v, void *data) @@ -522,11 +638,12 @@ void vlapic_pt_cb(struct vcpu *v, void *data) 
*(s_time_t *)data = hvm_get_guest_time(v); } -static void vlapic_write(struct vcpu *v, unsigned long address, - unsigned long len, unsigned long val) +static int vlapic_write(struct vcpu *v, unsigned long address, + unsigned long len, unsigned long val) { struct vlapic *vlapic = vcpu_vlapic(v); unsigned int offset = address - vlapic_base_address(vlapic); + int rc = X86EMUL_OKAY; if ( offset != 0xb0 ) HVM_DBG_LOG(DBG_LEVEL_VLAPIC, @@ -540,13 +657,13 @@ static void vlapic_write(struct vcpu *v, unsigned long address, val = (uint32_t)val; if ( len != 4 ) { - unsigned int tmp; + unsigned long tmp; unsigned char alignment; gdprintk(XENLOG_INFO, "Notice: Local APIC write with len = %lx\n",len); alignment = offset & 0x3; - tmp = vlapic_read(v, offset & ~0x3, 4); + (void)vlapic_read(v, offset & ~0x3, 4, &tmp); switch ( len ) { @@ -617,9 +734,10 @@ static void vlapic_write(struct vcpu *v, unsigned long address, break; case APIC_ICR: - /* No delay here, so we always clear the pending bit*/ - vlapic_set_reg(vlapic, APIC_ICR, val & ~(1 << 12)); - vlapic_ipi(vlapic); + val &= ~(1 << 12); /* always clear the pending bit */ + rc = vlapic_ipi(vlapic, val, vlapic_get_reg(vlapic, APIC_ICR2)); + if ( rc == X86EMUL_OKAY ) + vlapic_set_reg(vlapic, APIC_ICR, val); break; case APIC_ICR2: @@ -669,13 +787,14 @@ static void vlapic_write(struct vcpu *v, unsigned long address, break; } - return; + return rc; unaligned_exit_and_crash: gdprintk(XENLOG_ERR, "Unaligned LAPIC write len=0x%lx at offset=0x%x.\n", len, offset); exit_and_crash: domain_crash(v->domain); + return rc; } static int vlapic_range(struct vcpu *v, unsigned long addr) @@ -788,77 +907,58 @@ void vlapic_reset(struct vlapic *vlapic) vlapic_set_reg(vlapic, APIC_SPIV, 0xff); vlapic->hw.disabled |= VLAPIC_SW_DISABLED; -} -#ifdef HVM_DEBUG_SUSPEND -static void lapic_info(struct vlapic *s) -{ - printk("*****lapic state:*****\n"); - printk("lapic 0x%"PRIx64".\n", s->hw.apic_base_msr); - printk("lapic 0x%x.\n", s->hw.disabled); - printk("lapic 0x%x.\n", s->hw.timer_divisor); -} -#else -static void lapic_info(struct vlapic *s) -{ + destroy_periodic_time(&vlapic->pt); } -#endif /* rearm the actimer if needed, after a HVM restore */ static void lapic_rearm(struct vlapic *s) { - unsigned long tmict; + unsigned long tmict = vlapic_get_reg(s, APIC_TMICT); + uint64_t period; - tmict = vlapic_get_reg(s, APIC_TMICT); - if ( tmict > 0 ) - { - uint64_t period = (uint64_t)APIC_BUS_CYCLE_NS * - (uint32_t)tmict * s->hw.timer_divisor; - uint32_t lvtt = vlapic_get_reg(s, APIC_LVTT); - - s->pt.irq = lvtt & APIC_VECTOR_MASK; - create_periodic_time(vlapic_vcpu(s), &s->pt, period, s->pt.irq, - !vlapic_lvtt_period(s), vlapic_pt_cb, - &s->timer_last_update); - s->timer_last_update = s->pt.last_plt_gtime; - - printk("lapic_load to rearm the actimer:" - "bus cycle is %uns, " - "saved tmict count %lu, period %"PRIu64"ns, irq=%"PRIu8"\n", - APIC_BUS_CYCLE_NS, tmict, period, s->pt.irq); - } + if ( (tmict = vlapic_get_reg(s, APIC_TMICT)) == 0 ) + return; - lapic_info(s); + period = ((uint64_t)APIC_BUS_CYCLE_NS * + (uint32_t)tmict * s->hw.timer_divisor); + s->pt.irq = vlapic_get_reg(s, APIC_LVTT) & APIC_VECTOR_MASK; + create_periodic_time(vlapic_vcpu(s), &s->pt, period, s->pt.irq, + !vlapic_lvtt_period(s), vlapic_pt_cb, + &s->timer_last_update); + s->timer_last_update = s->pt.last_plt_gtime; } static int lapic_save_hidden(struct domain *d, hvm_domain_context_t *h) { struct vcpu *v; struct vlapic *s; + int rc = 0; - for_each_vcpu(d, v) + for_each_vcpu ( d, v ) { s = vcpu_vlapic(v); - 
lapic_info(s); - - if ( hvm_save_entry(LAPIC, v->vcpu_id, h, &s->hw) != 0 ) - return 1; + if ( (rc = hvm_save_entry(LAPIC, v->vcpu_id, h, &s->hw)) != 0 ) + break; } - return 0; + + return rc; } static int lapic_save_regs(struct domain *d, hvm_domain_context_t *h) { struct vcpu *v; struct vlapic *s; + int rc = 0; - for_each_vcpu(d, v) + for_each_vcpu ( d, v ) { s = vcpu_vlapic(v); - if ( hvm_save_entry(LAPIC_REGS, v->vcpu_id, h, s->regs) != 0 ) - return 1; + if ( (rc = hvm_save_entry(LAPIC_REGS, v->vcpu_id, h, s->regs)) != 0 ) + break; } - return 0; + + return rc; } static int lapic_load_hidden(struct domain *d, hvm_domain_context_t *h) @@ -879,8 +979,6 @@ static int lapic_load_hidden(struct domain *d, hvm_domain_context_t *h) if ( hvm_load_entry(LAPIC, h, &s->hw) != 0 ) return -EINVAL; - lapic_info(s); - vmx_vlapic_msr_changed(v); return 0; @@ -916,7 +1014,7 @@ HVM_REGISTER_SAVE_RESTORE(LAPIC_REGS, lapic_save_regs, lapic_load_regs, int vlapic_init(struct vcpu *v) { struct vlapic *vlapic = vcpu_vlapic(v); - unsigned int memflags = 0; + unsigned int memflags = MEMF_node(vcpu_to_node(v)); HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "%d", v->vcpu_id); @@ -925,10 +1023,10 @@ int vlapic_init(struct vcpu *v) #ifdef __i386__ /* 32-bit VMX may be limited to 32-bit physical addresses. */ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) - memflags = MEMF_bits(32); + memflags |= MEMF_bits(32); #endif - vlapic->regs_page = alloc_domheap_pages(NULL, 0, memflags); + vlapic->regs_page = alloc_domheap_page(NULL, memflags); if ( vlapic->regs_page == NULL ) { dprintk(XENLOG_ERR, "alloc vlapic regs error: %d/%d\n", @@ -941,7 +1039,7 @@ int vlapic_init(struct vcpu *v) { dprintk(XENLOG_ERR, "map vlapic regs error: %d/%d\n", v->domain->domain_id, v->vcpu_id); - return -ENOMEM; + return -ENOMEM; } clear_page(vlapic->regs); @@ -953,6 +1051,8 @@ int vlapic_init(struct vcpu *v) if ( v->vcpu_id == 0 ) vlapic->hw.apic_base_msr |= MSR_IA32_APICBASE_BSP; + tasklet_init(&vlapic->init_tasklet, vlapic_init_action, (unsigned long)v); + return 0; } @@ -960,6 +1060,7 @@ void vlapic_destroy(struct vcpu *v) { struct vlapic *vlapic = vcpu_vlapic(v); + tasklet_kill(&vlapic->init_tasklet); destroy_periodic_time(&vlapic->pt); unmap_domain_page_global(vlapic->regs); free_domheap_page(vlapic->regs_page); diff --git a/xen/arch/x86/hvm/vmx/realmode.c b/xen/arch/x86/hvm/vmx/realmode.c index c00e8b1e42..5d13f4e60b 100644 --- a/xen/arch/x86/hvm/vmx/realmode.c +++ b/xen/arch/x86/hvm/vmx/realmode.c @@ -172,7 +172,7 @@ static void realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt) hvmemul_ctxt->insn_buf[0], hvmemul_ctxt->insn_buf[1], hvmemul_ctxt->insn_buf[2], hvmemul_ctxt->insn_buf[3], hvmemul_ctxt->insn_buf[4], hvmemul_ctxt->insn_buf[5]); - domain_crash_synchronous(); + domain_crash(curr->domain); } void vmx_realmode(struct cpu_user_regs *regs) diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c index bee9eb1deb..48506c5b32 100644 --- a/xen/arch/x86/hvm/vmx/vmcs.c +++ b/xen/arch/x86/hvm/vmx/vmcs.c @@ -38,6 +38,9 @@ #include <asm/shadow.h> #include <asm/tboot.h> +static int opt_vpid_enabled = 1; +boolean_param("vpid", opt_vpid_enabled); + /* Dynamic (run-time adjusted) execution control flags. 
*/ u32 vmx_pin_based_exec_control __read_mostly; u32 vmx_cpu_based_exec_control __read_mostly; @@ -84,14 +87,16 @@ static void vmx_init_vmcs_config(void) min = (CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | CPU_BASED_MONITOR_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_ACTIVATE_IO_BITMAP | CPU_BASED_USE_TSC_OFFSETING); - opt = CPU_BASED_ACTIVATE_MSR_BITMAP; - opt |= CPU_BASED_TPR_SHADOW; - opt |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + opt = (CPU_BASED_ACTIVATE_MSR_BITMAP | + CPU_BASED_TPR_SHADOW | + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); _vmx_cpu_based_exec_control = adjust_vmx_controls( min, opt, MSR_IA32_VMX_PROCBASED_CTLS); #ifdef __x86_64__ @@ -107,11 +112,25 @@ static void vmx_init_vmcs_config(void) { min = 0; opt = (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_WBINVD_EXITING); + SECONDARY_EXEC_WBINVD_EXITING | + SECONDARY_EXEC_ENABLE_EPT); + if ( opt_vpid_enabled ) + opt |= SECONDARY_EXEC_ENABLE_VPID; _vmx_secondary_exec_control = adjust_vmx_controls( min, opt, MSR_IA32_VMX_PROCBASED_CTLS2); } + if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT ) + { + /* To use EPT we expect to be able to clear certain intercepts. */ + uint32_t must_be_one, must_be_zero; + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, must_be_one, must_be_zero); + if ( must_be_one & (CPU_BASED_INVLPG_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING) ) + _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + } + #if defined(__i386__) /* If we can't virtualise APIC accesses, the TPR shadow is pointless. */ if ( !(_vmx_secondary_exec_control & @@ -301,6 +320,10 @@ int vmx_cpu_up(void) return 0; } + ept_sync_all(); + + vpid_sync_all(); + return 1; } @@ -439,6 +462,7 @@ void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr) static int construct_vmcs(struct vcpu *v) { + struct domain *d = v->domain; uint16_t sysenter_cs; unsigned long sysenter_eip; @@ -448,10 +472,25 @@ static int construct_vmcs(struct vcpu *v) __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control); __vmwrite(VM_EXIT_CONTROLS, vmx_vmexit_control); __vmwrite(VM_ENTRY_CONTROLS, vmx_vmentry_control); - __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmx_cpu_based_exec_control); + v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control; - if ( vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS ) - __vmwrite(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control); + v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control; + + if ( paging_mode_hap(d) ) + { + v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); + } + else + { + v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + } + + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control); + if ( cpu_has_vmx_secondary_exec_control ) + __vmwrite(SECONDARY_VM_EXEC_CONTROL, + v->arch.hvm_vmx.secondary_exec_control); /* MSR access bitmap. */ if ( cpu_has_vmx_msr_bitmap ) @@ -570,9 +609,10 @@ static int construct_vmcs(struct vcpu *v) __vmwrite(VMCS_LINK_POINTER_HIGH, ~0UL); #endif - __vmwrite(EXCEPTION_BITMAP, (HVM_TRAP_MASK | - (1U << TRAP_page_fault) | - (1U << TRAP_no_device))); + __vmwrite(EXCEPTION_BITMAP, + HVM_TRAP_MASK + | (paging_mode_hap(d) ? 
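adjust_vmx_controls(), called above but not part of this hunk, follows the Intel SDM layout of the VMX capability MSRs: the low 32 bits report the bits that must be 1, the high 32 bits the bits that may be 1. A minimal sketch under that assumption (u32, rdmsr and BUG_ON as in Xen's headers; not this function's exact body):

static u32 adjust_controls_sketch(u32 min, u32 opt, u32 msr)
{
    u32 lo, hi, ctl = min | opt;

    rdmsr(msr, lo, hi);
    ctl &= hi;          /* bit clear in high word => must be zero */
    ctl |= lo;          /* bit set in low word    => must be one  */

    BUG_ON(min & ~ctl); /* a required bit the CPU cannot provide is fatal */
    return ctl;
}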
0 : (1U << TRAP_page_fault)) + | (1U << TRAP_no_device)); v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET; hvm_update_guest_cr(v, 0); @@ -587,6 +627,22 @@ static int construct_vmcs(struct vcpu *v) __vmwrite(TPR_THRESHOLD, 0); } + if ( paging_mode_hap(d) ) + { + __vmwrite(EPT_POINTER, d->arch.hvm_domain.vmx.ept_control.eptp); +#ifdef CONFIG_X86_PAE + __vmwrite(EPT_POINTER_HIGH, + d->arch.hvm_domain.vmx.ept_control.eptp >> 32); +#endif + } + + if ( cpu_has_vmx_vpid ) + { + v->arch.hvm_vmx.vpid = + v->domain->arch.hvm_domain.vmx.vpid_base + v->vcpu_id; + __vmwrite(VIRTUAL_PROCESSOR_ID, v->arch.hvm_vmx.vpid); + } + vmx_vmcs_exit(v); paging_update_paging_modes(v); /* will update HOST & GUEST_CR3 as reqd */ @@ -729,14 +785,14 @@ void vmx_destroy_vmcs(struct vcpu *v) arch_vmx->vmcs = NULL; } -void vm_launch_fail(unsigned long eflags) +void vm_launch_fail(void) { unsigned long error = __vmread(VM_INSTRUCTION_ERROR); printk("<vm_launch_fail> error code %lx\n", error); domain_crash_synchronous(); } -void vm_resume_fail(unsigned long eflags) +void vm_resume_fail(void) { unsigned long error = __vmread(VM_INSTRUCTION_ERROR); printk("<vm_resume_fail> error code %lx\n", error); @@ -780,6 +836,7 @@ void vmx_do_resume(struct vcpu *v) vmx_load_vmcs(v); hvm_migrate_timers(v); vmx_set_host_env(v); + vpid_sync_vcpu_all(v); } debug_state = v->domain->debugger_attached; @@ -932,6 +989,10 @@ void vmcs_dump_vcpu(struct vcpu *v) (uint32_t)vmr(IDT_VECTORING_ERROR_CODE)); printk("TPR Threshold = 0x%02x\n", (uint32_t)vmr(TPR_THRESHOLD)); + printk("EPT pointer = 0x%08x%08x\n", + (uint32_t)vmr(EPT_POINTER_HIGH), (uint32_t)vmr(EPT_POINTER)); + printk("Virtual processor ID = 0x%04x\n", + (uint32_t)vmr(VIRTUAL_PROCESSOR_ID)); vmx_vmcs_exit(v); } diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index 29dcb68503..628cbddfcf 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -57,6 +57,8 @@ static void vmx_ctxt_switch_to(struct vcpu *v); static int vmx_alloc_vlapic_mapping(struct domain *d); static void vmx_free_vlapic_mapping(struct domain *d); +static int vmx_alloc_vpid(struct domain *d); +static void vmx_free_vpid(struct domain *d); static void vmx_install_vlapic_mapping(struct vcpu *v); static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr); static void vmx_update_guest_efer(struct vcpu *v); @@ -71,12 +73,30 @@ static void vmx_invlpg_intercept(unsigned long vaddr); static int vmx_domain_initialise(struct domain *d) { - return vmx_alloc_vlapic_mapping(d); + int rc; + + d->arch.hvm_domain.vmx.ept_control.etmt = EPT_DEFAULT_MT; + d->arch.hvm_domain.vmx.ept_control.gaw = EPT_DEFAULT_GAW; + d->arch.hvm_domain.vmx.ept_control.asr = + pagetable_get_pfn(d->arch.phys_table); + + if ( (rc = vmx_alloc_vpid(d)) != 0 ) + return rc; + + if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 ) + { + vmx_free_vpid(d); + return rc; + } + + return 0; } static void vmx_domain_destroy(struct domain *d) { + ept_sync_domain(d); vmx_free_vlapic_mapping(d); + vmx_free_vpid(d); } static int vmx_vcpu_initialise(struct vcpu *v) @@ -492,20 +512,23 @@ static int vmx_restore_cr0_cr3( unsigned long mfn = 0; p2m_type_t p2mt; - if ( cr0 & X86_CR0_PG ) + if ( paging_mode_shadow(v->domain) ) { - mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt)); - if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) ) + if ( cr0 & X86_CR0_PG ) { - gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3); - return -EINVAL; + mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt)); + if ( 
!p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) ) + { + gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3); + return -EINVAL; + } } - } - if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG ) - put_page(pagetable_get_page(v->arch.guest_table)); + if ( hvm_paging_enabled(v) ) + put_page(pagetable_get_page(v->arch.guest_table)); - v->arch.guest_table = pagetable_from_pfn(mfn); + v->arch.guest_table = pagetable_from_pfn(mfn); + } v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET; v->arch.hvm_vcpu.guest_cr[3] = cr3; @@ -538,11 +561,6 @@ static int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c) vmx_update_guest_cr(v, 2); vmx_update_guest_cr(v, 4); -#ifdef HVM_DEBUG_SUSPEND - printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n", - __func__, c->cr3, c->cr0, c->cr4); -#endif - v->arch.hvm_vcpu.guest_efer = c->msr_efer; vmx_update_guest_efer(v); @@ -573,20 +591,6 @@ static int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c) return 0; } -#if defined(__x86_64__) && defined(HVM_DEBUG_SUSPEND) -static void dump_msr_state(struct vmx_msr_state *m) -{ - int i = 0; - printk("**** msr state ****\n"); - printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags); - for ( i = 0; i < VMX_MSR_COUNT; i++ ) - printk("0x%lx,", m->msrs[i]); - printk("\n"); -} -#else -#define dump_msr_state(m) ((void)0) -#endif - static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data) { #ifdef __x86_64__ @@ -604,8 +608,6 @@ static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data) #endif data->tsc = hvm_get_guest_time(v); - - dump_msr_state(guest_state); } static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data) @@ -624,8 +626,6 @@ static void vmx_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data) #endif hvm_set_guest_time(v, data->tsc); - - dump_msr_state(guest_state); } @@ -900,6 +900,56 @@ static void vmx_set_interrupt_shadow(struct vcpu *v, unsigned int intr_shadow) __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow); } +static void vmx_load_pdptrs(struct vcpu *v) +{ + unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3], mfn; + uint64_t *guest_pdptrs; + p2m_type_t p2mt; + char *p; + + /* EPT needs to load PDPTRS into VMCS for PAE. */ + if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) ) + return; + + if ( cr3 & 0x1fUL ) + goto crash; + + mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt)); + if ( !p2m_is_ram(p2mt) ) + goto crash; + + p = map_domain_page(mfn); + + guest_pdptrs = (uint64_t *)(p + (cr3 & ~PAGE_MASK)); + + /* + * We do not check the PDPTRs for validity. The CPU will do this during + * vm entry, and we can handle the failure there and crash the guest. + * The only thing we could do better here is #GP instead. 
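The checks above rely on the PAE rule that CR3 bits 4:0 are reserved (hence the cr3 & 0x1f test) and that CR3 points at a 32-byte table of four 64-bit PDPTEs within one page. The extraction in isolation (a sketch; 'page' stands for the mapped guest frame, PAGE_MASK as in Xen):

static int read_pdptes_sketch(const char *page, unsigned long cr3,
                              uint64_t pdpte[4])
{
    const uint64_t *table;
    unsigned int i;

    if ( cr3 & 0x1f )   /* CR3[4:0] must be zero under PAE */
        return -1;

    table = (const uint64_t *)(page + (cr3 & ~PAGE_MASK));
    for ( i = 0; i < 4; i++ )
        pdpte[i] = table[i];

    return 0;
}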
+ */ + + vmx_vmcs_enter(v); + + __vmwrite(GUEST_PDPTR0, guest_pdptrs[0]); + __vmwrite(GUEST_PDPTR1, guest_pdptrs[1]); + __vmwrite(GUEST_PDPTR2, guest_pdptrs[2]); + __vmwrite(GUEST_PDPTR3, guest_pdptrs[3]); +#ifdef CONFIG_X86_PAE + __vmwrite(GUEST_PDPTR0_HIGH, guest_pdptrs[0] >> 32); + __vmwrite(GUEST_PDPTR1_HIGH, guest_pdptrs[1] >> 32); + __vmwrite(GUEST_PDPTR2_HIGH, guest_pdptrs[2] >> 32); + __vmwrite(GUEST_PDPTR3_HIGH, guest_pdptrs[3] >> 32); +#endif + + vmx_vmcs_exit(v); + + unmap_domain_page(p); + return; + + crash: + domain_crash(v->domain); +} + static void vmx_update_host_cr3(struct vcpu *v) { vmx_vmcs_enter(v); @@ -915,7 +965,24 @@ static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr) { case 0: { unsigned long hw_cr0_mask = - X86_CR0_NE | X86_CR0_PG | X86_CR0_WP | X86_CR0_PE; + X86_CR0_NE | X86_CR0_PG | X86_CR0_PE; + + if ( paging_mode_shadow(v->domain) ) + hw_cr0_mask |= X86_CR0_WP; + + if ( paging_mode_hap(v->domain) ) + { + /* We manage GUEST_CR3 when guest CR0.PE is zero. */ + uint32_t cr3_ctls = (CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING); + v->arch.hvm_vmx.exec_control &= ~cr3_ctls; + if ( !hvm_paging_enabled(v) ) + v->arch.hvm_vmx.exec_control |= cr3_ctls; + __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control); + + /* Changing CR0.PE can change some bits in real CR4. */ + vmx_update_guest_cr(v, 4); + } if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) ) { @@ -939,11 +1006,27 @@ static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr) /* CR2 is updated in exit stub. */ break; case 3: + if ( paging_mode_hap(v->domain) ) + { + if ( !hvm_paging_enabled(v) ) + v->arch.hvm_vcpu.hw_cr[3] = + v->domain->arch.hvm_domain.params[HVM_PARAM_IDENT_PT]; + vmx_load_pdptrs(v); + } + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]); + vpid_sync_vcpu_all(v); break; case 4: - v->arch.hvm_vcpu.hw_cr[4] = - v->arch.hvm_vcpu.guest_cr[4] | HVM_CR4_HOST_MASK; + v->arch.hvm_vcpu.hw_cr[4] = HVM_CR4_HOST_MASK; + if ( paging_mode_hap(v->domain) ) + v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE; + v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4]; + if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) ) + { + v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE; + v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE; + } __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]); __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]); break; @@ -978,12 +1061,29 @@ static void vmx_update_guest_efer(struct vcpu *v) static void vmx_flush_guest_tlbs(void) { - /* No tagged TLB support on VMX yet. The fact that we're in Xen - * at all means any guest will have a clean TLB when it's next run, - * because VMRESUME will flush it for us. */ + /* + * If VPID (i.e. tagged TLB support) is not enabled, the fact that + * we're in Xen at all means any guest will have a clean TLB when + * it's next run, because VMRESUME will flush it for us. + * + * If enabled, we invalidate all translations associated with all + * VPID values. + */ + vpid_sync_all(); } +static void __ept_sync_domain(void *info) +{ + struct domain *d = info; + __invept(1, d->arch.hvm_domain.vmx.ept_control.eptp, 0); +} +void ept_sync_domain(struct domain *d) +{ + /* Only if using EPT and this domain has some VCPUs to dirty. 
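__invept(), used by __ept_sync_domain() above, wraps the INVEPT instruction: the invalidation type goes in a register (1 = single-context, i.e. one EPTP; 2 = all-context) and a 16-byte descriptor of { eptp, reserved } goes in memory. A sketch assuming an assembler that knows the mnemonic (this tree itself emits raw opcode bytes for the sake of older binutils):

static inline void invept_sketch(unsigned long type, uint64_t eptp)
{
    struct {
        uint64_t eptp, reserved;
    } desc = { eptp, 0 };

    /* AT&T operand order: descriptor in memory, type in a register. */
    asm volatile ( "invept %0, %1"
                   : : "m" (desc), "r" (type) : "memory" );
}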
*/ + if ( d->arch.hvm_domain.hap_enabled && d->vcpu[0] ) + on_each_cpu(__ept_sync_domain, d, 1, 1); +} static void __vmx_inject_exception( struct vcpu *v, int trap, int type, int error_code) @@ -1100,6 +1200,9 @@ static struct hvm_function_table vmx_function_table = { .invlpg_intercept = vmx_invlpg_intercept }; +static unsigned long *vpid_bitmap; +#define VPID_BITMAP_SIZE ((1u << VMCS_VPID_WIDTH) / MAX_VIRT_CPUS) + void start_vmx(void) { static int bootstrapped; @@ -1133,6 +1236,25 @@ void start_vmx(void) return; } + if ( cpu_has_vmx_ept ) + { + printk("VMX: EPT is available.\n"); + vmx_function_table.hap_supported = 1; + } + + if ( cpu_has_vmx_vpid ) + { + printk("VMX: VPID is available.\n"); + + vpid_bitmap = xmalloc_array( + unsigned long, BITS_TO_LONGS(VPID_BITMAP_SIZE)); + BUG_ON(vpid_bitmap == NULL); + memset(vpid_bitmap, 0, BITS_TO_LONGS(VPID_BITMAP_SIZE) * sizeof(long)); + + /* VPID 0 is used by VMX root mode (the hypervisor). */ + __set_bit(0, vpid_bitmap); + } + setup_vmcs_dump(); hvm_enable(&vmx_function_table); @@ -1635,18 +1757,47 @@ static int vmx_alloc_vlapic_mapping(struct domain *d) share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable); set_mmio_p2m_entry( d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(virt_to_mfn(apic_va))); - d->arch.hvm_domain.vmx_apic_access_mfn = virt_to_mfn(apic_va); + d->arch.hvm_domain.vmx.apic_access_mfn = virt_to_mfn(apic_va); return 0; } static void vmx_free_vlapic_mapping(struct domain *d) { - unsigned long mfn = d->arch.hvm_domain.vmx_apic_access_mfn; + unsigned long mfn = d->arch.hvm_domain.vmx.apic_access_mfn; if ( mfn != 0 ) free_xenheap_page(mfn_to_virt(mfn)); } +static int vmx_alloc_vpid(struct domain *d) +{ + int idx; + + if ( !cpu_has_vmx_vpid ) + return 0; + + do { + idx = find_first_zero_bit(vpid_bitmap, VPID_BITMAP_SIZE); + if ( idx >= VPID_BITMAP_SIZE ) + { + dprintk(XENLOG_WARNING, "VMX VPID space exhausted.\n"); + return -EBUSY; + } + } + while ( test_and_set_bit(idx, vpid_bitmap) ); + + d->arch.hvm_domain.vmx.vpid_base = idx * MAX_VIRT_CPUS; + return 0; +} + +static void vmx_free_vpid(struct domain *d) +{ + if ( !cpu_has_vmx_vpid ) + return; + + clear_bit(d->arch.hvm_domain.vmx.vpid_base / MAX_VIRT_CPUS, vpid_bitmap); +} + static void vmx_install_vlapic_mapping(struct vcpu *v) { paddr_t virt_page_ma, apic_page_ma; @@ -1655,7 +1806,7 @@ static void vmx_install_vlapic_mapping(struct vcpu *v) return; virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page); - apic_page_ma = v->domain->arch.hvm_domain.vmx_apic_access_mfn; + apic_page_ma = v->domain->arch.hvm_domain.vmx.apic_access_mfn; apic_page_ma <<= PAGE_SHIFT; vmx_vmcs_enter(v); @@ -1900,6 +2051,51 @@ static void vmx_wbinvd_intercept(void) wbinvd(); } +static void ept_handle_violation(unsigned long qualification, paddr_t gpa) +{ + unsigned long gla_validity = qualification & EPT_GLA_VALIDITY_MASK; + struct domain *d = current->domain; + unsigned long gfn = gpa >> PAGE_SHIFT; + mfn_t mfn; + p2m_type_t t; + + if ( unlikely(qualification & EPT_GAW_VIOLATION) ) + { + gdprintk(XENLOG_ERR, "EPT violation: guest physical address %"PRIpaddr + " exceeded its width limit.\n", gpa); + goto crash; + } + + if ( unlikely(gla_validity == EPT_GLA_VALIDITY_RSVD) || + unlikely(gla_validity == EPT_GLA_VALIDITY_PDPTR_LOAD) ) + { + gdprintk(XENLOG_ERR, "EPT violation: reserved bit or " + "pdptr load violation.\n"); + goto crash; + } + + mfn = gfn_to_mfn(d, gfn, &t); + if ( p2m_is_ram(t) && paging_mode_log_dirty(d) ) + { + paging_mark_dirty(d, mfn_x(mfn)); + p2m_change_type(d, gfn, 
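vmx_alloc_vpid() above claims a VPID slot without taking a lock: scan for a zero bit, then atomically test-and-set it, and retry if another CPU won the race. The same pattern in isolation (claim_free_slot is a hypothetical name; bitops as in Xen):

static int claim_free_slot(unsigned long *bitmap, unsigned int size)
{
    unsigned int idx;

    do {
        idx = find_first_zero_bit(bitmap, size);
        if ( idx >= size )
            return -1;  /* space exhausted */
    } while ( test_and_set_bit(idx, bitmap) );  /* raced: scan again */

    return idx;
}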
p2m_ram_logdirty, p2m_ram_rw); + flush_tlb_mask(d->domain_dirty_cpumask); + return; + } + + /* This can only happen in log-dirty mode, writing back A/D bits. */ + if ( unlikely(gla_validity == EPT_GLA_VALIDITY_GPT_WALK) ) + goto crash; + + ASSERT(gla_validity == EPT_GLA_VALIDITY_MATCH); + handle_mmio(); + + return; + + crash: + domain_crash(d); +} + static void vmx_failed_vmentry(unsigned int exit_reason, struct cpu_user_regs *regs) { @@ -1939,6 +2135,10 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs) unsigned long exit_qualification, inst_len = 0; struct vcpu *v = current; + if ( paging_mode_hap(v->domain) && hvm_paging_enabled(v) ) + v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] = + __vmread(GUEST_CR3); + exit_reason = __vmread(VM_EXIT_REASON); hvmtrace_vmexit(v, regs->eip, exit_reason); @@ -2171,6 +2371,17 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs) break; } + case EXIT_REASON_EPT_VIOLATION: + { + paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS); +#ifdef CONFIG_X86_PAE + gpa |= (paddr_t)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32; +#endif + exit_qualification = __vmread(EXIT_QUALIFICATION); + ept_handle_violation(exit_qualification, gpa); + break; + } + default: exit_and_crash: gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason); diff --git a/xen/arch/x86/hvm/vmx/x86_32/exits.S b/xen/arch/x86/hvm/vmx/x86_32/exits.S index 11db8cfc21..eff089a112 100644 --- a/xen/arch/x86/hvm/vmx/x86_32/exits.S +++ b/xen/arch/x86/hvm/vmx/x86_32/exits.S @@ -129,7 +129,6 @@ ENTRY(vmx_asm_do_vmentry) /*vmx_resume:*/ HVM_RESTORE_ALL_NOSEGREGS VMRESUME - pushf call vm_resume_fail ud2 @@ -137,7 +136,6 @@ vmx_launch: movb $1,VCPU_vmx_launched(%ebx) HVM_RESTORE_ALL_NOSEGREGS VMLAUNCH - pushf call vm_launch_fail ud2 diff --git a/xen/arch/x86/hvm/vmx/x86_64/exits.S b/xen/arch/x86/hvm/vmx/x86_64/exits.S index 48da4869bd..56fdb8ad54 100644 --- a/xen/arch/x86/hvm/vmx/x86_64/exits.S +++ b/xen/arch/x86/hvm/vmx/x86_64/exits.S @@ -148,7 +148,6 @@ ENTRY(vmx_asm_do_vmentry) /*vmx_resume:*/ HVM_RESTORE_ALL_NOSEGREGS VMRESUME - pushfq call vm_resume_fail ud2 @@ -156,7 +155,6 @@ vmx_launch: movb $1,VCPU_vmx_launched(%rbx) HVM_RESTORE_ALL_NOSEGREGS VMLAUNCH - pushfq call vm_launch_fail ud2 diff --git a/xen/arch/x86/hvm/vpic.c b/xen/arch/x86/hvm/vpic.c index ce3943eaab..a3d6f2d9ca 100644 --- a/xen/arch/x86/hvm/vpic.c +++ b/xen/arch/x86/hvm/vpic.c @@ -319,7 +319,7 @@ static int vpic_intercept_pic_io( if ( bytes != 1 ) { gdprintk(XENLOG_WARNING, "PIC_IO bad access size %d\n", bytes); - return 1; + return X86EMUL_OKAY; } vpic = ¤t->domain->arch.hvm_domain.vpic[port >> 7]; @@ -329,7 +329,7 @@ static int vpic_intercept_pic_io( else *val = (uint8_t)vpic_ioport_read(vpic, port); - return 1; + return X86EMUL_OKAY; } static int vpic_intercept_elcr_io( @@ -338,11 +338,7 @@ static int vpic_intercept_elcr_io( struct hvm_hw_vpic *vpic; uint32_t data; - if ( bytes != 1 ) - { - gdprintk(XENLOG_WARNING, "PIC_IO bad access size %d\n", bytes); - return 1; - } + BUG_ON(bytes != 1); vpic = ¤t->domain->arch.hvm_domain.vpic[port & 1]; @@ -360,34 +356,8 @@ static int vpic_intercept_elcr_io( *val = vpic->elcr & vpic_elcr_mask(vpic); } - return 1; -} - -#ifdef HVM_DEBUG_SUSPEND -static void vpic_info(struct hvm_hw_vpic *s) -{ - printk("*****pic state:*****\n"); - printk("pic 0x%x.\n", s->irr); - printk("pic 0x%x.\n", s->imr); - printk("pic 0x%x.\n", s->isr); - printk("pic 0x%x.\n", s->irq_base); - printk("pic 0x%x.\n", s->init_state); - printk("pic 0x%x.\n", s->priority_add); - printk("pic 
0x%x.\n", s->readsel_isr); - printk("pic 0x%x.\n", s->poll); - printk("pic 0x%x.\n", s->auto_eoi); - printk("pic 0x%x.\n", s->rotate_on_auto_eoi); - printk("pic 0x%x.\n", s->special_fully_nested_mode); - printk("pic 0x%x.\n", s->special_mask_mode); - printk("pic 0x%x.\n", s->elcr); - printk("pic 0x%x.\n", s->int_output); - printk("pic 0x%x.\n", s->is_master); -} -#else -static void vpic_info(struct hvm_hw_vpic *s) -{ + return X86EMUL_OKAY; } -#endif static int vpic_save(struct domain *d, hvm_domain_context_t *h) { @@ -398,7 +368,6 @@ static int vpic_save(struct domain *d, hvm_domain_context_t *h) for ( i = 0; i < 2 ; i++ ) { s = &d->arch.hvm_domain.vpic[i]; - vpic_info(s); if ( hvm_save_entry(PIC, i, h, s) ) return 1; } @@ -421,7 +390,6 @@ static int vpic_load(struct domain *d, hvm_domain_context_t *h) if ( hvm_load_entry(PIC, h, s) != 0 ) return -EINVAL; - vpic_info(s); return 0; } diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c index 9ccbefd22a..b7e50ae8f1 100644 --- a/xen/arch/x86/io_apic.c +++ b/xen/arch/x86/io_apic.c @@ -1244,7 +1244,11 @@ static void __init setup_ioapic_ids_from_mpc(void) { } */ static int __init timer_irq_works(void) { - unsigned long t1 = jiffies; + extern unsigned long pit0_ticks; + unsigned long t1; + + t1 = pit0_ticks; + mb(); local_irq_enable(); /* Let ten ticks pass... */ @@ -1257,7 +1261,8 @@ static int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. */ - if (jiffies - t1 > 4) + mb(); + if (pit0_ticks - t1 > 4) return 1; return 0; diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index a1220af3b3..15f2cf57eb 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -299,7 +299,7 @@ int memory_is_conventional_ram(paddr_t p) unsigned long domain_get_maximum_gpfn(struct domain *d) { if ( is_hvm_domain(d) ) - return d->arch.p2m.max_mapped_pfn; + return d->arch.p2m->max_mapped_pfn; /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */ return arch_get_max_pfn(d) - 1; } @@ -476,7 +476,7 @@ static void invalidate_shadow_ldt(struct vcpu *v) if ( pfn == 0 ) continue; l1e_write(&v->arch.perdomain_ptes[i], l1e_empty()); page = mfn_to_page(pfn); - ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page); + ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page); ASSERT_PAGE_IS_DOMAIN(page, v->domain); put_page_and_type(page); } @@ -530,7 +530,7 @@ int map_ldt_shadow_page(unsigned int off) if ( unlikely(!mfn_valid(mfn)) ) return 0; - okay = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page); + okay = get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page); if ( unlikely(!okay) ) return 0; @@ -924,7 +924,7 @@ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d) { /* We expect this is rare so we blow the entire shadow LDT. 
*/ if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == - PGT_ldt_page)) && + PGT_seg_desc_page)) && unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) && (d == e) ) { @@ -1748,8 +1748,7 @@ static int alloc_page_type(struct page_info *page, unsigned long type) return alloc_l3_table(page); case PGT_l4_page_table: return alloc_l4_table(page); - case PGT_gdt_page: - case PGT_ldt_page: + case PGT_seg_desc_page: return alloc_segdesc_page(page); default: printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n", @@ -2189,7 +2188,7 @@ int do_mmuext_op( goto out; } - LOCK_BIGLOCK(d); + domain_lock(d); for ( i = 0; i < count; i++ ) { @@ -2438,7 +2437,7 @@ int do_mmuext_op( process_deferred_ops(); - UNLOCK_BIGLOCK(d); + domain_unlock(d); perfc_add(num_mmuext_ops, i); @@ -2493,7 +2492,7 @@ int do_mmu_update( domain_mmap_cache_init(&mapcache); - LOCK_BIGLOCK(d); + domain_lock(d); for ( i = 0; i < count; i++ ) { @@ -2665,7 +2664,7 @@ int do_mmu_update( process_deferred_ops(); - UNLOCK_BIGLOCK(d); + domain_unlock(d); domain_mmap_cache_destroy(&mapcache); @@ -2694,7 +2693,7 @@ static int create_grant_pte_mapping( l1_pgentry_t ol1e; struct domain *d = v->domain; - ASSERT(spin_is_locked(&d->big_lock)); + ASSERT(domain_is_locked(d)); adjust_guest_l1e(nl1e, d); @@ -2817,7 +2816,7 @@ static int create_grant_va_mapping( unsigned long gl1mfn; int okay; - ASSERT(spin_is_locked(&d->big_lock)); + ASSERT(domain_is_locked(d)); adjust_guest_l1e(nl1e, d); @@ -3015,7 +3014,7 @@ int do_update_va_mapping(unsigned long va, u64 val64, if ( rc ) return rc; - LOCK_BIGLOCK(d); + domain_lock(d); pl1e = guest_map_l1e(v, va, &gl1mfn); @@ -3028,7 +3027,7 @@ int do_update_va_mapping(unsigned long va, u64 val64, process_deferred_ops(); - UNLOCK_BIGLOCK(d); + domain_unlock(d); switch ( flags & UVMF_FLUSHTYPE_MASK ) { @@ -3134,7 +3133,7 @@ long set_gdt(struct vcpu *v, { mfn = frames[i] = gmfn_to_mfn(d, frames[i]); if ( !mfn_valid(mfn) || - !get_page_and_type(mfn_to_page(mfn), d, PGT_gdt_page) ) + !get_page_and_type(mfn_to_page(mfn), d, PGT_seg_desc_page) ) goto fail; } @@ -3173,12 +3172,12 @@ long do_set_gdt(XEN_GUEST_HANDLE(ulong) frame_list, unsigned int entries) if ( copy_from_guest(frames, frame_list, nr_pages) ) return -EFAULT; - LOCK_BIGLOCK(curr->domain); + domain_lock(curr->domain); if ( (ret = set_gdt(curr, frames, entries)) == 0 ) flush_tlb_local(); - UNLOCK_BIGLOCK(curr->domain); + domain_unlock(curr->domain); return ret; } @@ -3211,12 +3210,8 @@ long do_update_descriptor(u64 pa, u64 desc) /* Check if the given frame is in use in an unsafe context. */ switch ( page->u.inuse.type_info & PGT_type_mask ) { - case PGT_gdt_page: - if ( unlikely(!get_page_type(page, PGT_gdt_page)) ) - goto out; - break; - case PGT_ldt_page: - if ( unlikely(!get_page_type(page, PGT_ldt_page)) ) + case PGT_seg_desc_page: + if ( unlikely(!get_page_type(page, PGT_seg_desc_page)) ) goto out; break; default: @@ -3316,7 +3311,7 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg) return -EINVAL; } - LOCK_BIGLOCK(d); + domain_lock(d); /* Remove previously mapped page if it was present. */ prev_mfn = gmfn_to_mfn(d, xatp.gpfn); @@ -3338,7 +3333,7 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg) /* Map at new location. 
*/ guest_physmap_add_page(d, xatp.gpfn, mfn); - UNLOCK_BIGLOCK(d); + domain_unlock(d); rcu_unlock_domain(d); @@ -3674,7 +3669,7 @@ int ptwr_do_page_fault(struct vcpu *v, unsigned long addr, struct ptwr_emulate_ctxt ptwr_ctxt; int rc; - LOCK_BIGLOCK(d); + domain_lock(d); /* Attempt to read the PTE that maps the VA being accessed. */ guest_get_eff_l1e(v, addr, &pte); @@ -3699,12 +3694,12 @@ int ptwr_do_page_fault(struct vcpu *v, unsigned long addr, if ( rc == X86EMUL_UNHANDLEABLE ) goto bail; - UNLOCK_BIGLOCK(d); + domain_unlock(d); perfc_incr(ptwr_emulations); return EXCRET_fault_fixed; bail: - UNLOCK_BIGLOCK(d); + domain_unlock(d); return 0; } diff --git a/xen/arch/x86/mm/hap/Makefile b/xen/arch/x86/mm/hap/Makefile index 160e5f36bf..64cb72786e 100644 --- a/xen/arch/x86/mm/hap/Makefile +++ b/xen/arch/x86/mm/hap/Makefile @@ -2,6 +2,7 @@ obj-y += hap.o obj-y += guest_walk_2level.o obj-y += guest_walk_3level.o obj-y += guest_walk_4level.o +obj-y += p2m-ept.o guest_levels = $(subst level,,$(filter %level,$(subst ., ,$(subst _, ,$(1))))) guest_walk_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c index 15cdc23c96..e30acf6948 100644 --- a/xen/arch/x86/mm/hap/hap.c +++ b/xen/arch/x86/mm/hap/hap.c @@ -38,6 +38,7 @@ #include <asm/hap.h> #include <asm/paging.h> #include <asm/domain.h> +#include <xen/numa.h> #include "private.h" @@ -61,7 +62,7 @@ int hap_enable_log_dirty(struct domain *d) hap_unlock(d); /* set l1e entries of P2M table to be read-only. */ - p2m_change_type_global(d, p2m_ram_rw, p2m_ram_logdirty); + p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_logdirty); flush_tlb_mask(d->domain_dirty_cpumask); return 0; } @@ -73,14 +74,14 @@ int hap_disable_log_dirty(struct domain *d) hap_unlock(d); /* set l1e entries of P2M table with normal mode */ - p2m_change_type_global(d, p2m_ram_logdirty, p2m_ram_rw); + p2m_change_entry_type_global(d, p2m_ram_logdirty, p2m_ram_rw); return 0; } void hap_clean_dirty_bitmap(struct domain *d) { /* set l1e entries of P2M table to be read-only. */ - p2m_change_type_global(d, p2m_ram_rw, p2m_ram_logdirty); + p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_logdirty); flush_tlb_mask(d->domain_dirty_cpumask); } @@ -135,7 +136,8 @@ static struct page_info *hap_alloc_p2m_page(struct domain *d) && mfn_x(page_to_mfn(pg)) >= (1UL << (32 - PAGE_SHIFT)) ) { free_domheap_page(pg); - pg = alloc_domheap_pages(NULL, 0, MEMF_bits(32)); + pg = alloc_domheap_page( + NULL, MEMF_bits(32) | MEMF_node(domain_to_node(d))); if ( likely(pg != NULL) ) { void *p = hap_map_domain_page(page_to_mfn(pg)); @@ -199,7 +201,7 @@ hap_set_allocation(struct domain *d, unsigned int pages, int *preempted) if ( d->arch.paging.hap.total_pages < pages ) { /* Need to allocate more memory from domheap */ - pg = alloc_domheap_page(NULL); + pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d))); if ( pg == NULL ) { HAP_PRINTK("failed to allocate hap pages.\n"); diff --git a/xen/arch/x86/mm/hap/p2m-ept.c b/xen/arch/x86/mm/hap/p2m-ept.c new file mode 100644 index 0000000000..697ca4d697 --- /dev/null +++ b/xen/arch/x86/mm/hap/p2m-ept.c @@ -0,0 +1,257 @@ +/* + * ept-p2m.c: use the EPT page table as p2m + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + */ + +#include <xen/config.h> +#include <xen/domain_page.h> +#include <xen/sched.h> +#include <asm/current.h> +#include <asm/types.h> +#include <asm/domain.h> +#include <asm/p2m.h> +#include <asm/hvm/vmx/vmx.h> +#include <xen/iommu.h> + +static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type) +{ + switch(type) + { + case p2m_invalid: + case p2m_mmio_dm: + default: + return; + case p2m_ram_rw: + case p2m_mmio_direct: + entry->r = entry->w = entry->x = 1; + return; + case p2m_ram_logdirty: + case p2m_ram_ro: + entry->r = entry->x = 1; + entry->w = 0; + return; + } +} + +static int ept_next_level(struct domain *d, bool_t read_only, + ept_entry_t **table, unsigned long *gfn_remainder, + u32 shift) +{ + ept_entry_t *ept_entry, *next; + u32 index; + + index = *gfn_remainder >> shift; + *gfn_remainder &= (1UL << shift) - 1; + + ept_entry = (*table) + index; + + if ( !(ept_entry->epte & 0x7) ) + { + struct page_info *pg; + + if ( read_only ) + return 0; + + pg = d->arch.p2m->alloc_page(d); + if ( pg == NULL ) + return 0; + + pg->count_info = 1; + pg->u.inuse.type_info = 1 | PGT_validated; + list_add_tail(&pg->list, &d->arch.p2m->pages); + + ept_entry->emt = 0; + ept_entry->sp_avail = 0; + ept_entry->avail1 = 0; + ept_entry->mfn = page_to_mfn(pg); + ept_entry->rsvd = 0; + ept_entry->avail2 = 0; + /* last step */ + ept_entry->r = ept_entry->w = ept_entry->x = 1; + } + + next = map_domain_page(ept_entry->mfn); + unmap_domain_page(*table); + *table = next; + + return 1; +} + +static int +ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt) +{ + ept_entry_t *table = + map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); + unsigned long gfn_remainder = gfn; + ept_entry_t *ept_entry = NULL; + u32 index; + int i, rv = 0; + + /* Should check if gfn obeys GAW here */ + + for ( i = EPT_DEFAULT_GAW; i > 0; i-- ) + if ( !ept_next_level(d, 0, &table, &gfn_remainder, + i * EPT_TABLE_ORDER) ) + goto out; + + index = gfn_remainder; + ept_entry = table + index; + + if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) ) + { + /* Track the highest gfn for which we have ever had a valid mapping */ + if ( gfn > d->arch.p2m->max_mapped_pfn ) + d->arch.p2m->max_mapped_pfn = gfn; + + ept_entry->emt = EPT_DEFAULT_MT; + ept_entry->sp_avail = 0; + ept_entry->avail1 = p2mt; + ept_entry->mfn = mfn_x(mfn); + ept_entry->rsvd = 0; + ept_entry->avail2 = 0; + /* last step */ + ept_entry->r = ept_entry->w = ept_entry->x = 1; + ept_p2m_type_to_flags(ept_entry, p2mt); + } + else + ept_entry->epte = 0; + + /* Success */ + rv = 1; + + out: + unmap_domain_page(table); + + ept_sync_domain(d); + + /* If p2m table is shared with vtd page-table. 
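ept_next_level() above peels EPT_TABLE_ORDER (9) bits off the gfn per level, exactly like a long-mode page-table walk. A standalone worked example of the index arithmetic for gfn 0x12345 (illustrative value):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const unsigned int order = 9;       /* EPT_TABLE_ORDER */
    uint64_t remainder = 0x12345;       /* gfn */
    int level;

    for ( level = 3; level >= 0; level-- )  /* EPT_DEFAULT_GAW == 3 */
    {
        unsigned int index = remainder >> (level * order);
        remainder &= (1ULL << (level * order)) - 1;
        printf("level %d: index %u\n", level, index);
    }
    /* Prints indices 0, 0, 145, 325 for the four levels. */
    return 0;
}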
*/ + if ( iommu_enabled && is_hvm_domain(d) && (p2mt == p2m_mmio_direct) ) + iommu_flush(d, gfn, (u64*)ept_entry); + + return rv; +} + +/* Read ept p2m entries */ +static mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t) +{ + ept_entry_t *table = + map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); + unsigned long gfn_remainder = gfn; + ept_entry_t *ept_entry; + u32 index; + int i; + mfn_t mfn = _mfn(INVALID_MFN); + + *t = p2m_mmio_dm; + + /* This pfn is higher than the highest the p2m map currently holds */ + if ( gfn > d->arch.p2m->max_mapped_pfn ) + goto out; + + /* Should check if gfn obeys GAW here. */ + + for ( i = EPT_DEFAULT_GAW; i > 0; i-- ) + if ( !ept_next_level(d, 1, &table, &gfn_remainder, + i * EPT_TABLE_ORDER) ) + goto out; + + index = gfn_remainder; + ept_entry = table + index; + + if ( ept_entry->avail1 != p2m_invalid ) + { + *t = ept_entry->avail1; + mfn = _mfn(ept_entry->mfn); + } + + out: + unmap_domain_page(table); + return mfn; +} + +static mfn_t ept_get_entry_current(unsigned long gfn, p2m_type_t *t) +{ + return ept_get_entry(current->domain, gfn, t); +} + +/* Walk the whole p2m table, changing any entries of the old type + * to the new type. This is used in hardware-assisted paging to + * quickly enable or disable log-dirty tracking */ + +static void ept_change_entry_type_global(struct domain *d, + p2m_type_t ot, p2m_type_t nt) +{ + ept_entry_t *l4e, *l3e, *l2e, *l1e; + int i4, i3, i2, i1; + + if ( pagetable_get_pfn(d->arch.phys_table) == 0 ) + return; + + BUG_ON(EPT_DEFAULT_GAW != 3); + + l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); + for ( i4 = 0; i4 < EPT_PAGETABLE_ENTRIES; i4++ ) + { + if ( !l4e[i4].epte || l4e[i4].sp_avail ) + continue; + l3e = map_domain_page(l4e[i4].mfn); + for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ ) + { + if ( !l3e[i3].epte || l3e[i3].sp_avail ) + continue; + l2e = map_domain_page(l3e[i3].mfn); + for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ ) + { + if ( !l2e[i2].epte || l2e[i2].sp_avail ) + continue; + l1e = map_domain_page(l2e[i2].mfn); + for ( i1 = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ ) + { + if ( !l1e[i1].epte ) + continue; + if ( l1e[i1].avail1 != ot ) + continue; + l1e[i1].avail1 = nt; + ept_p2m_type_to_flags(l1e+i1, nt); + } + unmap_domain_page(l1e); + } + unmap_domain_page(l2e); + } + unmap_domain_page(l3e); + } + unmap_domain_page(l4e); + + ept_sync_domain(d); +} + +void ept_p2m_init(struct domain *d) +{ + d->arch.p2m->set_entry = ept_set_entry; + d->arch.p2m->get_entry = ept_get_entry; + d->arch.p2m->get_entry_current = ept_get_entry_current; + d->arch.p2m->change_entry_type_global = ept_change_entry_type_global; +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c index e8298fb3bd..faee13955e 100644 --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -27,6 +27,7 @@ #include <asm/page.h> #include <asm/paging.h> #include <asm/p2m.h> +#include <asm/hvm/vmx/vmx.h> /* ept_p2m_init() */ #include <xen/iommu.h> /* Debugging and auditing of the P2M code? 
*/ @@ -41,36 +42,37 @@ * Locking discipline: always acquire this lock before the shadow or HAP one */ -#define p2m_lock_init(_d) \ - do { \ - spin_lock_init(&(_d)->arch.p2m.lock); \ - (_d)->arch.p2m.locker = -1; \ - (_d)->arch.p2m.locker_function = "nobody"; \ +#define p2m_lock_init(_p2m) \ + do { \ + spin_lock_init(&(_p2m)->lock); \ + (_p2m)->locker = -1; \ + (_p2m)->locker_function = "nobody"; \ } while (0) -#define p2m_lock(_d) \ - do { \ - if ( unlikely((_d)->arch.p2m.locker == current->processor) )\ - { \ - printk("Error: p2m lock held by %s\n", \ - (_d)->arch.p2m.locker_function); \ - BUG(); \ - } \ - spin_lock(&(_d)->arch.p2m.lock); \ - ASSERT((_d)->arch.p2m.locker == -1); \ - (_d)->arch.p2m.locker = current->processor; \ - (_d)->arch.p2m.locker_function = __func__; \ +#define p2m_lock(_p2m) \ + do { \ + if ( unlikely((_p2m)->locker == current->processor) ) \ + { \ + printk("Error: p2m lock held by %s\n", \ + (_p2m)->locker_function); \ + BUG(); \ + } \ + spin_lock(&(_p2m)->lock); \ + ASSERT((_p2m)->locker == -1); \ + (_p2m)->locker = current->processor; \ + (_p2m)->locker_function = __func__; \ } while (0) -#define p2m_unlock(_d) \ - do { \ - ASSERT((_d)->arch.p2m.locker == current->processor); \ - (_d)->arch.p2m.locker = -1; \ - (_d)->arch.p2m.locker_function = "nobody"; \ - spin_unlock(&(_d)->arch.p2m.lock); \ +#define p2m_unlock(_p2m) \ + do { \ + ASSERT((_p2m)->locker == current->processor); \ + (_p2m)->locker = -1; \ + (_p2m)->locker_function = "nobody"; \ + spin_unlock(&(_p2m)->lock); \ } while (0) - +#define p2m_locked_by_me(_p2m) \ + (current->processor == (_p2m)->locker) /* Printouts */ #define P2M_PRINTK(_f, _a...) \ @@ -152,7 +154,7 @@ p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, l1_pgentry_t *p2m_entry; l1_pgentry_t new_entry; void *next; - ASSERT(d->arch.p2m.alloc_page); + ASSERT(d->arch.p2m->alloc_page); if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn, shift, max)) ) @@ -160,10 +162,10 @@ p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) ) { - struct page_info *pg = d->arch.p2m.alloc_page(d); + struct page_info *pg = d->arch.p2m->alloc_page(d); if ( pg == NULL ) return 0; - list_add_tail(&pg->list, &d->arch.p2m.pages); + list_add_tail(&pg->list, &d->arch.p2m->pages); pg->u.inuse.type_info = type | 1 | PGT_validated; pg->count_info = 1; @@ -202,7 +204,7 @@ p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, // Returns 0 on error (out of memory) static int -set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt) +p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt) { // XXX -- this might be able to be faster iff current->domain == d mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table); @@ -244,8 +246,8 @@ set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt) ASSERT(p2m_entry); /* Track the highest gfn for which we have ever had a valid mapping */ - if ( mfn_valid(mfn) && (gfn > d->arch.p2m.max_mapped_pfn) ) - d->arch.p2m.max_mapped_pfn = gfn; + if ( mfn_valid(mfn) && (gfn > d->arch.p2m->max_mapped_pfn) ) + d->arch.p2m->max_mapped_pfn = gfn; if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) ) entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt)); @@ -279,14 +281,170 @@ set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt) return rv; } +static mfn_t +p2m_gfn_to_mfn(struct domain *d, unsigned long gfn, p2m_type_t *t) +{ + mfn_t mfn; + paddr_t addr = 
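The reworked lock macros above implement a spinlock that records its owner, so re-acquisition on the same CPU BUGs out immediately instead of deadlocking silently. The same discipline as plain functions (a sketch, not the macros' exact expansion; spinlock_t, BUG_ON and ASSERT as in Xen):

struct owner_lock {
    spinlock_t lock;
    int owner;                       /* processor id, -1 when free */
};

static void owner_lock_acquire(struct owner_lock *l, int cpu)
{
    BUG_ON(l->owner == cpu);         /* recursion would self-deadlock */
    spin_lock(&l->lock);
    ASSERT(l->owner == -1);
    l->owner = cpu;
}

static void owner_lock_release(struct owner_lock *l, int cpu)
{
    ASSERT(l->owner == cpu);
    l->owner = -1;
    spin_unlock(&l->lock);
}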
((paddr_t)gfn) << PAGE_SHIFT; + l2_pgentry_t *l2e; + l1_pgentry_t *l1e; + + ASSERT(paging_mode_translate(d)); + + /* XXX This is for compatibility with the old model, where anything not + * XXX marked as RAM was considered to be emulated MMIO space. + * XXX Once we start explicitly registering MMIO regions in the p2m + * XXX we will return p2m_invalid for unmapped gfns */ + *t = p2m_mmio_dm; + + mfn = pagetable_get_mfn(d->arch.phys_table); + + if ( gfn > d->arch.p2m->max_mapped_pfn ) + /* This pfn is higher than the highest the p2m map currently holds */ + return _mfn(INVALID_MFN); + +#if CONFIG_PAGING_LEVELS >= 4 + { + l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn)); + l4e += l4_table_offset(addr); + if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 ) + { + unmap_domain_page(l4e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l4e_get_pfn(*l4e)); + unmap_domain_page(l4e); + } +#endif +#if CONFIG_PAGING_LEVELS >= 3 + { + l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn)); +#if CONFIG_PAGING_LEVELS == 3 + /* On PAE hosts the p2m has eight l3 entries, not four (see + * shadow_set_p2m_entry()) so we can't use l3_table_offset. + * Instead, just count the number of l3es from zero. It's safe + * to do this because we already checked that the gfn is within + * the bounds of the p2m. */ + l3e += (addr >> L3_PAGETABLE_SHIFT); +#else + l3e += l3_table_offset(addr); +#endif + if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 ) + { + unmap_domain_page(l3e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l3e_get_pfn(*l3e)); + unmap_domain_page(l3e); + } +#endif + + l2e = map_domain_page(mfn_x(mfn)); + l2e += l2_table_offset(addr); + if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 ) + { + unmap_domain_page(l2e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l2e_get_pfn(*l2e)); + unmap_domain_page(l2e); + + l1e = map_domain_page(mfn_x(mfn)); + l1e += l1_table_offset(addr); + if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 ) + { + unmap_domain_page(l1e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l1e_get_pfn(*l1e)); + *t = p2m_flags_to_type(l1e_get_flags(*l1e)); + unmap_domain_page(l1e); + + ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t)); + return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN); +} + +/* Read the current domain's p2m table (through the linear mapping). */ +static mfn_t p2m_gfn_to_mfn_current(unsigned long gfn, p2m_type_t *t) +{ + mfn_t mfn = _mfn(INVALID_MFN); + p2m_type_t p2mt = p2m_mmio_dm; + /* XXX This is for compatibility with the old model, where anything not + * XXX marked as RAM was considered to be emulated MMIO space. 
+ * XXX Once we start explicitly registering MMIO regions in the p2m + * XXX we will return p2m_invalid for unmapped gfns */ + + if ( gfn <= current->domain->arch.p2m->max_mapped_pfn ) + { + l1_pgentry_t l1e = l1e_empty(); + int ret; + + ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) + / sizeof(l1_pgentry_t)); + + /* Need to __copy_from_user because the p2m is sparse and this + * part might not exist */ + ret = __copy_from_user(&l1e, + &phys_to_machine_mapping[gfn], + sizeof(l1e)); + + if ( ret == 0 ) { + p2mt = p2m_flags_to_type(l1e_get_flags(l1e)); + ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt)); + if ( p2m_is_valid(p2mt) ) + mfn = _mfn(l1e_get_pfn(l1e)); + else + /* XXX see above */ + p2mt = p2m_mmio_dm; + } + } + + *t = p2mt; + return mfn; +} /* Init the datastructures for later use by the p2m code */ -void p2m_init(struct domain *d) +int p2m_init(struct domain *d) { - p2m_lock_init(d); - INIT_LIST_HEAD(&d->arch.p2m.pages); + struct p2m_domain *p2m; + + p2m = xmalloc(struct p2m_domain); + if ( p2m == NULL ) + return -ENOMEM; + + d->arch.p2m = p2m; + + memset(p2m, 0, sizeof(*p2m)); + p2m_lock_init(p2m); + INIT_LIST_HEAD(&p2m->pages); + + p2m->set_entry = p2m_set_entry; + p2m->get_entry = p2m_gfn_to_mfn; + p2m->get_entry_current = p2m_gfn_to_mfn_current; + p2m->change_entry_type_global = p2m_change_type_global; + + if ( is_hvm_domain(d) && d->arch.hvm_domain.hap_enabled && + (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) ) + ept_p2m_init(d); + + return 0; } +void p2m_change_entry_type_global(struct domain *d, + p2m_type_t ot, p2m_type_t nt) +{ + struct p2m_domain *p2m = d->arch.p2m; + + p2m_lock(p2m); + p2m->change_entry_type_global(d, ot, nt); + p2m_unlock(p2m); +} + +static inline +int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt) +{ + return d->arch.p2m->set_entry(d, gfn, mfn, p2mt); +} // Allocate a new p2m table for a domain. 
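With p2m_init() installing per-domain method pointers, callers stop hard-coding the table format: EPT domains get the ept_* methods while everything else keeps the defaults installed above. The dispatch shape, sketched (gfn_to_mfn_sketch is a hypothetical name):

static inline mfn_t gfn_to_mfn_sketch(struct domain *d,
                                      unsigned long gfn, p2m_type_t *t)
{
    return d->arch.p2m->get_entry(d, gfn, t);
}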
// @@ -308,28 +466,29 @@ int p2m_alloc_table(struct domain *d, struct page_info *page, *p2m_top; unsigned int page_count = 0; unsigned long gfn = -1UL; + struct p2m_domain *p2m = d->arch.p2m; - p2m_lock(d); + p2m_lock(p2m); if ( pagetable_get_pfn(d->arch.phys_table) != 0 ) { P2M_ERROR("p2m already allocated for this domain\n"); - p2m_unlock(d); + p2m_unlock(p2m); return -EINVAL; } P2M_PRINTK("allocating p2m table\n"); - d->arch.p2m.alloc_page = alloc_page; - d->arch.p2m.free_page = free_page; + p2m->alloc_page = alloc_page; + p2m->free_page = free_page; - p2m_top = d->arch.p2m.alloc_page(d); + p2m_top = p2m->alloc_page(d); if ( p2m_top == NULL ) { - p2m_unlock(d); + p2m_unlock(p2m); return -ENOMEM; } - list_add_tail(&p2m_top->list, &d->arch.p2m.pages); + list_add_tail(&p2m_top->list, &p2m->pages); p2m_top->count_info = 1; p2m_top->u.inuse.type_info = @@ -376,13 +535,13 @@ int p2m_alloc_table(struct domain *d, #endif P2M_PRINTK("p2m table initialised (%u pages)\n", page_count); - p2m_unlock(d); + p2m_unlock(p2m); return 0; error: P2M_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" PRI_mfn "\n", gfn, mfn_x(mfn)); - p2m_unlock(d); + p2m_unlock(p2m); return -ENOMEM; } @@ -392,101 +551,24 @@ void p2m_teardown(struct domain *d) { struct list_head *entry, *n; struct page_info *pg; + struct p2m_domain *p2m = d->arch.p2m; - p2m_lock(d); + p2m_lock(p2m); d->arch.phys_table = pagetable_null(); - list_for_each_safe(entry, n, &d->arch.p2m.pages) + list_for_each_safe(entry, n, &p2m->pages) { pg = list_entry(entry, struct page_info, list); list_del(entry); - d->arch.p2m.free_page(d, pg); + p2m->free_page(d, pg); } - p2m_unlock(d); + p2m_unlock(p2m); } -mfn_t -gfn_to_mfn_foreign(struct domain *d, unsigned long gfn, p2m_type_t *t) -/* Read another domain's p2m entries */ +void p2m_final_teardown(struct domain *d) { - mfn_t mfn; - paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT; - l2_pgentry_t *l2e; - l1_pgentry_t *l1e; - - ASSERT(paging_mode_translate(d)); - - /* XXX This is for compatibility with the old model, where anything not - * XXX marked as RAM was considered to be emulated MMIO space. - * XXX Once we start explicitly registering MMIO regions in the p2m - * XXX we will return p2m_invalid for unmapped gfns */ - *t = p2m_mmio_dm; - - mfn = pagetable_get_mfn(d->arch.phys_table); - - if ( gfn > d->arch.p2m.max_mapped_pfn ) - /* This pfn is higher than the highest the p2m map currently holds */ - return _mfn(INVALID_MFN); - -#if CONFIG_PAGING_LEVELS >= 4 - { - l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn)); - l4e += l4_table_offset(addr); - if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 ) - { - unmap_domain_page(l4e); - return _mfn(INVALID_MFN); - } - mfn = _mfn(l4e_get_pfn(*l4e)); - unmap_domain_page(l4e); - } -#endif -#if CONFIG_PAGING_LEVELS >= 3 - { - l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn)); -#if CONFIG_PAGING_LEVELS == 3 - /* On PAE hosts the p2m has eight l3 entries, not four (see - * shadow_set_p2m_entry()) so we can't use l3_table_offset. - * Instead, just count the number of l3es from zero. It's safe - * to do this because we already checked that the gfn is within - * the bounds of the p2m. 
*/ - l3e += (addr >> L3_PAGETABLE_SHIFT); -#else - l3e += l3_table_offset(addr); -#endif - if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 ) - { - unmap_domain_page(l3e); - return _mfn(INVALID_MFN); - } - mfn = _mfn(l3e_get_pfn(*l3e)); - unmap_domain_page(l3e); - } -#endif - - l2e = map_domain_page(mfn_x(mfn)); - l2e += l2_table_offset(addr); - if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 ) - { - unmap_domain_page(l2e); - return _mfn(INVALID_MFN); - } - mfn = _mfn(l2e_get_pfn(*l2e)); - unmap_domain_page(l2e); - - l1e = map_domain_page(mfn_x(mfn)); - l1e += l1_table_offset(addr); - if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 ) - { - unmap_domain_page(l1e); - return _mfn(INVALID_MFN); - } - mfn = _mfn(l1e_get_pfn(*l1e)); - *t = p2m_flags_to_type(l1e_get_flags(*l1e)); - unmap_domain_page(l1e); - - ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t)); - return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN); + xfree(d->arch.p2m); + d->arch.p2m = NULL; } #if P2M_AUDIT @@ -564,7 +646,7 @@ static void audit_p2m(struct domain *d) set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); } - if ( test_linear && (gfn <= d->arch.p2m.max_mapped_pfn) ) + if ( test_linear && (gfn <= d->arch.p2m->max_mapped_pfn) ) { lp2mfn = mfn_x(gfn_to_mfn_current(gfn, &type)); if ( lp2mfn != mfn_x(p2mfn) ) @@ -695,11 +777,11 @@ void guest_physmap_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn) { - p2m_lock(d); + p2m_lock(d->arch.p2m); audit_p2m(d); p2m_remove_page(d, gfn, mfn); audit_p2m(d); - p2m_unlock(d); + p2m_unlock(d->arch.p2m); } int @@ -722,7 +804,7 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn, */ if ( paging_mode_hap(d) && (gfn > 0xfffffUL) ) { - if ( !test_and_set_bool(d->arch.hvm_domain.amd_npt_4gb_warning) ) + if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) ) dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond" " 4GB: specify 'hap=0' domain config option.\n", d->domain_id); @@ -730,7 +812,7 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn, } #endif - p2m_lock(d); + p2m_lock(d->arch.p2m); audit_p2m(d); P2M_DEBUG("adding gfn=%#lx mfn=%#lx\n", gfn, mfn); @@ -781,7 +863,7 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn, } audit_p2m(d); - p2m_unlock(d); + p2m_unlock(d->arch.p2m); return rc; } @@ -812,7 +894,7 @@ void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt) if ( pagetable_get_pfn(d->arch.phys_table) == 0 ) return; - p2m_lock(d); + ASSERT(p2m_locked_by_me(d->arch.p2m)); #if CONFIG_PAGING_LEVELS == 4 l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); @@ -860,7 +942,7 @@ void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt) mfn = l1e_get_pfn(l1e[i1]); gfn = get_gpfn_from_mfn(mfn); /* create a new 1le entry with the new type */ - flags = p2m_flags_to_type(nt); + flags = p2m_type_to_flags(nt); l1e_content = l1e_from_pfn(mfn, flags); paging_write_p2m_entry(d, gfn, &l1e[i1], l1mfn, l1e_content, 1); @@ -884,7 +966,6 @@ void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt) unmap_domain_page(l2e); #endif - p2m_unlock(d); } /* Modify the p2m type of a single gfn from ot to nt, returning the @@ -895,13 +976,13 @@ p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn, p2m_type_t pt; mfn_t mfn; - p2m_lock(d); + p2m_lock(d->arch.p2m); mfn = gfn_to_mfn(d, gfn, &pt); if ( pt == ot ) set_p2m_entry(d, gfn, mfn, nt); - p2m_unlock(d); + p2m_unlock(d->arch.p2m); return pt; } diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c index 
e6c3cbb9e6..2247d8dd68 100644 --- a/xen/arch/x86/mm/paging.c +++ b/xen/arch/x86/mm/paging.c @@ -26,6 +26,7 @@ #include <asm/p2m.h> #include <asm/hap.h> #include <asm/guest_access.h> +#include <xen/numa.h> #include <xsm/xsm.h> #define hap_enabled(d) (is_hvm_domain(d) && (d)->arch.hvm_domain.hap_enabled) @@ -99,8 +100,9 @@ static mfn_t paging_new_log_dirty_page(struct domain *d, void **mapping_p) { mfn_t mfn; - struct page_info *page = alloc_domheap_page(NULL); + struct page_info *page; + page = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d))); if ( unlikely(page == NULL) ) { d->arch.paging.log_dirty.failed_allocs++; @@ -482,9 +484,12 @@ void paging_log_dirty_teardown(struct domain*d) /* CODE FOR PAGING SUPPORT */ /************************************************/ /* Domain paging struct initialization. */ -void paging_domain_init(struct domain *d) +int paging_domain_init(struct domain *d) { - p2m_init(d); + int rc; + + if ( (rc = p2m_init(d)) != 0 ) + return rc; /* The order of the *_init calls below is important, as the later * ones may rewrite some common fields. Shadow pagetables are the @@ -494,6 +499,8 @@ void paging_domain_init(struct domain *d) /* ... but we will use hardware assistance if it's available. */ if ( hap_enabled(d) ) hap_domain_init(d); + + return 0; } /* vcpu paging struct initialization goes here */ @@ -587,6 +594,8 @@ void paging_final_teardown(struct domain *d) hap_final_teardown(d); else shadow_final_teardown(d); + + p2m_final_teardown(d); } /* Enable an arbitrary paging-assistance mode. Call once at domain diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c index e4a04bb456..d7239cde77 100644 --- a/xen/arch/x86/mm/shadow/common.c +++ b/xen/arch/x86/mm/shadow/common.c @@ -36,6 +36,7 @@ #include <asm/current.h> #include <asm/flushtlb.h> #include <asm/shadow.h> +#include <xen/numa.h> #include "private.h" @@ -1249,7 +1250,7 @@ static unsigned int sh_set_allocation(struct domain *d, { /* Need to allocate more memory from domheap */ sp = (struct shadow_page_info *) - alloc_domheap_pages(NULL, order, 0); + alloc_domheap_pages(NULL, order, MEMF_node(domain_to_node(d))); if ( sp == NULL ) { SHADOW_PRINTK("failed to allocate shadow pages.\n"); @@ -2171,13 +2172,12 @@ void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all) #undef DO_UNSHADOW /* If that didn't catch the shadows, something is wrong */ - if ( !fast && (pg->count_info & PGC_page_table) ) + if ( !fast && all && (pg->count_info & PGC_page_table) ) { SHADOW_ERROR("can't find all shadows of mfn %05lx " "(shadow_flags=%08lx)\n", mfn_x(gmfn), pg->shadow_flags); - if ( all ) - domain_crash(v->domain); + domain_crash(v->domain); } /* Need to flush TLBs now, so that linear maps are safe next time we diff --git a/xen/arch/x86/pci.c b/xen/arch/x86/pci.c new file mode 100644 index 0000000000..341457b4bc --- /dev/null +++ b/xen/arch/x86/pci.c @@ -0,0 +1,118 @@ +/****************************************************************************** + * pci.c + * + * PCI access functions. 
+ */ + +#include <xen/config.h> +#include <xen/pci.h> +#include <xen/spinlock.h> +#include <asm/io.h> + +#define PCI_CONF_ADDRESS(bus, dev, func, reg) \ + (0x80000000 | (bus << 16) | (dev << 11) | (func << 8) | (reg & ~3)) + +static DEFINE_SPINLOCK(pci_config_lock); + +uint32_t pci_conf_read(uint32_t cf8, uint8_t offset, uint8_t bytes) +{ + unsigned long flags; + uint32_t value; + + BUG_ON((offset + bytes) > 4); + + spin_lock_irqsave(&pci_config_lock, flags); + + outl(cf8, 0xcf8); + + switch ( bytes ) + { + case 1: + value = inb(0xcfc + offset); + break; + case 2: + value = inw(0xcfc + offset); + break; + case 4: + value = inl(0xcfc + offset); + break; + default: + value = 0; + BUG(); + } + + spin_unlock_irqrestore(&pci_config_lock, flags); + + return value; +} + +void pci_conf_write(uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data) +{ + unsigned long flags; + + BUG_ON((offset + bytes) > 4); + + spin_lock_irqsave(&pci_config_lock, flags); + + outl(cf8, 0xcf8); + + switch ( bytes ) + { + case 1: + outb((uint8_t)data, 0xcfc + offset); + break; + case 2: + outw((uint16_t)data, 0xcfc + offset); + break; + case 4: + outl(data, 0xcfc + offset); + break; + } + + spin_unlock_irqrestore(&pci_config_lock, flags); +} + +uint8_t pci_conf_read8( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + return pci_conf_read(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 3, 1); +} + +uint16_t pci_conf_read16( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + return pci_conf_read(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 2, 2); +} + +uint32_t pci_conf_read32( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + return pci_conf_read(PCI_CONF_ADDRESS(bus, dev, func, reg), 0, 4); +} + +void pci_conf_write8( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, + uint8_t data) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 3, 1, data); +} + +void pci_conf_write16( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, + uint16_t data) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 2, 2, data); +} + +void pci_conf_write32( + unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, + uint32_t data) +{ + BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255)); + pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), 0, 4, data); +} diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index 383a868225..9b025b51b1 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -861,6 +861,8 @@ void __init __start_xen(unsigned long mbi_p) early_boot = 0; + softirq_init(); + early_cpu_init(); paging_init(); diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c index 57135940bf..ccefc50cf2 100644 --- a/xen/arch/x86/time.c +++ b/xen/arch/x86/time.c @@ -40,7 +40,7 @@ string_param("clocksource", opt_clocksource); unsigned long cpu_khz; /* CPU clock frequency in kHz. */ unsigned long hpet_address; DEFINE_SPINLOCK(rtc_lock); -volatile unsigned long jiffies; +unsigned long pit0_ticks; static u32 wc_sec, wc_nsec; /* UTC time at last 'time update'. 
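PCI_CONF_ADDRESS() above is configuration mechanism #1: the enable bit (bit 31) plus bus/device/function/register packed below it, with the register's low two bits selecting the byte lane on the 0xcfc data port. A standalone worked example:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    unsigned int bus = 0, dev = 0x18, func = 3, reg = 0x87;
    uint32_t cf8 = 0x80000000u | (bus << 16) | (dev << 11)
                   | (func << 8) | (reg & ~3u);

    /* Prints CF8 = 0x8000c384, data port = 0xcff. */
    printf("CF8 = 0x%08x, data port = 0x%x\n", cf8, 0xcfc + (reg & 3));
    return 0;
}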
*/ static DEFINE_SPINLOCK(wc_lock); @@ -67,19 +67,16 @@ struct platform_timesource { static DEFINE_PER_CPU(struct cpu_time, cpu_time); /* - * Protected by platform_timer_lock, which must be acquired with interrupts - * disabled because plt_overflow() is called from PIT ch0 interrupt context. - */ -static s_time_t stime_platform_stamp; -static u64 platform_timer_stamp; -static DEFINE_SPINLOCK(platform_timer_lock); - -/* - * Folding platform timer into 64-bit software counter is a really critical - * operation! We therefore do it directly in PIT ch0 interrupt handler. + * We simulate a 32-bit platform timer from the 16-bit PIT ch2 counter. + * Otherwise overflow happens too quickly (~50ms) for us to guarantee that + * softirq handling will happen in time. + * + * The pit_lock protects the 16- and 32-bit stamp fields as well as the hardware counter reads. */ -static u32 plt_overflow_jiffies; -static void plt_overflow(void); +static DEFINE_SPINLOCK(pit_lock); +static u16 pit_stamp16; +static u32 pit_stamp32; +static int using_pit; /* * 32-bit division of integer dividend and integer divisor yielding @@ -146,22 +143,36 @@ static inline u64 scale_delta(u64 delta, struct time_scale *scale) return product; } -void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs) +static void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs) { ASSERT(local_irq_is_enabled()); - /* Update jiffies counter. */ - (*(volatile unsigned long *)&jiffies)++; + /* Only for start-of-day interrupt tests in io_apic.c. */ + (*(volatile unsigned long *)&pit0_ticks)++; /* Rough hack to allow accurate timers to sort-of-work with no APIC. */ if ( !cpu_has_apic ) raise_softirq(TIMER_SOFTIRQ); - if ( --plt_overflow_jiffies == 0 ) - plt_overflow(); + /* Emulate a 32-bit PIT counter. */ + if ( using_pit ) + { + u16 count; + + spin_lock_irq(&pit_lock); + + outb(0x80, PIT_MODE); + count = inb(PIT_CH2); + count |= inb(PIT_CH2) << 8; + + pit_stamp32 += (u16)(pit_stamp16 - count); + pit_stamp16 = count; + + spin_unlock_irq(&pit_lock); + } } -static struct irqaction irq0 = { timer_interrupt, "timer", NULL}; +static struct irqaction irq0 = { timer_interrupt, "timer", NULL }; /* ------ Calibrate the TSC ------- * Return processor ticks per second / CALIBRATE_FRAC. 
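The stamp arithmetic above is the standard trick for widening a narrow down-counter: PIT channel 2 counts down, so the ticks elapsed since the last sample are (u16)(pit_stamp16 - count), and accumulating that difference into pit_stamp32 yields a free-running 32-bit counter as long as timer_interrupt() samples the hardware at least once per 16-bit wrap (about 55ms at the PIT's 1.193182 MHz, matching the "~50ms" note above). A minimal self-contained sketch of the same arithmetic, with illustrative names rather than Xen's:

    #include <assert.h>
    #include <stdint.h>

    static uint16_t stamp16;   /* last latched hardware value */
    static uint32_t stamp32;   /* simulated wide counter */

    static void fold(uint16_t hw)
    {
        /* Down-counter: elapsed ticks = old - new, modulo 2^16. */
        stamp32 += (uint16_t)(stamp16 - hw);
        stamp16 = hw;
    }

    int main(void)
    {
        stamp16 = 0x0010;
        fold(0xfff0);            /* counter ran through zero and wrapped */
        assert(stamp32 == 0x20); /* 0x10 ticks to zero, 0x10 more after */
        return 0;
    }

read_pit_count() below applies the same arithmetic on demand (without advancing the stamps), which is why init_pit() can advertise 32 counter_bits to the platform-timer code even though the hardware latch is only 16 bits wide.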
@@ -295,12 +306,21 @@ static char *freq_string(u64 freq) static u32 read_pit_count(void) { - u16 count; - ASSERT(spin_is_locked(&platform_timer_lock)); + u16 count16; + u32 count32; + unsigned long flags; + + spin_lock_irqsave(&pit_lock, flags); + outb(0x80, PIT_MODE); - count = inb(PIT_CH2); - count |= inb(PIT_CH2) << 8; - return ~count; + count16 = inb(PIT_CH2); + count16 |= inb(PIT_CH2) << 8; + + count32 = pit_stamp32 + (u16)(pit_stamp16 - count16); + + spin_unlock_irqrestore(&pit_lock, flags); + + return count32; } static void init_pit(struct platform_timesource *pts) @@ -308,7 +328,8 @@ static void init_pit(struct platform_timesource *pts) pts->name = "PIT"; pts->frequency = CLOCK_TICK_RATE; pts->read_counter = read_pit_count; - pts->counter_bits = 16; + pts->counter_bits = 32; + using_pit = 1; } /************************************************************ @@ -466,24 +487,28 @@ static int init_pmtimer(struct platform_timesource *pts) static struct platform_timesource plt_src; /* details of chosen timesource */ static u32 plt_mask; /* hardware-width mask */ -static u32 plt_overflow_period; /* jiffies between calls to plt_overflow() */ +static u64 plt_overflow_period; /* ns between calls to plt_overflow() */ static struct time_scale plt_scale; /* scale: platform counter -> nanosecs */ /* Protected by platform_timer_lock. */ -static u64 plt_count64; /* 64-bit platform counter stamp */ -static u32 plt_count; /* hardware-width platform counter stamp */ +static DEFINE_SPINLOCK(platform_timer_lock); +static s_time_t stime_platform_stamp; /* System time at below platform time */ +static u64 platform_timer_stamp; /* Platform time at above system time */ +static u64 plt_stamp64; /* 64-bit platform counter stamp */ +static u32 plt_stamp; /* hardware-width platform counter stamp */ +static struct timer plt_overflow_timer; -static void plt_overflow(void) +static void plt_overflow(void *unused) { u32 count; - unsigned long flags; - spin_lock_irqsave(&platform_timer_lock, flags); + spin_lock(&platform_timer_lock); count = plt_src.read_counter(); - plt_count64 += (count - plt_count) & plt_mask; - plt_count = count; - plt_overflow_jiffies = plt_overflow_period; - spin_unlock_irqrestore(&platform_timer_lock, flags); + plt_stamp64 += (count - plt_stamp) & plt_mask; + plt_stamp = count; + spin_unlock(&platform_timer_lock); + + set_timer(&plt_overflow_timer, NOW() + plt_overflow_period); } static s_time_t __read_platform_stime(u64 platform_time) @@ -497,12 +522,11 @@ static s_time_t read_platform_stime(void) { u64 count; s_time_t stime; - unsigned long flags; - spin_lock_irqsave(&platform_timer_lock, flags); - count = plt_count64 + ((plt_src.read_counter() - plt_count) & plt_mask); + spin_lock(&platform_timer_lock); + count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask); stime = __read_platform_stime(count); - spin_unlock_irqrestore(&platform_timer_lock, flags); + spin_unlock(&platform_timer_lock); return stime; } @@ -511,27 +535,25 @@ static void platform_time_calibration(void) { u64 count; s_time_t stamp; - unsigned long flags; - spin_lock_irqsave(&platform_timer_lock, flags); - count = plt_count64 + ((plt_src.read_counter() - plt_count) & plt_mask); + spin_lock(&platform_timer_lock); + count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask); stamp = __read_platform_stime(count); stime_platform_stamp = stamp; platform_timer_stamp = count; - spin_unlock_irqrestore(&platform_timer_lock, flags); + spin_unlock(&platform_timer_lock); } static void 
resume_platform_timer(void) { /* No change in platform_stime across suspend/resume. */ - platform_timer_stamp = plt_count64; - plt_count = plt_src.read_counter(); + platform_timer_stamp = plt_stamp64; + plt_stamp = plt_src.read_counter(); } static void init_platform_timer(void) { struct platform_timesource *pts = &plt_src; - u64 overflow_period; int rc = -1; if ( opt_clocksource[0] != '\0' ) @@ -561,13 +583,12 @@ static void init_platform_timer(void) set_time_scale(&plt_scale, pts->frequency); - overflow_period = scale_delta(1ull << (pts->counter_bits-1), &plt_scale); - do_div(overflow_period, MILLISECS(1000/HZ)); - plt_overflow_period = overflow_period; - plt_overflow(); - printk("Platform timer overflows in %d jiffies.\n", plt_overflow_period); + plt_overflow_period = scale_delta( + 1ull << (pts->counter_bits-1), &plt_scale); + init_timer(&plt_overflow_timer, plt_overflow, NULL, 0); + plt_overflow(NULL); - platform_timer_stamp = plt_count64; + platform_timer_stamp = plt_stamp64; printk("Platform timer is %s %s\n", freq_string(pts->frequency), pts->name); @@ -969,6 +990,19 @@ void __init early_time_init(void) setup_irq(0, &irq0); } +static int __init disable_pit_irq(void) +{ + if ( !using_pit && cpu_has_apic ) + { + /* Disable PIT CH0 timer interrupt. */ + outb_p(0x30, PIT_MODE); + outb_p(0, PIT_CH0); + outb_p(0, PIT_CH0); + } + return 0; +} +__initcall(disable_pit_irq); + void send_timer_event(struct vcpu *v) { send_guest_vcpu_virq(v, VIRQ_TIMER); @@ -1002,6 +1036,8 @@ int time_resume(void) { u64 tmp = init_pit_and_calibrate_tsc(); + disable_pit_irq(); + set_time_scale(&this_cpu(cpu_time).tsc_scale, tmp); resume_platform_timer(); @@ -1019,7 +1055,7 @@ int time_resume(void) int dom0_pit_access(struct ioreq *ioreq) { /* Is Xen using Channel 2? Then disallow direct dom0 access. */ - if ( plt_src.read_counter == read_pit_count ) + if ( using_pit ) return 0; switch ( ioreq->addr ) diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index 019e3e56cf..5e39c9b417 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -1305,23 +1305,24 @@ static int read_gate_descriptor(unsigned int gate_sel, const struct desc_struct *pdesc; - pdesc = (const struct desc_struct *)(!(gate_sel & 4) ? - GDT_VIRT_START(v) : - LDT_VIRT_START(v)) - + (gate_sel >> 3); - if ( gate_sel < 4 || - (gate_sel >= FIRST_RESERVED_GDT_BYTE && !(gate_sel & 4)) || + pdesc = (const struct desc_struct *) + (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v)) + + (gate_sel >> 3); + if ( (gate_sel < 4) || + ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) || __get_user(desc, pdesc) ) return 0; *sel = (desc.a >> 16) & 0x0000fffc; *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000); *ar = desc.b & 0x0000ffff; + /* * check_descriptor() clears the DPL field and stores the * guest requested DPL in the selector's RPL field. */ - ASSERT(!(*ar & _SEGMENT_DPL)); + if ( *ar & _SEGMENT_DPL ) + return 0; *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL; if ( !is_pv_32bit_vcpu(v) ) @@ -1352,7 +1353,7 @@ static int read_gate_descriptor(unsigned int gate_sel, #endif /* Has the guest requested sufficient permission for this I/O access? */ -static inline int guest_io_okay( +static int guest_io_okay( unsigned int port, unsigned int bytes, struct vcpu *v, struct cpu_user_regs *regs) { @@ -1394,19 +1395,130 @@ static inline int guest_io_okay( } /* Has the administrator granted sufficient permission for this I/O access? 
*/ -static inline int admin_io_okay( +static int admin_io_okay( unsigned int port, unsigned int bytes, struct vcpu *v, struct cpu_user_regs *regs) { return ioports_access_permitted(v->domain, port, port + bytes - 1); } -#define guest_inb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r) -#define guest_inw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r) -#define guest_inl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r) -#define guest_outb_okay(_p, _d, _r) admin_io_okay(_p, 1, _d, _r) -#define guest_outw_okay(_p, _d, _r) admin_io_okay(_p, 2, _d, _r) -#define guest_outl_okay(_p, _d, _r) admin_io_okay(_p, 4, _d, _r) +static uint32_t guest_io_read( + unsigned int port, unsigned int bytes, + struct vcpu *v, struct cpu_user_regs *regs) +{ + extern uint32_t pci_conf_read( + uint32_t cf8, uint8_t offset, uint8_t bytes); + + uint32_t data = 0; + unsigned int shift = 0; + + if ( admin_io_okay(port, bytes, v, regs) ) + { + switch ( bytes ) + { + case 1: return inb(port); + case 2: return inw(port); + case 4: return inl(port); + } + } + + while ( bytes != 0 ) + { + unsigned int size = 1; + uint32_t sub_data = 0xff; + + if ( (port == 0x42) || (port == 0x43) || (port == 0x61) ) + { + sub_data = pv_pit_handler(port, 0, 0); + } + else if ( (port & 0xfffc) == 0xcf8 ) + { + size = min(bytes, 4 - (port & 3)); + sub_data = v->domain->arch.pci_cf8 >> ((port & 3) * 8); + } + else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) ) + { + size = min(bytes, 4 - (port & 3)); + if ( size == 3 ) + size = 2; + sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size); + } + + if ( size == 4 ) + return sub_data; + + data |= (sub_data & ((1u << (size * 8)) - 1)) << shift; + shift += size * 8; + port += size; + bytes -= size; + } + + return data; +} + +static void guest_io_write( + unsigned int port, unsigned int bytes, uint32_t data, + struct vcpu *v, struct cpu_user_regs *regs) +{ + extern void pci_conf_write( + uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data); + + if ( admin_io_okay(port, bytes, v, regs) ) + { + switch ( bytes ) { + case 1: + outb((uint8_t)data, port); + if ( pv_post_outb_hook ) + pv_post_outb_hook(port, (uint8_t)data); + break; + case 2: + outw((uint16_t)data, port); + break; + case 4: + outl(data, port); + break; + } + return; + } + + while ( bytes != 0 ) + { + unsigned int size = 1; + + if ( (port == 0x42) || (port == 0x43) || (port == 0x61) ) + { + pv_pit_handler(port, (uint8_t)data, 1); + } + else if ( (port & 0xfffc) == 0xcf8 ) + { + size = min(bytes, 4 - (port & 3)); + if ( size == 4 ) + { + v->domain->arch.pci_cf8 = data; + } + else + { + uint32_t mask = ((1u << (size * 8)) - 1) << ((port & 3) * 8); + v->domain->arch.pci_cf8 &= ~mask; + v->domain->arch.pci_cf8 |= (data << ((port & 3) * 8)) & mask; + } + } + else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) ) + { + size = min(bytes, 4 - (port & 3)); + if ( size == 3 ) + size = 2; + pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data); + } + + if ( size == 4 ) + return; + + port += size; + bytes -= size; + data >>= size * 8; + } +} /* I/O emulation support. Helper routines for, and type of, the stack stub.*/ void host_to_guest_gpr_switch(struct cpu_user_regs *) @@ -1525,7 +1637,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) /* REX prefix. */ if ( rex & 8 ) /* REX.W */ - op_bytes = 4; /* emulating only opcodes not supporting 64-bit operands */ + op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */ modrm_reg = (rex & 4) << 1; /* REX.R */ /* REX.X does not need to be decoded. 
*/ modrm_rm = (rex & 1) << 3; /* REX.B */ @@ -1554,7 +1666,8 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) { if ( !read_descriptor(data_sel, v, regs, &data_base, &data_limit, &ar, - _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) ) + _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL| + _SEGMENT_P) ) goto fail; if ( !(ar & _SEGMENT_S) || !(ar & _SEGMENT_P) || @@ -1601,69 +1714,39 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) case 0x6c: /* INSB */ op_bytes = 1; case 0x6d: /* INSW/INSL */ - if ( data_limit < op_bytes - 1 || - rd_ad(edi) > data_limit - (op_bytes - 1) || + if ( (data_limit < (op_bytes - 1)) || + (rd_ad(edi) > (data_limit - (op_bytes - 1))) || !guest_io_okay(port, op_bytes, v, regs) ) goto fail; - switch ( op_bytes ) - { - case 1: - /* emulate PIT counter 2 */ - data = (u8)(guest_inb_okay(port, v, regs) ? inb(port) : - ((port == 0x42 || port == 0x43 || port == 0x61) ? - pv_pit_handler(port, 0, 0) : ~0)); - break; - case 2: - data = (u16)(guest_inw_okay(port, v, regs) ? inw(port) : ~0); - break; - case 4: - data = (u32)(guest_inl_okay(port, v, regs) ? inl(port) : ~0); - break; - } - if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 ) + data = guest_io_read(port, op_bytes, v, regs); + if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), + &data, op_bytes)) != 0 ) { propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc, PFEC_write_access); return EXCRET_fault_fixed; } - wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes)); + wr_ad(edi, regs->edi + (int)((regs->eflags & EF_DF) + ? -op_bytes : op_bytes)); break; case 0x6e: /* OUTSB */ op_bytes = 1; case 0x6f: /* OUTSW/OUTSL */ - if ( data_limit < op_bytes - 1 || - rd_ad(esi) > data_limit - (op_bytes - 1) || - !guest_io_okay(port, op_bytes, v, regs) ) + if ( (data_limit < (op_bytes - 1)) || + (rd_ad(esi) > (data_limit - (op_bytes - 1))) || + !guest_io_okay(port, op_bytes, v, regs) ) goto fail; - rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes); - if ( rc != 0 ) + if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), + op_bytes)) != 0 ) { - propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0); + propagate_page_fault(data_base + rd_ad(esi) + + op_bytes - rc, 0); return EXCRET_fault_fixed; } - switch ( op_bytes ) - { - case 1: - if ( guest_outb_okay(port, v, regs) ) - { - outb((u8)data, port); - if ( pv_post_outb_hook ) - pv_post_outb_hook(port, data); - } - else if ( port == 0x42 || port == 0x43 || port == 0x61 ) - pv_pit_handler(port, data, 1); - break; - case 2: - if ( guest_outw_okay(port, v, regs) ) - outw((u16)data, port); - break; - case 4: - if ( guest_outl_okay(port, v, regs) ) - outl((u32)data, port); - break; - } - wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) ? -op_bytes : op_bytes)); + guest_io_write(port, op_bytes, data, v, regs); + wr_ad(esi, regs->esi + (int)((regs->eflags & EF_DF) + ? 
-op_bytes : op_bytes)); break; } @@ -1727,31 +1810,17 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) exec_in: if ( !guest_io_okay(port, op_bytes, v, regs) ) goto fail; - switch ( op_bytes ) + if ( admin_io_okay(port, op_bytes, v, regs) ) { - case 1: - if ( guest_inb_okay(port, v, regs) ) - io_emul(regs); - else if ( port == 0x42 || port == 0x43 || port == 0x61 ) - { - regs->eax &= ~0xffUL; - regs->eax |= pv_pit_handler(port, 0, 0); - } - else - regs->eax |= (u8)~0; - break; - case 2: - if ( guest_inw_okay(port, v, regs) ) - io_emul(regs); - else - regs->eax |= (u16)~0; - break; - case 4: - if ( guest_inl_okay(port, v, regs) ) - io_emul(regs); + io_emul(regs); + } + else + { + if ( op_bytes == 4 ) + regs->eax = 0; else - regs->eax = (u32)~0; - break; + regs->eax &= ~((1u << (op_bytes * 8)) - 1); + regs->eax |= guest_io_read(port, op_bytes, v, regs); } bpmatch = check_guest_io_breakpoint(v, port, op_bytes); goto done; @@ -1770,26 +1839,15 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) exec_out: if ( !guest_io_okay(port, op_bytes, v, regs) ) goto fail; - switch ( op_bytes ) + if ( admin_io_okay(port, op_bytes, v, regs) ) { - case 1: - if ( guest_outb_okay(port, v, regs) ) - { - io_emul(regs); - if ( pv_post_outb_hook ) - pv_post_outb_hook(port, regs->eax); - } - else if ( port == 0x42 || port == 0x43 || port == 0x61 ) - pv_pit_handler(port, regs->eax, 1); - break; - case 2: - if ( guest_outw_okay(port, v, regs) ) - io_emul(regs); - break; - case 4: - if ( guest_outl_okay(port, v, regs) ) - io_emul(regs); - break; + io_emul(regs); + if ( (op_bytes == 1) && pv_post_outb_hook ) + pv_post_outb_hook(port, regs->eax); + } + else + { + guest_io_write(port, op_bytes, regs->eax, v, regs); } bpmatch = check_guest_io_breakpoint(v, port, op_bytes); goto done; @@ -1921,14 +1979,14 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) break; case 3: /* Write CR3 */ - LOCK_BIGLOCK(v->domain); + domain_lock(v->domain); if ( !is_pv_32on64_vcpu(v) ) rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg))); #ifdef CONFIG_COMPAT else rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg))); #endif - UNLOCK_BIGLOCK(v->domain); + domain_unlock(v->domain); if ( rc == 0 ) /* not okay */ goto fail; break; @@ -2137,8 +2195,8 @@ static void emulate_gate_op(struct cpu_user_regs *regs) /* Check whether this fault is due to the use of a call gate. 
*/ if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) || - ((ar >> 13) & 3) < (regs->cs & 3) || - (ar & _SEGMENT_TYPE) != 0xc00 ) + (((ar >> 13) & 3) < (regs->cs & 3)) || + ((ar & _SEGMENT_TYPE) != 0xc00) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; @@ -2232,15 +2290,18 @@ static void emulate_gate_op(struct cpu_user_regs *regs) { if ( (modrm & 7) == 4 ) { - unsigned int sib = insn_fetch(u8, base, eip, limit); + unsigned int sib; + sib = insn_fetch(u8, base, eip, limit); modrm = (modrm & ~7) | (sib & 7); if ( (sib >>= 3) != 4 ) - opnd_off = *(unsigned long *)decode_register(sib & 7, regs, 0); + opnd_off = *(unsigned long *) + decode_register(sib & 7, regs, 0); opnd_off <<= sib >> 3; } if ( (modrm & 7) != 5 || (modrm & 0xc0) ) - opnd_off += *(unsigned long *)decode_register(modrm & 7, regs, 0); + opnd_off += *(unsigned long *) + decode_register(modrm & 7, regs, 0); else modrm |= 0x87; if ( !opnd_sel ) @@ -2576,12 +2637,14 @@ asmlinkage void do_general_protection(struct cpu_user_regs *regs) panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code); } -static void nmi_softirq(void) +static void nmi_action(unsigned long unused) { /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */ vcpu_kick(dom0->vcpu[0]); } +static DECLARE_TASKLET(nmi_tasklet, nmi_action, 0); + static void nmi_dom0_report(unsigned int reason_idx) { struct domain *d; @@ -2593,7 +2656,7 @@ static void nmi_dom0_report(unsigned int reason_idx) set_bit(reason_idx, nmi_reason(d)); if ( !test_and_set_bool(v->nmi_pending) ) - raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */ + tasklet_schedule(&nmi_tasklet); /* not safe to wake a vcpu here */ } asmlinkage void mem_parity_error(struct cpu_user_regs *regs) @@ -2871,8 +2934,6 @@ void __init trap_init(void) percpu_traps_init(); cpu_init(); - - open_softirq(NMI_SOFTIRQ, nmi_softirq); } long register_guest_nmi_callback(unsigned long address) diff --git a/xen/arch/x86/x86_64/compat/mm.c b/xen/arch/x86/x86_64/compat/mm.c index 256f7a5ac8..a1de1bab27 100644 --- a/xen/arch/x86/x86_64/compat/mm.c +++ b/xen/arch/x86/x86_64/compat/mm.c @@ -28,12 +28,12 @@ int compat_set_gdt(XEN_GUEST_HANDLE(uint) frame_list, unsigned int entries) guest_handle_add_offset(frame_list, 1); } - LOCK_BIGLOCK(current->domain); + domain_lock(current->domain); if ( (ret = set_gdt(current, frames, entries)) == 0 ) flush_tlb_local(); - UNLOCK_BIGLOCK(current->domain); + domain_unlock(current->domain); return ret; } diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index f9f33e0a88..3d79657989 100644 --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -59,7 +59,7 @@ void *alloc_xen_pagetable(void) if ( !early_boot ) { - struct page_info *pg = alloc_domheap_page(NULL); + struct page_info *pg = alloc_domheap_page(NULL, 0); BUG_ON(pg == NULL); return page_to_virt(pg); } @@ -108,7 +108,7 @@ void __init paging_init(void) struct page_info *l1_pg, *l2_pg, *l3_pg; /* Create user-accessible L2 directory to map the MPT for guests. 
*/ - if ( (l3_pg = alloc_domheap_page(NULL)) == NULL ) + if ( (l3_pg = alloc_domheap_page(NULL, 0)) == NULL ) goto nomem; l3_ro_mpt = page_to_virt(l3_pg); clear_page(l3_ro_mpt); @@ -134,7 +134,7 @@ void __init paging_init(void) 1UL << L2_PAGETABLE_SHIFT); if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) ) { - if ( (l2_pg = alloc_domheap_page(NULL)) == NULL ) + if ( (l2_pg = alloc_domheap_page(NULL, 0)) == NULL ) goto nomem; va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT); l2_ro_mpt = page_to_virt(l2_pg); @@ -154,7 +154,7 @@ void __init paging_init(void) l4_table_offset(HIRO_COMPAT_MPT_VIRT_START)); l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset( HIRO_COMPAT_MPT_VIRT_START)]); - if ( (l2_pg = alloc_domheap_page(NULL)) == NULL ) + if ( (l2_pg = alloc_domheap_page(NULL, 0)) == NULL ) goto nomem; compat_idle_pg_table_l2 = l2_ro_mpt = page_to_virt(l2_pg); clear_page(l2_ro_mpt);
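Taken together, the new pci.c and the guest_io_read()/guest_io_write() hooks funnel every config-space access through one spinlocked accessor pair: writes to 0xcf8 are merely latched in arch.pci_cf8, and data-port accesses to 0xcfc-0xcff from a privileged domain are cracked into at most one dword-contained cycle (size = min(bytes, 4 - (port & 3)), with a size of 3 rounded down to 2) before being replayed via pci_conf_read()/pci_conf_write(). A hypothetical caller of the new helpers (illustrative only, not part of this patch) could probe bus 0 like this:

    #include <xen/lib.h>
    #include <xen/pci.h>

    /* Illustrative only: scan function 0 of every device on bus 0.
     * An all-ones vendor ID (0xffff) means no device answered. */
    static void __init probe_bus0(void)
    {
        unsigned int dev;

        for ( dev = 0; dev < 32; dev++ )
        {
            uint16_t vendor = pci_conf_read16(0, dev, 0, 0x00);

            if ( vendor == 0xffff )
                continue; /* empty slot */

            printk("00:%02x.0 vendor %04x device %04x\n",
                   dev, vendor, pci_conf_read16(0, dev, 0, 0x02));
        }
    }

Because each pci_conf_read16() call holds pci_config_lock across the 0xcf8 address write and the 0xcfc data read, concurrent callers (including the replayed dom0 accesses) can no longer interleave address and data cycles.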