Diffstat (limited to 'xen/arch/x86')
34 files changed, 919 insertions, 549 deletions
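This changeset touches several subsystems, but its most mechanical component is the rename of Xen's "ac_timer" interface to plain "timer" (xen/ac_timer.h becomes xen/timer.h), visible below in apic.c, nmi.c, time.c and the vmx_* files. As a reading aid, here is a minimal sketch of the renamed interface as exercised by the call sites in this diff; the heartbeat_* names are hypothetical, and the declarations are assumed to come from the new xen/timer.h:

    #include <xen/timer.h>   /* formerly <xen/ac_timer.h> */
    #include <xen/time.h>    /* NOW(), MILLISECS() */

    static struct timer heartbeat;          /* formerly struct ac_timer */

    static void heartbeat_fn(void *unused)
    {
        /* Periodic work would go here; re-arm for one second from now. */
        set_timer(&heartbeat, NOW() + MILLISECS(1000));  /* was set_ac_timer() */
    }

    static void heartbeat_start(unsigned int cpu)
    {
        init_timer(&heartbeat, heartbeat_fn, NULL, cpu); /* was init_ac_timer() */
        set_timer(&heartbeat, NOW());
    }

    static void heartbeat_stop(void)
    {
        stop_timer(&heartbeat);  /* was rem_ac_timer(); timer may be re-armed */
        kill_timer(&heartbeat);  /* new: deactivate and retire the timer */
    }

The related renames to keep in mind while reading are AC_TIMER_SOFTIRQ to TIMER_SOFTIRQ and active_ac_timer() to active_timer(). Call semantics are otherwise unchanged, except that former rem_ac_timer() callers now choose between stop_timer() (deactivate only, as in vmx_vlapic.c) and kill_timer() (deactivate and forbid further use, as in vmx_relinquish_resources()).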
diff --git a/xen/arch/x86/apic.c b/xen/arch/x86/apic.c index 1a3d5f591e..7eb6000b37 100644 --- a/xen/arch/x86/apic.c +++ b/xen/arch/x86/apic.c @@ -870,7 +870,7 @@ void enable_APIC_timer(void) * returns 1 on success * returns 0 if the timeout value is too small or in the past. */ -int reprogram_ac_timer(s_time_t timeout) +int reprogram_timer(s_time_t timeout) { s_time_t now; s_time_t expire; @@ -931,7 +931,7 @@ void smp_apic_timer_interrupt(struct cpu_user_regs * regs) { ack_APIC_irq(); perfc_incrc(apic_timer); - raise_softirq(AC_TIMER_SOFTIRQ); + raise_softirq(TIMER_SOFTIRQ); } /* diff --git a/xen/arch/x86/boot/x86_32.S b/xen/arch/x86/boot/x86_32.S index b98e1c72bc..5534b2621b 100644 --- a/xen/arch/x86/boot/x86_32.S +++ b/xen/arch/x86/boot/x86_32.S @@ -100,7 +100,7 @@ __start: 1: stosl /* low mappings cover as much physmem as possible */ add $4,%edi add $(1<<L2_PAGETABLE_SHIFT),%eax - cmp $__HYPERVISOR_VIRT_START+0xe3,%eax + cmp $HYPERVISOR_VIRT_START+0xe3,%eax jne 1b #else /* Initialize low and high mappings of all memory with 4MB pages */ @@ -113,7 +113,7 @@ __start: jne 1b 1: stosl /* low mappings cover as much physmem as possible */ add $(1<<L2_PAGETABLE_SHIFT),%eax - cmp $__HYPERVISOR_VIRT_START+0xe3,%eax + cmp $HYPERVISOR_VIRT_START+0xe3,%eax jne 1b #endif diff --git a/xen/arch/x86/dm/i8259.c b/xen/arch/x86/dm/i8259.c index c0d735dc52..8a27835e9f 100644 --- a/xen/arch/x86/dm/i8259.c +++ b/xen/arch/x86/dm/i8259.c @@ -29,7 +29,7 @@ #include <xen/lib.h> #include <xen/errno.h> #include <xen/sched.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #include <asm/vmx.h> #include <asm/vmx_vpic.h> #include <asm/current.h> diff --git a/xen/arch/x86/dm/vmx_vioapic.c b/xen/arch/x86/dm/vmx_vioapic.c index 769eb59f22..201788e858 100644 --- a/xen/arch/x86/dm/vmx_vioapic.c +++ b/xen/arch/x86/dm/vmx_vioapic.c @@ -37,7 +37,7 @@ #include <xen/lib.h> #include <xen/errno.h> #include <xen/sched.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #include <asm/vmx.h> #include <asm/vmx_vpic.h> #include <asm/current.h> diff --git a/xen/arch/x86/dom0_ops.c b/xen/arch/x86/dom0_ops.c index 5a4f493ce0..1ee7efd37b 100644 --- a/xen/arch/x86/dom0_ops.c +++ b/xen/arch/x86/dom0_ops.c @@ -36,13 +36,13 @@ static unsigned long msr_hi; static void write_msr_for(void *unused) { - if ( ((1 << current->processor) & msr_cpu_mask) ) + if ( ((1 << smp_processor_id()) & msr_cpu_mask) ) (void)wrmsr_user(msr_addr, msr_lo, msr_hi); } static void read_msr_for(void *unused) { - if ( ((1 << current->processor) & msr_cpu_mask) ) + if ( ((1 << smp_processor_id()) & msr_cpu_mask) ) (void)rdmsr_user(msr_addr, msr_lo, msr_hi); } @@ -103,12 +103,27 @@ long arch_do_dom0_op(dom0_op_t *op, dom0_op_t *u_dom0_op) op->u.add_memtype.nr_pfns, op->u.add_memtype.type, 1); + if (ret > 0) + { + (void)__put_user(0, &u_dom0_op->u.add_memtype.handle); + (void)__put_user(ret, &u_dom0_op->u.add_memtype.reg); + ret = 0; + } } break; case DOM0_DEL_MEMTYPE: { - ret = mtrr_del_page(op->u.del_memtype.reg, 0, 0); + if (op->u.del_memtype.handle == 0 + /* mtrr/main.c otherwise does a lookup */ + && (int)op->u.del_memtype.reg >= 0) + { + ret = mtrr_del_page(op->u.del_memtype.reg, 0, 0); + if (ret > 0) + ret = 0; + } + else + ret = -EINVAL; } break; @@ -179,7 +194,7 @@ long arch_do_dom0_op(dom0_op_t *op, dom0_op_t *u_dom0_op) memcpy(pi->hw_cap, boot_cpu_data.x86_capability, NCAPINTS*4); ret = 0; if ( copy_to_user(u_dom0_op, op, sizeof(*op)) ) - ret = -EFAULT; + ret = -EFAULT; } break; diff --git a/xen/arch/x86/domain.c 
b/xen/arch/x86/domain.c index d905f9dfbf..19c29d084c 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -46,17 +46,16 @@ boolean_param("noreboot", opt_noreboot); struct percpu_ctxt { struct vcpu *curr_vcpu; - unsigned int context_not_finalised; unsigned int dirty_segment_mask; } __cacheline_aligned; static struct percpu_ctxt percpu_ctxt[NR_CPUS]; -static void continue_idle_task(struct vcpu *v) +static void continue_idle_domain(struct vcpu *v) { reset_stack_and_jump(idle_loop); } -static void continue_nonidle_task(struct vcpu *v) +static void continue_nonidle_domain(struct vcpu *v) { reset_stack_and_jump(ret_from_intr); } @@ -92,10 +91,9 @@ void startup_cpu_idle_loop(void) { struct vcpu *v = current; - ASSERT(is_idle_task(v->domain)); - percpu_ctxt[smp_processor_id()].curr_vcpu = v; - cpu_set(smp_processor_id(), v->domain->cpumask); - v->arch.schedule_tail = continue_idle_task; + ASSERT(is_idle_vcpu(v)); + cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask); + cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask); reset_stack_and_jump(idle_loop); } @@ -217,14 +215,20 @@ struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id) memset(v, 0, sizeof(*v)); - memcpy(&v->arch, &idle0_vcpu.arch, sizeof(v->arch)); + memcpy(&v->arch, &idle_vcpu[0]->arch, sizeof(v->arch)); v->arch.flags = TF_kernel_mode; + if ( is_idle_domain(d) ) + { + percpu_ctxt[vcpu_id].curr_vcpu = v; + v->arch.schedule_tail = continue_idle_domain; + } + if ( (v->vcpu_id = vcpu_id) != 0 ) { v->arch.schedule_tail = d->vcpu[0]->arch.schedule_tail; v->arch.perdomain_ptes = - d->arch.mm_perdomain_pt + (vcpu_id << PDPT_VCPU_SHIFT); + d->arch.mm_perdomain_pt + (vcpu_id << GDT_LDT_VCPU_SHIFT); } return v; @@ -259,32 +263,11 @@ int arch_do_createdomain(struct vcpu *v) int i; #endif - if ( is_idle_task(d) ) - return 0; - - d->arch.ioport_caps = - rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex); - if ( d->arch.ioport_caps == NULL ) - return -ENOMEM; - - if ( (d->shared_info = alloc_xenheap_page()) == NULL ) - return -ENOMEM; - - if ( (rc = ptwr_init(d)) != 0 ) - { - free_xenheap_page(d->shared_info); - return rc; - } - - v->arch.schedule_tail = continue_nonidle_task; - - memset(d->shared_info, 0, PAGE_SIZE); - v->vcpu_info = &d->shared_info->vcpu_info[v->vcpu_id]; - v->cpumap = CPUMAP_RUNANYWHERE; - SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d); - pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)); d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order); + if ( d->arch.mm_perdomain_pt == NULL ) + goto fail_nomem; + memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order); v->arch.perdomain_ptes = d->arch.mm_perdomain_pt; @@ -297,49 +280,73 @@ int arch_do_createdomain(struct vcpu *v) */ gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR); for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ ) - d->arch.mm_perdomain_pt[ - (vcpuid << PDPT_VCPU_SHIFT) + FIRST_RESERVED_GDT_PAGE] = gdt_l1e; + d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) + + FIRST_RESERVED_GDT_PAGE)] = gdt_l1e; v->arch.guest_vtable = __linear_l2_table; v->arch.shadow_vtable = __shadow_linear_l2_table; -#ifdef __x86_64__ +#if defined(__i386__) + + mapcache_init(d); + +#else /* __x86_64__ */ + v->arch.guest_vl3table = __linear_l3_table; v->arch.guest_vl4table = __linear_l4_table; d->arch.mm_perdomain_l2 = alloc_xenheap_page(); + d->arch.mm_perdomain_l3 = alloc_xenheap_page(); + if ( (d->arch.mm_perdomain_l2 == NULL) || + (d->arch.mm_perdomain_l3 == NULL) ) + goto fail_nomem; + 
memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE); for ( i = 0; i < (1 << pdpt_order); i++ ) d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] = l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i, __PAGE_HYPERVISOR); - d->arch.mm_perdomain_l3 = alloc_xenheap_page(); memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE); d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] = l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2), __PAGE_HYPERVISOR); -#endif + +#endif /* __x86_64__ */ shadow_lock_init(d); INIT_LIST_HEAD(&d->arch.free_shadow_frames); - return 0; -} + if ( !is_idle_domain(d) ) + { + d->arch.ioport_caps = + rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex); + if ( d->arch.ioport_caps == NULL ) + goto fail_nomem; -void vcpu_migrate_cpu(struct vcpu *v, int newcpu) -{ - if ( v->processor == newcpu ) - return; + if ( (d->shared_info = alloc_xenheap_page()) == NULL ) + goto fail_nomem; - set_bit(_VCPUF_cpu_migrated, &v->vcpu_flags); - v->processor = newcpu; + if ( (rc = ptwr_init(d)) != 0 ) + goto fail_nomem; - if ( VMX_DOMAIN(v) ) - { - __vmpclear(virt_to_phys(v->arch.arch_vmx.vmcs)); - v->arch.schedule_tail = arch_vmx_do_relaunch; + memset(d->shared_info, 0, PAGE_SIZE); + v->vcpu_info = &d->shared_info->vcpu_info[v->vcpu_id]; + SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d); + + v->arch.schedule_tail = continue_nonidle_domain; } + + return 0; + + fail_nomem: + free_xenheap_page(d->shared_info); +#ifdef __x86_64__ + free_xenheap_page(d->arch.mm_perdomain_l2); + free_xenheap_page(d->arch.mm_perdomain_l3); +#endif + free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order); + return -ENOMEM; } /* This is called by arch_final_setup_guest and do_boot_vcpu */ @@ -473,14 +480,6 @@ void new_thread(struct vcpu *d, #ifdef __x86_64__ -void toggle_guest_mode(struct vcpu *v) -{ - v->arch.flags ^= TF_kernel_mode; - __asm__ __volatile__ ( "swapgs" ); - update_pagetables(v); - write_ptbase(v); -} - #define loadsegment(seg,value) ({ \ int __r = 1; \ __asm__ __volatile__ ( \ @@ -650,35 +649,6 @@ static void save_segments(struct vcpu *v) percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask; } -long do_switch_to_user(void) -{ - struct cpu_user_regs *regs = guest_cpu_user_regs(); - struct switch_to_user stu; - struct vcpu *v = current; - - if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) || - unlikely(pagetable_get_paddr(v->arch.guest_table_user) == 0) ) - return -EFAULT; - - toggle_guest_mode(v); - - regs->rip = stu.rip; - regs->cs = stu.cs | 3; /* force guest privilege */ - regs->rflags = (stu.rflags & ~(EF_IOPL|EF_VM)) | EF_IE; - regs->rsp = stu.rsp; - regs->ss = stu.ss | 3; /* force guest privilege */ - - if ( !(stu.flags & VGCF_IN_SYSCALL) ) - { - regs->entry_vector = 0; - regs->r11 = stu.r11; - regs->rcx = stu.rcx; - } - - /* Saved %rax gets written back to regs->rax in entry.S. 
*/ - return stu.rax; -} - #define switch_kernel_stack(_n,_c) ((void)0) #elif defined(__i386__) @@ -705,7 +675,10 @@ static void __context_switch(void) struct vcpu *p = percpu_ctxt[cpu].curr_vcpu; struct vcpu *n = current; - if ( !is_idle_task(p->domain) ) + ASSERT(p != n); + ASSERT(cpus_empty(n->vcpu_dirty_cpumask)); + + if ( !is_idle_vcpu(p) ) { memcpy(&p->arch.guest_context.user_regs, stack_regs, @@ -714,7 +687,7 @@ static void __context_switch(void) save_segments(p); } - if ( !is_idle_task(n->domain) ) + if ( !is_idle_vcpu(n) ) { memcpy(stack_regs, &n->arch.guest_context.user_regs, @@ -740,7 +713,8 @@ static void __context_switch(void) } if ( p->domain != n->domain ) - cpu_set(cpu, n->domain->cpumask); + cpu_set(cpu, n->domain->domain_dirty_cpumask); + cpu_set(cpu, n->vcpu_dirty_cpumask); write_ptbase(n); @@ -753,7 +727,8 @@ static void __context_switch(void) } if ( p->domain != n->domain ) - cpu_clear(cpu, p->domain->cpumask); + cpu_clear(cpu, p->domain->domain_dirty_cpumask); + cpu_clear(cpu, p->vcpu_dirty_cpumask); percpu_ctxt[cpu].curr_vcpu = n; } @@ -762,29 +737,32 @@ static void __context_switch(void) void context_switch(struct vcpu *prev, struct vcpu *next) { unsigned int cpu = smp_processor_id(); + cpumask_t dirty_mask = next->vcpu_dirty_cpumask; - ASSERT(!local_irq_is_enabled()); - - set_current(next); + ASSERT(local_irq_is_enabled()); - if ( (percpu_ctxt[cpu].curr_vcpu != next) && !is_idle_task(next->domain) ) + /* Allow at most one CPU at a time to be dirty. */ + ASSERT(cpus_weight(dirty_mask) <= 1); + if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) ) { - __context_switch(); - percpu_ctxt[cpu].context_not_finalised = 1; + /* Other cpus call __sync_lazy_execstate from flush ipi handler. */ + flush_tlb_mask(dirty_mask); } -} -void context_switch_finalise(struct vcpu *next) -{ - unsigned int cpu = smp_processor_id(); + local_irq_disable(); - ASSERT(local_irq_is_enabled()); + set_current(next); - if ( percpu_ctxt[cpu].context_not_finalised ) + if ( (percpu_ctxt[cpu].curr_vcpu == next) || is_idle_vcpu(next) ) + { + local_irq_enable(); + } + else { - percpu_ctxt[cpu].context_not_finalised = 0; + __context_switch(); - BUG_ON(percpu_ctxt[cpu].curr_vcpu != next); + /* Re-enable interrupts before restoring state which may fault. */ + local_irq_enable(); if ( VMX_DOMAIN(next) ) { @@ -798,6 +776,8 @@ void context_switch_finalise(struct vcpu *next) } } + context_saved(prev); + schedule_tail(next); BUG(); } @@ -827,20 +807,11 @@ int __sync_lazy_execstate(void) void sync_vcpu_execstate(struct vcpu *v) { - unsigned int cpu = v->processor; - - if ( !cpu_isset(cpu, v->domain->cpumask) ) - return; - - if ( cpu == smp_processor_id() ) - { + if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) ) (void)__sync_lazy_execstate(); - } - else - { - /* Other cpus call __sync_lazy_execstate from flush ipi handler. */ - flush_tlb_mask(cpumask_of_cpu(cpu)); - } + + /* Other cpus call __sync_lazy_execstate from flush ipi handler. 
*/ + flush_tlb_mask(v->vcpu_dirty_cpumask); } unsigned long __hypercall_create_continuation( @@ -966,7 +937,7 @@ void domain_relinquish_resources(struct domain *d) struct vcpu *v; unsigned long pfn; - BUG_ON(!cpus_empty(d->cpumask)); + BUG_ON(!cpus_empty(d->domain_dirty_cpumask)); ptwr_destroy(d); diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c index d08f2c12fb..84d84a66cf 100644 --- a/xen/arch/x86/domain_build.c +++ b/xen/arch/x86/domain_build.c @@ -366,27 +366,20 @@ int construct_dom0(struct domain *d, l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] = l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR); } - { - unsigned long va; - for (va = PERDOMAIN_VIRT_START; va < PERDOMAIN_VIRT_END; - va += (1 << L2_PAGETABLE_SHIFT)) { - l2tab[va >> L2_PAGETABLE_SHIFT] = - l2e_from_paddr(__pa(d->arch.mm_perdomain_pt) + - (va-PERDOMAIN_VIRT_START), - __PAGE_HYPERVISOR); - } - } v->arch.guest_table = mk_pagetable((unsigned long)l3start); #else l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE; memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE); l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = l2e_from_paddr((unsigned long)l2start, __PAGE_HYPERVISOR); - l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = - l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR); v->arch.guest_table = mk_pagetable((unsigned long)l2start); #endif + for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) + l2tab[l2_linear_offset(PERDOMAIN_VIRT_START) + i] = + l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i, + __PAGE_HYPERVISOR); + l2tab += l2_linear_offset(dsi.v_start); mfn = alloc_spfn; for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ ) diff --git a/xen/arch/x86/idle0_task.c b/xen/arch/x86/idle0_task.c deleted file mode 100644 index b876c619ef..0000000000 --- a/xen/arch/x86/idle0_task.c +++ /dev/null @@ -1,27 +0,0 @@ - -#include <xen/config.h> -#include <xen/sched.h> -#include <asm/desc.h> - -struct domain idle0_domain = { - domain_id: IDLE_DOMAIN_ID, - domain_flags:DOMF_idle_domain, - refcnt: ATOMIC_INIT(1) -}; - -struct vcpu idle0_vcpu = { - processor: 0, - domain: &idle0_domain -}; - -struct tss_struct init_tss[NR_CPUS]; - -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c index 7dd6bd590a..841bd10a03 100644 --- a/xen/arch/x86/io_apic.c +++ b/xen/arch/x86/io_apic.c @@ -1807,3 +1807,47 @@ int ioapic_guest_write(int apicid, int address, u32 val) return 0; } + +void dump_ioapic_irq_info(void) +{ + struct irq_pin_list *entry; + struct IO_APIC_route_entry rte; + unsigned int irq, pin, printed = 0; + unsigned long flags; + + for ( irq = 0; irq < NR_IRQS; irq++ ) + { + entry = &irq_2_pin[irq]; + if ( entry->pin == -1 ) + continue; + + if ( !printed++ ) + printk("IO-APIC interrupt information:\n"); + + printk(" IRQ%3d Vec%3d:\n", irq, irq_to_vector(irq)); + + for ( ; ; ) + { + pin = entry->pin; + + printk(" Apic 0x%02x, Pin %2d: ", entry->apic, pin); + + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&rte) + 0) = io_apic_read(entry->apic, 0x10 + 2 * pin); + *(((int *)&rte) + 1) = io_apic_read(entry->apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk("vector=%u, delivery_mode=%u, dest_mode=%s, " + "delivery_status=%d, polarity=%d, irr=%d, " + "trigger=%s, mask=%d\n", + rte.vector, rte.delivery_mode, + rte.dest_mode ? 
"logical" : "physical", + rte.delivery_status, rte.polarity, rte.irr, + rte.trigger ? "level" : "edge", rte.mask); + + if ( entry->next == 0 ) + break; + entry = &irq_2_pin[entry->next]; + } + } +} diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c index a1aee360c3..d81d8749a6 100644 --- a/xen/arch/x86/irq.c +++ b/xen/arch/x86/irq.c @@ -12,6 +12,7 @@ #include <xen/irq.h> #include <xen/perfc.h> #include <xen/sched.h> +#include <xen/keyhandler.h> #include <asm/current.h> #include <asm/smpboot.h> @@ -198,15 +199,21 @@ int pirq_guest_unmask(struct domain *d) int pirq_guest_bind(struct vcpu *v, int irq, int will_share) { - unsigned int vector = irq_to_vector(irq); - irq_desc_t *desc = &irq_desc[vector]; + unsigned int vector; + irq_desc_t *desc; irq_guest_action_t *action; unsigned long flags; int rc = 0; cpumask_t cpumask = CPU_MASK_NONE; + if ( (irq < 0) || (irq >= NR_IRQS) ) + return -EINVAL; + + vector = irq_to_vector(irq); if ( vector == 0 ) - return -EBUSY; + return -EINVAL; + + desc = &irq_desc[vector]; spin_lock_irqsave(&desc->lock, flags); @@ -305,3 +312,71 @@ int pirq_guest_unbind(struct domain *d, int irq) spin_unlock_irqrestore(&desc->lock, flags); return 0; } + +extern void dump_ioapic_irq_info(void); + +static void dump_irqs(unsigned char key) +{ + int i, irq, vector; + irq_desc_t *desc; + irq_guest_action_t *action; + struct domain *d; + unsigned long flags; + + printk("Guest interrupt information:\n"); + + for ( irq = 0; irq < NR_IRQS; irq++ ) + { + vector = irq_to_vector(irq); + if ( vector == 0 ) + continue; + + desc = &irq_desc[vector]; + + spin_lock_irqsave(&desc->lock, flags); + + if ( desc->status & IRQ_GUEST ) + { + action = (irq_guest_action_t *)desc->action; + + printk(" IRQ%3d Vec%3d: type=%-15s status=%08x " + "in-flight=%d domain-list=", + irq, vector, desc->handler->typename, + desc->status, action->in_flight); + + for ( i = 0; i < action->nr_guests; i++ ) + { + d = action->guest[i]; + printk("%u(%c%c%c%c)", + d->domain_id, + (test_bit(d->pirq_to_evtchn[irq], + &d->shared_info->evtchn_pending[0]) ? + 'P' : '-'), + (test_bit(d->pirq_to_evtchn[irq]/BITS_PER_LONG, + &d->shared_info->vcpu_info[0]. + evtchn_pending_sel) ? + 'S' : '-'), + (test_bit(d->pirq_to_evtchn[irq], + &d->shared_info->evtchn_mask[0]) ? + 'M' : '-'), + (test_bit(irq, &d->pirq_mask) ? + 'M' : '-')); + if ( i != action->nr_guests ) + printk(","); + } + + printk("\n"); + } + + spin_unlock_irqrestore(&desc->lock, flags); + } + + dump_ioapic_irq_info(); +} + +static int __init setup_dump_irqs(void) +{ + register_keyhandler('i', dump_irqs, "dump interrupt bindings"); + return 0; +} +__initcall(setup_dump_irqs); diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 683c4b7534..79da37d3ea 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -297,7 +297,6 @@ int map_ldt_shadow_page(unsigned int off) #if defined(__x86_64__) /* If in user mode, switch to kernel mode just to read LDT mapping. 
*/ - extern void toggle_guest_mode(struct vcpu *); int user_mode = !(v->arch.flags & TF_kernel_mode); #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v) #elif defined(__i386__) @@ -841,10 +840,11 @@ static int alloc_l2_table(struct pfn_info *page, unsigned long type) L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t)); pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_from_pfn(pfn, __PAGE_HYPERVISOR); - pl2e[l2_table_offset(PERDOMAIN_VIRT_START)] = - l2e_from_page( - virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt), - __PAGE_HYPERVISOR); + for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) + pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] = + l2e_from_page( + virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i, + __PAGE_HYPERVISOR); #endif unmap_domain_page(pl2e); @@ -1457,7 +1457,8 @@ int get_page_type(struct pfn_info *page, unsigned long type) * was GDT/LDT) but those circumstances should be * very rare. */ - cpumask_t mask = page_get_owner(page)->cpumask; + cpumask_t mask = + page_get_owner(page)->domain_dirty_cpumask; tlbflush_filter(mask, page->tlbflush_timestamp); if ( unlikely(!cpus_empty(mask)) ) @@ -1619,7 +1620,7 @@ static void process_deferred_ops(unsigned int cpu) if ( shadow_mode_enabled(d) ) shadow_sync_all(d); if ( deferred_ops & DOP_FLUSH_ALL_TLBS ) - flush_tlb_mask(d->cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); else local_flush_tlb(); } @@ -1691,7 +1692,7 @@ static inline cpumask_t vcpumask_to_pcpumask( struct domain *d, unsigned long vmask) { unsigned int vcpu_id; - cpumask_t pmask; + cpumask_t pmask = CPU_MASK_NONE; struct vcpu *v; while ( vmask != 0 ) @@ -1700,7 +1701,7 @@ static inline cpumask_t vcpumask_to_pcpumask( vmask &= ~(1UL << vcpu_id); if ( (vcpu_id < MAX_VIRT_CPUS) && ((v = d->vcpu[vcpu_id]) != NULL) ) - cpu_set(v->processor, pmask); + cpus_or(pmask, pmask, v->vcpu_dirty_cpumask); } return pmask; @@ -1869,7 +1870,6 @@ int do_mmuext_op( break; } pmask = vcpumask_to_pcpumask(d, vmask); - cpus_and(pmask, pmask, d->cpumask); if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI ) flush_tlb_mask(pmask); else @@ -1878,11 +1878,11 @@ int do_mmuext_op( } case MMUEXT_TLB_FLUSH_ALL: - flush_tlb_mask(d->cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); break; case MMUEXT_INVLPG_ALL: - flush_tlb_one_mask(d->cpumask, op.arg1.linear_addr); + flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr); break; case MMUEXT_FLUSH_CACHE: @@ -2497,7 +2497,7 @@ int do_update_va_mapping(unsigned long va, u64 val64, l1_pgentry_t val = l1e_from_intpte(val64); struct vcpu *v = current; struct domain *d = v->domain; - unsigned int cpu = v->processor; + unsigned int cpu = smp_processor_id(); unsigned long vmask, bmap_ptr; cpumask_t pmask; int rc = 0; @@ -2548,13 +2548,12 @@ int do_update_va_mapping(unsigned long va, u64 val64, local_flush_tlb(); break; case UVMF_ALL: - flush_tlb_mask(d->cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); break; default: if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) ) rc = -EFAULT; pmask = vcpumask_to_pcpumask(d, vmask); - cpus_and(pmask, pmask, d->cpumask); flush_tlb_mask(pmask); break; } @@ -2569,13 +2568,12 @@ int do_update_va_mapping(unsigned long va, u64 val64, local_flush_tlb_one(va); break; case UVMF_ALL: - flush_tlb_one_mask(d->cpumask, va); + flush_tlb_one_mask(d->domain_dirty_cpumask, va); break; default: if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) ) rc = -EFAULT; pmask = vcpumask_to_pcpumask(d, vmask); - cpus_and(pmask, pmask, d->cpumask); flush_tlb_one_mask(pmask, va); break; } @@ -2972,7 +2970,6 @@ void 
ptwr_flush(struct domain *d, const int which) #ifdef CONFIG_X86_64 struct vcpu *v = current; - extern void toggle_guest_mode(struct vcpu *); int user_mode = !(v->arch.flags & TF_kernel_mode); #endif @@ -3002,7 +2999,7 @@ void ptwr_flush(struct domain *d, const int which) BUG(); } PTWR_PRINTK("[%c] disconnected_l1va at %p is %"PRIpte"\n", - PTWR_PRINT_WHICH, ptep, pte.l1); + PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte)); l1e_remove_flags(pte, _PAGE_RW); /* Write-protect the p.t. page in the guest page table. */ @@ -3018,20 +3015,33 @@ void ptwr_flush(struct domain *d, const int which) /* Ensure that there are no stale writable mappings in any TLB. */ /* NB. INVLPG is a serialising instruction: flushes pending updates. */ - flush_tlb_one_mask(d->cpumask, l1va); + flush_tlb_one_mask(d->domain_dirty_cpumask, l1va); PTWR_PRINTK("[%c] disconnected_l1va at %p now %"PRIpte"\n", - PTWR_PRINT_WHICH, ptep, pte.l1); + PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte)); /* * STEP 2. Validate any modified PTEs. */ - pl1e = d->arch.ptwr[which].pl1e; - modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page); - unmap_domain_page(pl1e); - perfc_incr_histo(wpt_updates, modified, PT_UPDATES); - ptwr_eip_stat_update(d->arch.ptwr[which].eip, d->domain_id, modified); - d->arch.ptwr[which].prev_nr_updates = modified; + if ( likely(d == current->domain) ) + { + pl1e = map_domain_page(l1e_get_pfn(pte)); + modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page); + unmap_domain_page(pl1e); + perfc_incr_histo(wpt_updates, modified, PT_UPDATES); + ptwr_eip_stat_update(d->arch.ptwr[which].eip, d->domain_id, modified); + d->arch.ptwr[which].prev_nr_updates = modified; + } + else + { + /* + * Must make a temporary global mapping, since we are running in the + * wrong address space, so no access to our own mapcache. + */ + pl1e = map_domain_page_global(l1e_get_pfn(pte)); + modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page); + unmap_domain_page_global(pl1e); + } /* * STEP 3. Reattach the L1 p.t. page into the current address space. @@ -3209,7 +3219,7 @@ int ptwr_do_page_fault(struct domain *d, unsigned long addr, { unsigned long pfn; struct pfn_info *page; - l1_pgentry_t pte; + l1_pgentry_t *pl1e, pte; l2_pgentry_t *pl2e, l2e; int which, flags; unsigned long l2_idx; @@ -3342,15 +3352,14 @@ int ptwr_do_page_fault(struct domain *d, unsigned long addr, if ( which == PTWR_PT_ACTIVE ) { l2e_remove_flags(*pl2e, _PAGE_PRESENT); - flush_tlb_mask(d->cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); } /* Temporarily map the L1 page, and make a copy of it. */ - d->arch.ptwr[which].pl1e = map_domain_page(pfn); - memcpy(d->arch.ptwr[which].page, - d->arch.ptwr[which].pl1e, - L1_PAGETABLE_ENTRIES * sizeof(l1_pgentry_t)); - + pl1e = map_domain_page(pfn); + memcpy(d->arch.ptwr[which].page, pl1e, PAGE_SIZE); + unmap_domain_page(pl1e); + /* Finally, make the p.t. page writable by the guest OS. */ l1e_add_flags(pte, _PAGE_RW); if ( unlikely(__put_user(pte.l1, @@ -3359,7 +3368,6 @@ int ptwr_do_page_fault(struct domain *d, unsigned long addr, MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *) &linear_pg_table[l1_linear_offset(addr)]); /* Toss the writable pagetable state and crash. 
*/ - unmap_domain_page(d->arch.ptwr[which].pl1e); d->arch.ptwr[which].l1va = 0; domain_crash(d); return 0; @@ -3369,7 +3377,7 @@ int ptwr_do_page_fault(struct domain *d, unsigned long addr, emulate: if ( x86_emulate_memop(guest_cpu_user_regs(), addr, - &ptwr_mem_emulator, BITS_PER_LONG/8) ) + &ptwr_mem_emulator, X86EMUL_MODE_HOST) ) return 0; perfc_incrc(ptwr_emulations); return EXCRET_fault_fixed; diff --git a/xen/arch/x86/nmi.c b/xen/arch/x86/nmi.c index b63036ac54..96c55572cd 100644 --- a/xen/arch/x86/nmi.c +++ b/xen/arch/x86/nmi.c @@ -23,18 +23,20 @@ #include <xen/sched.h> #include <xen/console.h> #include <xen/smp.h> +#include <xen/keyhandler.h> #include <asm/current.h> #include <asm/mc146818rtc.h> #include <asm/msr.h> #include <asm/mpspec.h> #include <asm/debugger.h> #include <asm/div64.h> +#include <asm/apic.h> unsigned int nmi_watchdog = NMI_NONE; static unsigned int nmi_hz = HZ; static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ static unsigned int nmi_p4_cccr_val; -static struct ac_timer nmi_timer[NR_CPUS]; +static struct timer nmi_timer[NR_CPUS]; static unsigned int nmi_timer_ticks[NR_CPUS]; /* @@ -132,7 +134,7 @@ static void nmi_timer_fn(void *unused) { int cpu = smp_processor_id(); nmi_timer_ticks[cpu]++; - set_ac_timer(&nmi_timer[cpu], NOW() + MILLISECS(1000)); + set_timer(&nmi_timer[cpu], NOW() + MILLISECS(1000)); } static void disable_lapic_nmi_watchdog(void) @@ -308,8 +310,6 @@ static int __pminit setup_p4_watchdog(void) void __pminit setup_apic_nmi_watchdog(void) { - int cpu = smp_processor_id(); - if (!nmi_watchdog) return; @@ -344,49 +344,37 @@ void __pminit setup_apic_nmi_watchdog(void) lapic_nmi_owner = LAPIC_NMI_WATCHDOG; nmi_active = 1; - - init_ac_timer(&nmi_timer[cpu], nmi_timer_fn, NULL, cpu); } static unsigned int last_irq_sums [NR_CPUS], alert_counter [NR_CPUS]; -static spinlock_t watchdog_lock = SPIN_LOCK_UNLOCKED; -static unsigned int watchdog_disable_count = 1; -static unsigned int watchdog_on; +static atomic_t watchdog_disable_count = ATOMIC_INIT(1); void watchdog_disable(void) { - unsigned long flags; - - spin_lock_irqsave(&watchdog_lock, flags); - - if ( watchdog_disable_count++ == 0 ) - watchdog_on = 0; - - spin_unlock_irqrestore(&watchdog_lock, flags); + atomic_inc(&watchdog_disable_count); } void watchdog_enable(void) { - unsigned int cpu; - unsigned long flags; + static unsigned long heartbeat_initialised; + unsigned int cpu; - spin_lock_irqsave(&watchdog_lock, flags); + if ( !atomic_dec_and_test(&watchdog_disable_count) || + test_and_set_bit(0, &heartbeat_initialised) ) + return; - if ( --watchdog_disable_count == 0 ) + /* + * Activate periodic heartbeats. We cannot do this earlier during + * setup because the timer infrastructure is not available. + */ + for_each_online_cpu ( cpu ) { - watchdog_on = 1; - /* - * Ensure periodic heartbeats are active. We cannot do this earlier - * during setup because the timer infrastructure is not available. - */ - for_each_online_cpu ( cpu ) - set_ac_timer(&nmi_timer[cpu], NOW()); + init_timer(&nmi_timer[cpu], nmi_timer_fn, NULL, cpu); + set_timer(&nmi_timer[cpu], NOW()); } - - spin_unlock_irqrestore(&watchdog_lock, flags); } void nmi_watchdog_tick(struct cpu_user_regs * regs) @@ -395,7 +383,7 @@ void nmi_watchdog_tick(struct cpu_user_regs * regs) sum = nmi_timer_ticks[cpu]; - if ( (last_irq_sums[cpu] == sum) && watchdog_on ) + if ( (last_irq_sums[cpu] == sum) && !atomic_read(&watchdog_disable_count) ) { /* * Ayiee, looks like this CPU is stuck ... 
wait a few IRQs (5 seconds) @@ -440,3 +428,29 @@ void nmi_watchdog_tick(struct cpu_user_regs * regs) write_watchdog_counter(NULL); } } + +/* + * For some reason the destination shorthand for self is not valid + * when used with the NMI delivery mode. This is documented in Tables + * 8-3 and 8-4 in IA32 Reference Manual Volume 3. We send the IPI to + * our own APIC ID explicitly which is valid. + */ +static void do_nmi_trigger(unsigned char key) +{ + u32 id = apic_read(APIC_ID); + + printk("Triggering NMI on APIC ID %x\n", id); + + local_irq_disable(); + apic_wait_icr_idle(); + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(id)); + apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_INT_ASSERT); + local_irq_enable(); +} + +static __init int register_nmi_trigger(void) +{ + register_keyhandler('n', do_nmi_trigger, "trigger an NMI"); + return 0; +} +__initcall(register_nmi_trigger); diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index f27806f8f6..39bf4a523d 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -81,6 +81,10 @@ extern void early_time_init(void); extern void initialize_keytable(void); extern void early_cpu_init(void); +struct tss_struct init_tss[NR_CPUS]; + +struct vcpu *idle_vcpu[NR_CPUS]; + extern unsigned long cpu0_stack[]; struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; @@ -92,8 +96,6 @@ unsigned long mmu_cr4_features = X86_CR4_PSE; #endif EXPORT_SYMBOL(mmu_cr4_features); -struct vcpu *idle_task[NR_CPUS] = { &idle0_vcpu }; - int acpi_disabled; int acpi_force; @@ -144,8 +146,8 @@ static struct e820entry e820_raw[E820MAX]; void __init __start_xen(multiboot_info_t *mbi) { - unsigned long vgdt, gdt_pfn; char *cmdline; + struct domain *idle_domain; unsigned long _initrd_start = 0, _initrd_len = 0; unsigned int initrdidx = 1; module_t *mod = (module_t *)__va(mbi->mods_addr); @@ -163,9 +165,8 @@ void __init __start_xen(multiboot_info_t *mbi) if ( (mbi->flags & MBI_CMDLINE) && (mbi->cmdline != 0) ) cmdline_parse(__va(mbi->cmdline)); - /* Must do this early -- e.g., spinlocks rely on get_current(). */ - set_current(&idle0_vcpu); - set_processor_id(0); + set_current((struct vcpu *)0xfffff000); /* debug sanity */ + set_processor_id(0); /* needed early, for smp_processor_id() */ smp_prepare_boot_cpu(); @@ -343,6 +344,12 @@ void __init __start_xen(multiboot_info_t *mbi) BUG_ON(sizeof(shared_info_t) > PAGE_SIZE); BUG_ON(sizeof(vcpu_info_t) != 64); + /* __foo are defined in public headers. Check they match internal defs. */ + BUG_ON(__HYPERVISOR_VIRT_START != HYPERVISOR_VIRT_START); +#ifdef HYPERVISOR_VIRT_END + BUG_ON(__HYPERVISOR_VIRT_END != HYPERVISOR_VIRT_END); +#endif + init_frametable(); end_boot_allocator(); @@ -376,6 +383,14 @@ void __init __start_xen(multiboot_info_t *mbi) early_cpu_init(); + scheduler_init(); + + idle_domain = do_createdomain(IDLE_DOMAIN_ID, 0); + BUG_ON(idle_domain == NULL); + + set_current(idle_domain->vcpu[0]); + idle_vcpu[0] = current; + paging_init(); /* Unmap the first page of CPU0's stack. */ @@ -388,21 +403,6 @@ void __init __start_xen(multiboot_info_t *mbi) sort_exception_tables(); - if ( arch_do_createdomain(current) != 0 ) - BUG(); - - /* - * Map default GDT into its final positions in the idle page table. As - * noted in arch_do_createdomain(), we must map for every possible VCPU#. 
- */ - vgdt = GDT_VIRT_START(current) + FIRST_RESERVED_GDT_BYTE; - gdt_pfn = virt_to_phys(gdt_table) >> PAGE_SHIFT; - for ( i = 0; i < MAX_VIRT_CPUS; i++ ) - { - map_pages_to_xen(vgdt, gdt_pfn, 1, PAGE_HYPERVISOR); - vgdt += 1 << PDPT_VCPU_VA_SHIFT; - } - find_smp_config(); smp_alloc_memory(); @@ -423,14 +423,12 @@ void __init __start_xen(multiboot_info_t *mbi) trap_init(); - ac_timer_init(); + timer_init(); early_time_init(); arch_init_memory(); - scheduler_init(); - identify_cpu(&boot_cpu_data); if ( cpu_has_fxsr ) set_in_cr4(X86_CR4_OSFXSR); @@ -480,7 +478,8 @@ void __init __start_xen(multiboot_info_t *mbi) schedulers_start(); - watchdog_enable(); + if ( opt_watchdog ) + watchdog_enable(); shadow_mode_init(); diff --git a/xen/arch/x86/shadow.c b/xen/arch/x86/shadow.c index 41b76842fd..b2fd143452 100644 --- a/xen/arch/x86/shadow.c +++ b/xen/arch/x86/shadow.c @@ -469,6 +469,7 @@ static unsigned long shadow_l2_table( { unsigned long smfn; l2_pgentry_t *spl2e; + int i; SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn); @@ -503,9 +504,11 @@ static unsigned long shadow_l2_table( spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_from_pfn(smfn, __PAGE_HYPERVISOR); - spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] = - l2e_from_paddr(__pa(page_get_owner(pfn_to_page(gmfn))->arch.mm_perdomain_pt), - __PAGE_HYPERVISOR); + for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) + spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] = + l2e_from_page(virt_to_page(page_get_owner(pfn_to_page(gmfn))-> + arch.mm_perdomain_pt) + i, + __PAGE_HYPERVISOR); if ( shadow_mode_translate(d) ) // NB: not external { @@ -1800,7 +1803,7 @@ static void sync_all(struct domain *d) } /* Other VCPUs mustn't use the revoked writable mappings. */ - other_vcpus_mask = d->cpumask; + other_vcpus_mask = d->domain_dirty_cpumask; cpu_clear(smp_processor_id(), other_vcpus_mask); flush_tlb_mask(other_vcpus_mask); @@ -2150,8 +2153,8 @@ static void shadow_update_pagetables(struct vcpu *v) if ( max_mode & (SHM_enable | SHM_external) ) { if ( likely(v->arch.guest_vtable != NULL) ) - unmap_domain_page(v->arch.guest_vtable); - v->arch.guest_vtable = map_domain_page(gmfn); + unmap_domain_page_global(v->arch.guest_vtable); + v->arch.guest_vtable = map_domain_page_global(gmfn); } /* @@ -2187,8 +2190,8 @@ static void shadow_update_pagetables(struct vcpu *v) ) { if ( v->arch.shadow_vtable ) - unmap_domain_page(v->arch.shadow_vtable); - v->arch.shadow_vtable = map_domain_page(smfn); + unmap_domain_page_global(v->arch.shadow_vtable); + v->arch.shadow_vtable = map_domain_page_global(smfn); } #if CONFIG_PAGING_LEVELS == 2 @@ -2204,8 +2207,8 @@ static void shadow_update_pagetables(struct vcpu *v) if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) ) hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn); if ( v->arch.hl2_vtable ) - unmap_domain_page(v->arch.hl2_vtable); - v->arch.hl2_vtable = map_domain_page(hl2mfn); + unmap_domain_page_global(v->arch.hl2_vtable); + v->arch.hl2_vtable = map_domain_page_global(hl2mfn); } /* diff --git a/xen/arch/x86/shadow32.c b/xen/arch/x86/shadow32.c index 872c73f545..eb09ea92c5 100644 --- a/xen/arch/x86/shadow32.c +++ b/xen/arch/x86/shadow32.c @@ -726,6 +726,7 @@ static void alloc_monitor_pagetable(struct vcpu *v) l2_pgentry_t *mpl2e; struct pfn_info *mmfn_info; struct domain *d = v->domain; + int i; ASSERT(pagetable_get_paddr(v->arch.monitor_table) == 0); @@ -733,16 +734,17 @@ static void alloc_monitor_pagetable(struct vcpu *v) ASSERT(mmfn_info != NULL); mmfn = page_to_pfn(mmfn_info); - mpl2e = (l2_pgentry_t 
*)map_domain_page(mmfn); + mpl2e = (l2_pgentry_t *)map_domain_page_global(mmfn); memset(mpl2e, 0, PAGE_SIZE); memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); - mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] = - l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), - __PAGE_HYPERVISOR); + for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) + mpl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] = + l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i, + __PAGE_HYPERVISOR); // map the phys_to_machine map into the Read-Only MPT space for this domain mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = @@ -794,7 +796,7 @@ void free_monitor_pagetable(struct vcpu *v) * Then free monitor_table. */ mfn = pagetable_get_pfn(v->arch.monitor_table); - unmap_domain_page(v->arch.monitor_vtable); + unmap_domain_page_global(v->arch.monitor_vtable); free_domheap_page(pfn_to_page(mfn)); v->arch.monitor_table = mk_pagetable(0); @@ -929,7 +931,7 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode) if ( v->arch.guest_vtable && (v->arch.guest_vtable != __linear_l2_table) ) { - unmap_domain_page(v->arch.guest_vtable); + unmap_domain_page_global(v->arch.guest_vtable); } if ( (mode & (SHM_translate | SHM_external)) == SHM_translate ) v->arch.guest_vtable = __linear_l2_table; @@ -942,7 +944,7 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode) if ( v->arch.shadow_vtable && (v->arch.shadow_vtable != __shadow_linear_l2_table) ) { - unmap_domain_page(v->arch.shadow_vtable); + unmap_domain_page_global(v->arch.shadow_vtable); } if ( !(mode & SHM_external) ) v->arch.shadow_vtable = __shadow_linear_l2_table; @@ -955,7 +957,7 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode) if ( v->arch.hl2_vtable && (v->arch.hl2_vtable != __linear_hl2_table) ) { - unmap_domain_page(v->arch.hl2_vtable); + unmap_domain_page_global(v->arch.hl2_vtable); } if ( (mode & (SHM_translate | SHM_external)) == SHM_translate ) v->arch.hl2_vtable = __linear_hl2_table; @@ -1508,6 +1510,7 @@ static unsigned long shadow_l2_table( { unsigned long smfn; l2_pgentry_t *spl2e; + int i; SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn); @@ -1542,9 +1545,11 @@ static unsigned long shadow_l2_table( spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_from_pfn(smfn, __PAGE_HYPERVISOR); - spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] = - l2e_from_paddr(__pa(page_get_owner(pfn_to_page(gmfn))->arch.mm_perdomain_pt), - __PAGE_HYPERVISOR); + for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) + spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] = + l2e_from_page(virt_to_page(page_get_owner(pfn_to_page(gmfn))-> + arch.mm_perdomain_pt) + i, + __PAGE_HYPERVISOR); if ( shadow_mode_translate(d) ) // NB: not external { @@ -2586,7 +2591,7 @@ void __shadow_sync_all(struct domain *d) } /* Other VCPUs mustn't use the revoked writable mappings. 
*/ - other_vcpus_mask = d->cpumask; + other_vcpus_mask = d->domain_dirty_cpumask; cpu_clear(smp_processor_id(), other_vcpus_mask); flush_tlb_mask(other_vcpus_mask); @@ -2906,8 +2911,8 @@ void __update_pagetables(struct vcpu *v) if ( max_mode & (SHM_enable | SHM_external) ) { if ( likely(v->arch.guest_vtable != NULL) ) - unmap_domain_page(v->arch.guest_vtable); - v->arch.guest_vtable = map_domain_page(gmfn); + unmap_domain_page_global(v->arch.guest_vtable); + v->arch.guest_vtable = map_domain_page_global(gmfn); } /* @@ -2932,8 +2937,8 @@ void __update_pagetables(struct vcpu *v) if ( max_mode == SHM_external ) { if ( v->arch.shadow_vtable ) - unmap_domain_page(v->arch.shadow_vtable); - v->arch.shadow_vtable = map_domain_page(smfn); + unmap_domain_page_global(v->arch.shadow_vtable); + v->arch.shadow_vtable = map_domain_page_global(smfn); } /* @@ -2948,8 +2953,8 @@ void __update_pagetables(struct vcpu *v) if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) ) hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn); if ( v->arch.hl2_vtable ) - unmap_domain_page(v->arch.hl2_vtable); - v->arch.hl2_vtable = map_domain_page(hl2mfn); + unmap_domain_page_global(v->arch.hl2_vtable); + v->arch.hl2_vtable = map_domain_page_global(hl2mfn); } /* diff --git a/xen/arch/x86/shadow_public.c b/xen/arch/x86/shadow_public.c index 931a31f83f..bb376bb737 100644 --- a/xen/arch/x86/shadow_public.c +++ b/xen/arch/x86/shadow_public.c @@ -151,6 +151,8 @@ free_shadow_fl1_table(struct domain *d, unsigned long smfn) for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) put_page_from_l1e(pl1e[i], d); + + unmap_domain_page(pl1e); } /* @@ -254,6 +256,7 @@ static pagetable_t page_table_convert(struct domain *d) pae_l3 = map_domain_page(pagetable_get_pfn(d->arch.phys_table)); for (i = 0; i < PDP_ENTRIES; i++) l3[i] = l3e_from_pfn(l3e_get_pfn(pae_l3[i]), __PAGE_HYPERVISOR); + unmap_domain_page(pae_l3); unmap_domain_page(l4); unmap_domain_page(l3); @@ -275,7 +278,7 @@ static void alloc_monitor_pagetable(struct vcpu *v) ASSERT( mmfn_info ); mmfn = page_to_pfn(mmfn_info); - mpl4e = (l4_pgentry_t *) map_domain_page(mmfn); + mpl4e = (l4_pgentry_t *) map_domain_page_global(mmfn); memcpy(mpl4e, &idle_pg_table[0], PAGE_SIZE); mpl4e[l4_table_offset(PERDOMAIN_VIRT_START)] = l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR); @@ -298,7 +301,7 @@ void free_monitor_pagetable(struct vcpu *v) * free monitor_table. 
*/ mfn = pagetable_get_pfn(v->arch.monitor_table); - unmap_domain_page(v->arch.monitor_vtable); + unmap_domain_page_global(v->arch.monitor_vtable); free_domheap_page(pfn_to_page(mfn)); v->arch.monitor_table = mk_pagetable(0); @@ -325,6 +328,7 @@ static void alloc_monitor_pagetable(struct vcpu *v) l2_pgentry_t *mpl2e; struct pfn_info *mmfn_info; struct domain *d = v->domain; + int i; ASSERT(pagetable_get_paddr(v->arch.monitor_table) == 0); @@ -332,16 +336,17 @@ static void alloc_monitor_pagetable(struct vcpu *v) ASSERT(mmfn_info != NULL); mmfn = page_to_pfn(mmfn_info); - mpl2e = (l2_pgentry_t *)map_domain_page(mmfn); + mpl2e = (l2_pgentry_t *)map_domain_page_global(mmfn); memset(mpl2e, 0, PAGE_SIZE); memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); - mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] = - l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), - __PAGE_HYPERVISOR); + for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) + mpl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] = + l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i, + __PAGE_HYPERVISOR); // map the phys_to_machine map into the Read-Only MPT space for this domain mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = @@ -393,7 +398,7 @@ void free_monitor_pagetable(struct vcpu *v) * Then free monitor_table. */ mfn = pagetable_get_pfn(v->arch.monitor_table); - unmap_domain_page(v->arch.monitor_vtable); + unmap_domain_page_global(v->arch.monitor_vtable); free_domheap_page(pfn_to_page(mfn)); v->arch.monitor_table = mk_pagetable(0); @@ -977,7 +982,7 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode) if ( v->arch.guest_vtable && (v->arch.guest_vtable != __linear_l2_table) ) { - unmap_domain_page(v->arch.guest_vtable); + unmap_domain_page_global(v->arch.guest_vtable); } if ( (mode & (SHM_translate | SHM_external)) == SHM_translate ) v->arch.guest_vtable = __linear_l2_table; @@ -990,7 +995,7 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode) if ( v->arch.shadow_vtable && (v->arch.shadow_vtable != __shadow_linear_l2_table) ) { - unmap_domain_page(v->arch.shadow_vtable); + unmap_domain_page_global(v->arch.shadow_vtable); } if ( !(mode & SHM_external) && d->arch.ops->guest_paging_levels == 2) v->arch.shadow_vtable = __shadow_linear_l2_table; @@ -1004,7 +1009,7 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode) if ( v->arch.hl2_vtable && (v->arch.hl2_vtable != __linear_hl2_table) ) { - unmap_domain_page(v->arch.hl2_vtable); + unmap_domain_page_global(v->arch.hl2_vtable); } if ( (mode & (SHM_translate | SHM_external)) == SHM_translate ) v->arch.hl2_vtable = __linear_hl2_table; diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c index 30ca4864b2..b3cc714bcd 100644 --- a/xen/arch/x86/smpboot.c +++ b/xen/arch/x86/smpboot.c @@ -435,7 +435,7 @@ void __init start_secondary(void *unused) extern void percpu_traps_init(void); - set_current(idle_task[cpu]); + set_current(idle_vcpu[cpu]); set_processor_id(cpu); percpu_traps_init(); @@ -761,7 +761,6 @@ static int __init do_boot_cpu(int apicid) * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu. 
*/ { - struct domain *idle; struct vcpu *v; unsigned long boot_error; int timeout, cpu; @@ -770,14 +769,10 @@ static int __init do_boot_cpu(int apicid) cpu = ++cpucount; - if ( (idle = do_createdomain(IDLE_DOMAIN_ID, cpu)) == NULL ) - panic("failed 'createdomain' for CPU %d", cpu); + v = idle_vcpu[cpu] = alloc_vcpu(idle_vcpu[0]->domain, cpu, cpu); + BUG_ON(v == NULL); - v = idle_task[cpu] = idle->vcpu[0]; - - set_bit(_DOMF_idle_domain, &idle->domain_flags); - - v->arch.monitor_table = mk_pagetable(__pa(idle_pg_table)); + v->arch.monitor_table = mk_pagetable(__pa(idle_pg_table)); /* start_eip had better be page-aligned! */ start_eip = setup_trampoline(); diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c index 7e7c40fca1..1bd15c6702 100644 --- a/xen/arch/x86/time.c +++ b/xen/arch/x86/time.c @@ -17,7 +17,7 @@ #include <xen/config.h> #include <xen/init.h> #include <xen/time.h> -#include <xen/ac_timer.h> +#include <xen/timer.h> #include <xen/smp.h> #include <xen/irq.h> #include <xen/softirq.h> @@ -56,7 +56,7 @@ struct cpu_time { s_time_t stime_local_stamp; s_time_t stime_master_stamp; struct time_scale tsc_scale; - struct ac_timer calibration_timer; + struct timer calibration_timer; } __cacheline_aligned; static struct cpu_time cpu_time[NR_CPUS]; @@ -163,7 +163,7 @@ void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs) /* Rough hack to allow accurate timers to sort-of-work with no APIC. */ if ( !cpu_has_apic ) - raise_softirq(AC_TIMER_SOFTIRQ); + raise_softirq(TIMER_SOFTIRQ); if ( using_pit ) pit_overflow(); @@ -342,7 +342,7 @@ static void init_pit(void) /* Protected by platform_timer_lock. */ static u64 hpet_counter64, hpet_overflow_period; static u32 hpet_stamp; -static struct ac_timer hpet_overflow_timer; +static struct timer hpet_overflow_timer; static void hpet_overflow(void *unused) { @@ -354,7 +354,7 @@ static void hpet_overflow(void *unused) hpet_stamp = counter; spin_unlock_irq(&platform_timer_lock); - set_ac_timer(&hpet_overflow_timer, NOW() + hpet_overflow_period); + set_timer(&hpet_overflow_timer, NOW() + hpet_overflow_period); } static u64 read_hpet_count(void) @@ -430,7 +430,7 @@ static int init_hpet(void) (void)do_div(hpet_overflow_period, (u32)hpet_rate); } - init_ac_timer(&hpet_overflow_timer, hpet_overflow, NULL, 0); + init_timer(&hpet_overflow_timer, hpet_overflow, NULL, 0); hpet_overflow(NULL); platform_timer_stamp = hpet_counter64; @@ -459,7 +459,7 @@ int use_cyclone; /* Protected by platform_timer_lock. 
*/ static u64 cyclone_counter64; static u32 cyclone_stamp; -static struct ac_timer cyclone_overflow_timer; +static struct timer cyclone_overflow_timer; static volatile u32 *cyclone_timer; /* Cyclone MPMC0 register */ static void cyclone_overflow(void *unused) @@ -472,7 +472,7 @@ static void cyclone_overflow(void *unused) cyclone_stamp = counter; spin_unlock_irq(&platform_timer_lock); - set_ac_timer(&cyclone_overflow_timer, NOW() + MILLISECS(20000)); + set_timer(&cyclone_overflow_timer, NOW() + MILLISECS(20000)); } static u64 read_cyclone_count(void) @@ -510,7 +510,7 @@ static int init_cyclone(void) read_platform_count = read_cyclone_count; - init_ac_timer(&cyclone_overflow_timer, cyclone_overflow, NULL, 0); + init_timer(&cyclone_overflow_timer, cyclone_overflow, NULL, 0); cyclone_overflow(NULL); platform_timer_stamp = cyclone_counter64; set_time_scale(&platform_timer_scale, CYCLONE_TIMER_FREQ); @@ -876,7 +876,7 @@ static void local_time_calibration(void *unused) cpu_time[cpu].stime_master_stamp = curr_master_stime; out: - set_ac_timer(&cpu_time[cpu].calibration_timer, NOW() + EPOCH); + set_timer(&cpu_time[cpu].calibration_timer, NOW() + EPOCH); if ( cpu == 0 ) platform_time_calibration(); @@ -896,9 +896,9 @@ void init_percpu_time(void) cpu_time[cpu].stime_master_stamp = now; cpu_time[cpu].stime_local_stamp = now; - init_ac_timer(&cpu_time[cpu].calibration_timer, + init_timer(&cpu_time[cpu].calibration_timer, local_time_calibration, NULL, cpu); - set_ac_timer(&cpu_time[cpu].calibration_timer, NOW() + EPOCH); + set_timer(&cpu_time[cpu].calibration_timer, NOW() + EPOCH); } /* Late init function (after all CPUs are booted). */ diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index a4be3db3b3..0a7280fb70 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -130,9 +130,19 @@ unsigned long kernel_text_end(void) static void show_guest_stack(struct cpu_user_regs *regs) { int i; - unsigned long *stack = (unsigned long *)regs->esp, addr; + unsigned long *stack, addr; - printk("Guest stack trace from "__OP"sp=%p:\n ", stack); + if ( VM86_MODE(regs) ) + { + stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff)); + printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ", + regs->ss, (uint16_t)(regs->esp & 0xffff)); + } + else + { + stack = (unsigned long *)regs->esp; + printk("Guest stack trace from "__OP"sp=%p:\n ", stack); + } for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ ) { @@ -427,7 +437,7 @@ void propagate_page_fault(unsigned long addr, u16 error_code) tb->flags |= TBF_INTERRUPT; } -static int handle_perdomain_mapping_fault( +static int handle_gdt_ldt_mapping_fault( unsigned long offset, struct cpu_user_regs *regs) { extern int map_ldt_shadow_page(unsigned int); @@ -437,14 +447,14 @@ static int handle_perdomain_mapping_fault( int ret; /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */ - unsigned int is_ldt_area = (offset >> (PDPT_VCPU_VA_SHIFT-1)) & 1; - unsigned int vcpu_area = (offset >> PDPT_VCPU_VA_SHIFT); + unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1; + unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT); /* Should never fault in another vcpu's area. */ BUG_ON(vcpu_area != current->vcpu_id); /* Byte offset within the gdt/ldt sub-area. 
*/ - offset &= (1UL << (PDPT_VCPU_VA_SHIFT-1)) - 1UL; + offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL; if ( likely(is_ldt_area) ) { @@ -490,9 +500,9 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs) { if ( shadow_mode_external(d) && GUEST_CONTEXT(v, regs) ) return shadow_fault(addr, regs); - if ( (addr >= PERDOMAIN_VIRT_START) && (addr < PERDOMAIN_VIRT_END) ) - return handle_perdomain_mapping_fault( - addr - PERDOMAIN_VIRT_START, regs); + if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) ) + return handle_gdt_ldt_mapping_fault( + addr - GDT_LDT_VIRT_START, regs); } else if ( unlikely(shadow_mode_enabled(d)) ) { @@ -596,7 +606,6 @@ static inline int guest_io_okay( u16 x; #if defined(__x86_64__) /* If in user mode, switch to kernel mode just to read I/O bitmap. */ - extern void toggle_guest_mode(struct vcpu *); int user_mode = !(v->arch.flags & TF_kernel_mode); #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v) #elif defined(__i386__) @@ -964,16 +973,26 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) case 0x30: /* WRMSR */ /* Ignore the instruction if unprivileged. */ if ( !IS_PRIV(v->domain) ) - DPRINTK("Non-priv domain attempted WRMSR(%p,%08lx,%08lx).\n", - _p(regs->ecx), (long)regs->eax, (long)regs->edx); + { + u32 l, h; + if ( (rdmsr_user(regs->ecx, l, h) != 0) || + (regs->ecx != MSR_EFER) || + (regs->eax != l) || (regs->edx != h) ) + DPRINTK("Non-priv domain attempted WRMSR %p from " + "%08x:%08x to %08lx:%08lx.\n", + _p(regs->ecx), h, l, (long)regs->edx, (long)regs->eax); + } else if ( wrmsr_user(regs->ecx, regs->eax, regs->edx) ) goto fail; break; case 0x32: /* RDMSR */ if ( !IS_PRIV(v->domain) ) - DPRINTK("Non-priv domain attempted RDMSR(%p,%08lx,%08lx).\n", - _p(regs->ecx), (long)regs->eax, (long)regs->edx); + { + if ( regs->ecx != MSR_EFER ) + DPRINTK("Non-priv domain attempted RDMSR %p.\n", + _p(regs->ecx)); + } /* Everyone can read the MSR space. */ if ( rdmsr_user(regs->ecx, regs->eax, regs->edx) ) goto fail; @@ -1080,26 +1099,23 @@ asmlinkage int do_general_protection(struct cpu_user_regs *regs) return 0; } +static void nmi_softirq(void) +{ + /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */ + evtchn_notify(dom0->vcpu[0]); +} -/* Defer dom0 notification to softirq context (unsafe in NMI context). 
*/ -static unsigned long nmi_dom0_softirq_reason; -#define NMI_DOM0_PARITY_ERR 0 -#define NMI_DOM0_IO_ERR 1 -#define NMI_DOM0_UNKNOWN 2 - -static void nmi_dom0_softirq(void) +static void nmi_dom0_report(unsigned int reason_idx) { - if ( dom0 == NULL ) - return; + struct domain *d; - if ( test_and_clear_bit(NMI_DOM0_PARITY_ERR, &nmi_dom0_softirq_reason) ) - send_guest_virq(dom0->vcpu[0], VIRQ_PARITY_ERR); + if ( (d = dom0) == NULL ) + return; - if ( test_and_clear_bit(NMI_DOM0_IO_ERR, &nmi_dom0_softirq_reason) ) - send_guest_virq(dom0->vcpu[0], VIRQ_IO_ERR); + set_bit(reason_idx, &d->shared_info->arch.nmi_reason); - if ( test_and_clear_bit(NMI_DOM0_UNKNOWN, &nmi_dom0_softirq_reason) ) - send_guest_virq(dom0->vcpu[0], VIRQ_NMI); + if ( test_and_set_bit(_VCPUF_nmi_pending, &d->vcpu[0]->vcpu_flags) ) + raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */ } asmlinkage void mem_parity_error(struct cpu_user_regs *regs) @@ -1107,8 +1123,7 @@ asmlinkage void mem_parity_error(struct cpu_user_regs *regs) switch ( opt_nmi[0] ) { case 'd': /* 'dom0' */ - set_bit(NMI_DOM0_PARITY_ERR, &nmi_dom0_softirq_reason); - raise_softirq(NMI_DOM0_SOFTIRQ); + nmi_dom0_report(_XEN_NMIREASON_parity_error); case 'i': /* 'ignore' */ break; default: /* 'fatal' */ @@ -1127,8 +1142,7 @@ asmlinkage void io_check_error(struct cpu_user_regs *regs) switch ( opt_nmi[0] ) { case 'd': /* 'dom0' */ - set_bit(NMI_DOM0_IO_ERR, &nmi_dom0_softirq_reason); - raise_softirq(NMI_DOM0_SOFTIRQ); + nmi_dom0_report(_XEN_NMIREASON_io_error); case 'i': /* 'ignore' */ break; default: /* 'fatal' */ @@ -1147,8 +1161,7 @@ static void unknown_nmi_error(unsigned char reason) switch ( opt_nmi[0] ) { case 'd': /* 'dom0' */ - set_bit(NMI_DOM0_UNKNOWN, &nmi_dom0_softirq_reason); - raise_softirq(NMI_DOM0_SOFTIRQ); + nmi_dom0_report(_XEN_NMIREASON_unknown); case 'i': /* 'ignore' */ break; default: /* 'fatal' */ @@ -1347,7 +1360,7 @@ void __init trap_init(void) cpu_init(); - open_softirq(NMI_DOM0_SOFTIRQ, nmi_dom0_softirq); + open_softirq(NMI_SOFTIRQ, nmi_softirq); } diff --git a/xen/arch/x86/vmx.c b/xen/arch/x86/vmx.c index 3cb18be4c2..6d6fa51764 100644 --- a/xen/arch/x86/vmx.c +++ b/xen/arch/x86/vmx.c @@ -42,7 +42,7 @@ #include <asm/shadow_64.h> #endif #include <public/sched.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #include <asm/vmx_vpic.h> #include <asm/vmx_vlapic.h> @@ -53,7 +53,7 @@ unsigned int opt_vmx_debug_level = 0; integer_param("vmx_debug", opt_vmx_debug_level); static unsigned long trace_values[NR_CPUS][4]; -#define TRACE_VMEXIT(index,value) trace_values[current->processor][index]=value +#define TRACE_VMEXIT(index,value) trace_values[smp_processor_id()][index]=value static int vmx_switch_on; @@ -66,11 +66,6 @@ void vmx_final_setup_guest(struct vcpu *v) struct domain *d = v->domain; struct vcpu *vc; - d->arch.vmx_platform.lapic_enable = v->arch.guest_context.user_regs.ecx; - v->arch.guest_context.user_regs.ecx = 0; - VMX_DBG_LOG(DBG_LEVEL_VLAPIC, "lapic enable is %d.\n", - d->arch.vmx_platform.lapic_enable); - /* Initialize monitor page table */ for_each_vcpu(d, vc) vc->arch.monitor_table = mk_pagetable(0); @@ -95,7 +90,7 @@ void vmx_final_setup_guest(struct vcpu *v) void vmx_relinquish_resources(struct vcpu *v) { struct vmx_virpit *vpit; - + if ( !VMX_DOMAIN(v) ) return; @@ -103,19 +98,18 @@ void vmx_relinquish_resources(struct vcpu *v) /* unmap IO shared page */ struct domain *d = v->domain; if ( d->arch.vmx_platform.shared_page_va ) - unmap_domain_page((void *)d->arch.vmx_platform.shared_page_va); + 
unmap_domain_page_global( + (void *)d->arch.vmx_platform.shared_page_va); } destroy_vmcs(&v->arch.arch_vmx); free_monitor_pagetable(v); vpit = &v->domain->arch.vmx_platform.vmx_pit; - if ( active_ac_timer(&(vpit->pit_timer)) ) - rem_ac_timer(&vpit->pit_timer); - if ( active_ac_timer(&v->arch.arch_vmx.hlt_timer) ) - rem_ac_timer(&v->arch.arch_vmx.hlt_timer); + kill_timer(&vpit->pit_timer); + kill_timer(&v->arch.arch_vmx.hlt_timer); if ( vmx_apic_support(v->domain) && (VLAPIC(v) != NULL) ) { - rem_ac_timer(&VLAPIC(v)->vlapic_timer); + kill_timer(&VLAPIC(v)->vlapic_timer); xfree(VLAPIC(v)); } } @@ -1604,7 +1598,7 @@ void vmx_vmexit_do_hlt(void) next_wakeup = next_pit; } if ( next_wakeup != - 1 ) - set_ac_timer(¤t->arch.arch_vmx.hlt_timer, next_wakeup); + set_timer(¤t->arch.arch_vmx.hlt_timer, next_wakeup); do_block(); } @@ -1955,9 +1949,12 @@ asmlinkage void load_cr2(void) asmlinkage void trace_vmentry (void) { - TRACE_5D(TRC_VMENTRY,trace_values[current->processor][0], - trace_values[current->processor][1],trace_values[current->processor][2], - trace_values[current->processor][3],trace_values[current->processor][4]); + TRACE_5D(TRC_VMENTRY, + trace_values[smp_processor_id()][0], + trace_values[smp_processor_id()][1], + trace_values[smp_processor_id()][2], + trace_values[smp_processor_id()][3], + trace_values[smp_processor_id()][4]); TRACE_VMEXIT(0,9); TRACE_VMEXIT(1,9); TRACE_VMEXIT(2,9); diff --git a/xen/arch/x86/vmx_intercept.c b/xen/arch/x86/vmx_intercept.c index 8bac8a8e5c..419960842c 100644 --- a/xen/arch/x86/vmx_intercept.c +++ b/xen/arch/x86/vmx_intercept.c @@ -24,7 +24,7 @@ #include <asm/vmx_vpit.h> #include <asm/vmx_intercept.h> #include <asm/vmx_vlapic.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #include <xen/lib.h> #include <xen/sched.h> #include <asm/current.h> @@ -356,19 +356,19 @@ static void pit_timer_fn(void *data) vpit->pending_intr_nr++; if ( test_bit(_VCPUF_running, &v->vcpu_flags) ) { vpit->scheduled += vpit->period; - set_ac_timer(&vpit->pit_timer, vpit->scheduled); + set_timer(&vpit->pit_timer, vpit->scheduled); } } void pickup_deactive_ticks(struct vmx_virpit *vpit) { - if ( !active_ac_timer(&(vpit->pit_timer)) ) { + if ( !active_timer(&(vpit->pit_timer)) ) { /* pick up missed timer tick */ missed_ticks(vpit); vpit->scheduled += vpit->period; - set_ac_timer(&vpit->pit_timer, vpit->scheduled); + set_timer(&vpit->pit_timer, vpit->scheduled); } } @@ -385,14 +385,14 @@ void vmx_hooks_assist(struct vcpu *v) /* load init count*/ if (p->state == STATE_IORESP_HOOK) { /* set up actimer, handle re-init */ - if ( active_ac_timer(&(vpit->pit_timer)) ) { + if ( active_timer(&(vpit->pit_timer)) ) { VMX_DBG_LOG(DBG_LEVEL_1, "VMX_PIT: guest reset PIT with channel %lx!\n", (unsigned long) ((p->u.data >> 24) & 0x3) ); - rem_ac_timer(&(vpit->pit_timer)); + stop_timer(&(vpit->pit_timer)); reinit = 1; } else { - init_ac_timer(&vpit->pit_timer, pit_timer_fn, v, v->processor); + init_timer(&vpit->pit_timer, pit_timer_fn, v, v->processor); } /* init count for this channel */ @@ -431,7 +431,7 @@ void vmx_hooks_assist(struct vcpu *v) } vpit->scheduled = NOW() + vpit->period; - set_ac_timer(&vpit->pit_timer, vpit->scheduled); + set_timer(&vpit->pit_timer, vpit->scheduled); /*restore the state*/ p->state = STATE_IORESP_READY; diff --git a/xen/arch/x86/vmx_io.c b/xen/arch/x86/vmx_io.c index b7689228bf..c979a8d741 100644 --- a/xen/arch/x86/vmx_io.c +++ b/xen/arch/x86/vmx_io.c @@ -37,7 +37,7 @@ #include <asm/shadow.h> #include <asm/vmx_vpic.h> #include <asm/vmx_vlapic.h> 
-#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #ifdef CONFIG_VMX #if defined (__i386__) @@ -819,7 +819,7 @@ interrupt_post_injection(struct vcpu * v, int vector, int type) if ( !vpit->first_injected ) { vpit->pending_intr_nr = 0; vpit->scheduled = NOW() + vpit->period; - set_ac_timer(&vpit->pit_timer, vpit->scheduled); + set_timer(&vpit->pit_timer, vpit->scheduled); vpit->first_injected = 1; } else { vpit->pending_intr_nr--; diff --git a/xen/arch/x86/vmx_platform.c b/xen/arch/x86/vmx_platform.c index 2ee14c65ec..45d1e0052b 100644 --- a/xen/arch/x86/vmx_platform.c +++ b/xen/arch/x86/vmx_platform.c @@ -27,7 +27,7 @@ #include <xen/trace.h> #include <asm/vmx.h> #include <asm/vmx_platform.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #include <xen/lib.h> #include <xen/sched.h> diff --git a/xen/arch/x86/vmx_vlapic.c b/xen/arch/x86/vmx_vlapic.c index fa1dc2118d..d487f9739e 100644 --- a/xen/arch/x86/vmx_vlapic.c +++ b/xen/arch/x86/vmx_vlapic.c @@ -32,7 +32,7 @@ #include <xen/lib.h> #include <xen/sched.h> #include <asm/current.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #ifdef CONFIG_VMX @@ -62,7 +62,7 @@ int vlapic_find_highest_irr(struct vlapic *vlapic) int vmx_apic_support(struct domain *d) { - return d->arch.vmx_platform.lapic_enable; + return d->arch.vmx_platform.apic_enabled; } s_time_t get_apictime_scheduled(struct vcpu *v) @@ -391,7 +391,7 @@ static void vlapic_begin_timer(struct vlapic *vlapic) (262144 / get_apic_bus_scale()) * vlapic->timer_divide_counter; vlapic->vlapic_timer.expires = cur + offset; - set_ac_timer(&(vlapic->vlapic_timer), vlapic->vlapic_timer.expires ); + set_timer(&(vlapic->vlapic_timer), vlapic->vlapic_timer.expires ); VMX_DBG_LOG(DBG_LEVEL_VLAPIC, "vlapic_begin_timer: " "bus_scale %x now %08x%08x expire %08x%08x " @@ -739,7 +739,7 @@ static void vlapic_write(struct vcpu *v, unsigned long address, case APIC_TMICT: if (vlapic_timer_active(vlapic)) - rem_ac_timer(&(vlapic->vlapic_timer)); + stop_timer(&(vlapic->vlapic_timer)); vlapic->timer_initial = val; vlapic->timer_current = val; @@ -846,7 +846,7 @@ void vlapic_timer_fn(void *data) vlapic->timer_current = vlapic->timer_initial; offset = vlapic->timer_current * (262144/get_apic_bus_scale()) * vlapic->timer_divide_counter; vlapic->vlapic_timer.expires = NOW() + offset; - set_ac_timer(&(vlapic->vlapic_timer), vlapic->vlapic_timer.expires); + set_timer(&(vlapic->vlapic_timer), vlapic->vlapic_timer.expires); }else { vlapic->timer_current = 0; } @@ -986,7 +986,7 @@ static int vlapic_reset(struct vlapic *vlapic) vmx_vioapic_add_lapic(vlapic, v); - init_ac_timer(&vlapic->vlapic_timer, + init_timer(&vlapic->vlapic_timer, vlapic_timer_fn, vlapic, v->processor); #ifdef VLAPIC_NO_BIOS diff --git a/xen/arch/x86/vmx_vmcs.c b/xen/arch/x86/vmx_vmcs.c index 17eb2caad3..9b7c9d41d2 100644 --- a/xen/arch/x86/vmx_vmcs.c +++ b/xen/arch/x86/vmx_vmcs.c @@ -32,7 +32,7 @@ #include <asm/flushtlb.h> #include <xen/event.h> #include <xen/kernel.h> -#include <public/io/ioreq.h> +#include <public/hvm/hvm_info_table.h> #if CONFIG_PAGING_LEVELS >= 4 #include <asm/shadow_64.h> #endif @@ -193,7 +193,7 @@ static void vmx_map_io_shared_page(struct domain *d) domain_crash_synchronous(); } - p = map_domain_page(mpfn); + p = map_domain_page_global(mpfn); if (p == NULL) { printk("Can not map io request shared page for VMX domain.\n"); domain_crash_synchronous(); @@ -206,35 +206,55 @@ static void vmx_map_io_shared_page(struct domain *d) &d->shared_info->evtchn_mask[0]); } -#define VCPU_NR_PAGE 0x0009F000 
-#define VCPU_NR_OFFSET 0x00000800 -#define VCPU_MAGIC 0x76637075 /* "vcpu" */ +static int validate_hvm_info(struct hvm_info_table *t) +{ + char signature[] = "HVM INFO"; + uint8_t *ptr = (uint8_t *)t; + uint8_t sum = 0; + int i; + + /* strncmp(t->signature, "HVM INFO", 8) */ + for ( i = 0; i < 8; i++ ) { + if ( signature[i] != t->signature[i] ) { + printk("Bad hvm info signature\n"); + return 0; + } + } + + for ( i = 0; i < t->length; i++ ) + sum += ptr[i]; -static void vmx_set_vcpu_nr(struct domain *d) + return (sum == 0); +} + +static void vmx_get_hvm_info(struct domain *d) { unsigned char *p; unsigned long mpfn; - unsigned int *vcpus; + struct hvm_info_table *t; - mpfn = get_mfn_from_pfn(VCPU_NR_PAGE >> PAGE_SHIFT); - if (mpfn == INVALID_MFN) { - printk("Can not get vcpu number page mfn for VMX domain.\n"); + mpfn = get_mfn_from_pfn(HVM_INFO_PFN); + if ( mpfn == INVALID_MFN ) { + printk("Can not get hvm info page mfn for VMX domain.\n"); domain_crash_synchronous(); } p = map_domain_page(mpfn); - if (p == NULL) { - printk("Can not map vcpu number page for VMX domain.\n"); + if ( p == NULL ) { + printk("Can not map hvm info page for VMX domain.\n"); domain_crash_synchronous(); } - vcpus = (unsigned int *)(p + VCPU_NR_OFFSET); - if (vcpus[0] != VCPU_MAGIC) { - printk("Bad vcpus magic, set vcpu number to 1 by default.\n"); - d->arch.vmx_platform.nr_vcpu = 1; - } + t = (struct hvm_info_table *)(p + HVM_INFO_OFFSET); - d->arch.vmx_platform.nr_vcpu = vcpus[1]; + if ( validate_hvm_info(t) ) { + d->arch.vmx_platform.nr_vcpus = t->nr_vcpus; + d->arch.vmx_platform.apic_enabled = t->apic_enabled; + } else { + printk("Bad hvm info table\n"); + d->arch.vmx_platform.nr_vcpus = 1; + d->arch.vmx_platform.apic_enabled = 0; + } unmap_domain_page(p); } @@ -244,10 +264,10 @@ static void vmx_setup_platform(struct domain* d) struct vmx_platform *platform; vmx_map_io_shared_page(d); - vmx_set_vcpu_nr(d); + vmx_get_hvm_info(d); platform = &d->arch.vmx_platform; - pic_init(&platform->vmx_pic, pic_irq_request, + pic_init(&platform->vmx_pic, pic_irq_request, &platform->interrupt_request); register_pic_io_hook(); @@ -321,7 +341,7 @@ static void vmx_do_launch(struct vcpu *v) vlapic_init(v); vmx_set_host_env(v); - init_ac_timer(&v->arch.arch_vmx.hlt_timer, hlt_timer_fn, v, v->processor); + init_timer(&v->arch.arch_vmx.hlt_timer, hlt_timer_fn, v, v->processor); error |= __vmwrite(GUEST_LDTR_SELECTOR, 0); error |= __vmwrite(GUEST_LDTR_BASE, 0); @@ -335,6 +355,8 @@ static void vmx_do_launch(struct vcpu *v) __vmwrite(HOST_RSP, (unsigned long)get_stack_bottom()); v->arch.schedule_tail = arch_vmx_do_resume; + v->arch.arch_vmx.launch_cpu = smp_processor_id(); + /* init guest tsc to start from 0 */ rdtscll(host_tsc); v->arch.arch_vmx.tsc_offset = 0 - host_tsc; @@ -617,11 +639,21 @@ void vm_resume_fail(unsigned long eflags) void arch_vmx_do_resume(struct vcpu *v) { - u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs); - - load_vmcs(&v->arch.arch_vmx, vmcs_phys_ptr); - vmx_do_resume(v); - reset_stack_and_jump(vmx_asm_do_resume); + if ( v->arch.arch_vmx.launch_cpu == smp_processor_id() ) + { + load_vmcs(&v->arch.arch_vmx, virt_to_phys(v->arch.arch_vmx.vmcs)); + vmx_do_resume(v); + reset_stack_and_jump(vmx_asm_do_resume); + } + else + { + __vmpclear(virt_to_phys(v->arch.arch_vmx.vmcs)); + load_vmcs(&v->arch.arch_vmx, virt_to_phys(v->arch.arch_vmx.vmcs)); + vmx_do_resume(v); + vmx_set_host_env(v); + v->arch.arch_vmx.launch_cpu = smp_processor_id(); + reset_stack_and_jump(vmx_asm_do_relaunch); + } } void 
arch_vmx_do_launch(struct vcpu *v) @@ -643,18 +675,6 @@ void arch_vmx_do_launch(struct vcpu *v) reset_stack_and_jump(vmx_asm_do_launch); } -void arch_vmx_do_relaunch(struct vcpu *v) -{ - u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs); - - load_vmcs(&v->arch.arch_vmx, vmcs_phys_ptr); - vmx_do_resume(v); - vmx_set_host_env(v); - v->arch.schedule_tail = arch_vmx_do_resume; - - reset_stack_and_jump(vmx_asm_do_relaunch); -} - #endif /* CONFIG_VMX */ /* diff --git a/xen/arch/x86/x86_32/asm-offsets.c b/xen/arch/x86/x86_32/asm-offsets.c index 3a5c3ef9f8..42bef57240 100644 --- a/xen/arch/x86/x86_32/asm-offsets.c +++ b/xen/arch/x86/x86_32/asm-offsets.c @@ -65,6 +65,10 @@ void __dummy__(void) arch.guest_context.kernel_ss); OFFSET(VCPU_kernel_sp, struct vcpu, arch.guest_context.kernel_sp); + OFFSET(VCPU_flags, struct vcpu, vcpu_flags); + OFFSET(VCPU_nmi_addr, struct vcpu, nmi_addr); + DEFINE(_VCPUF_nmi_pending, _VCPUF_nmi_pending); + DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked); BLANK(); OFFSET(VCPUINFO_upcall_pending, vcpu_info_t, evtchn_upcall_pending); diff --git a/xen/arch/x86/x86_32/domain_page.c b/xen/arch/x86/x86_32/domain_page.c index f7c194b775..222e813693 100644 --- a/xen/arch/x86/x86_32/domain_page.c +++ b/xen/arch/x86/x86_32/domain_page.c @@ -1,14 +1,9 @@ /****************************************************************************** * domain_page.h * - * Allow temporary mapping of domain pages. Based on ideas from the - * Linux PKMAP code -- the copyrights and credits are retained below. - */ - -/* - * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de - * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de * - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + * Allow temporary mapping of domain pages. + * + * Copyright (c) 2003-2006, Keir Fraser <keir@xensource.com> */ #include <xen/config.h> @@ -20,80 +15,203 @@ #include <asm/flushtlb.h> #include <asm/hardirq.h> -#define MAPCACHE_ORDER 10 -#define MAPCACHE_ENTRIES (1 << MAPCACHE_ORDER) - -l1_pgentry_t *mapcache; -static unsigned int map_idx, epoch, shadow_epoch[NR_CPUS]; -static spinlock_t map_lock = SPIN_LOCK_UNLOCKED; - -/* Use a spare PTE bit to mark entries ready for recycling. */ -#define READY_FOR_TLB_FLUSH (1<<10) - -static void flush_all_ready_maps(void) -{ - l1_pgentry_t *cache = mapcache; - unsigned int i; - - for ( i = 0; i < MAPCACHE_ENTRIES; i++ ) - if ( (l1e_get_flags(cache[i]) & READY_FOR_TLB_FLUSH) ) - cache[i] = l1e_empty(); -} - -void *map_domain_pages(unsigned long pfn, unsigned int order) +void *map_domain_page(unsigned long pfn) { unsigned long va; - unsigned int idx, i, flags, cpu = smp_processor_id(); - l1_pgentry_t *cache = mapcache; -#ifndef NDEBUG - unsigned int flush_count = 0; -#endif + unsigned int idx, i, vcpu = current->vcpu_id; + struct domain *d; + struct mapcache *cache; + struct vcpu_maphash_entry *hashent; ASSERT(!in_irq()); + perfc_incrc(map_domain_page_count); - spin_lock(&map_lock); + /* If we are the idle domain, ensure that we run on our own page tables. */ + d = current->domain; + if ( unlikely(is_idle_domain(d)) ) + __sync_lazy_execstate(); - /* Has some other CPU caused a wrap? We must flush if so. 
*/ - if ( epoch != shadow_epoch[cpu] ) + cache = &d->arch.mapcache; + + hashent = &cache->vcpu_maphash[vcpu].hash[MAPHASH_HASHFN(pfn)]; + if ( hashent->pfn == pfn ) { - perfc_incrc(domain_page_tlb_flush); - local_flush_tlb(); - shadow_epoch[cpu] = epoch; + idx = hashent->idx; + hashent->refcnt++; + ASSERT(hashent->refcnt != 0); + ASSERT(l1e_get_pfn(cache->l1tab[idx]) == pfn); + goto out; } - do { - idx = map_idx = (map_idx + 1) & (MAPCACHE_ENTRIES - 1); - if ( unlikely(idx == 0) ) + spin_lock(&cache->lock); + + /* Has some other CPU caused a wrap? We must flush if so. */ + if ( unlikely(cache->epoch != cache->shadow_epoch[vcpu]) ) + { + cache->shadow_epoch[vcpu] = cache->epoch; + if ( NEED_FLUSH(tlbflush_time[smp_processor_id()], + cache->tlbflush_timestamp) ) { - ASSERT(flush_count++ == 0); - flush_all_ready_maps(); perfc_incrc(domain_page_tlb_flush); local_flush_tlb(); - shadow_epoch[cpu] = ++epoch; + } + } + + idx = find_next_zero_bit(cache->inuse, MAPCACHE_ENTRIES, cache->cursor); + if ( unlikely(idx >= MAPCACHE_ENTRIES) ) + { + /* /First/, clean the garbage map and update the inuse list. */ + for ( i = 0; i < ARRAY_SIZE(cache->garbage); i++ ) + { + unsigned long x = xchg(&cache->garbage[i], 0); + cache->inuse[i] &= ~x; } - flags = 0; - for ( i = 0; i < (1U << order); i++ ) - flags |= l1e_get_flags(cache[idx+i]); + /* /Second/, flush TLBs. */ + perfc_incrc(domain_page_tlb_flush); + local_flush_tlb(); + cache->shadow_epoch[vcpu] = ++cache->epoch; + cache->tlbflush_timestamp = tlbflush_current_time(); + + idx = find_first_zero_bit(cache->inuse, MAPCACHE_ENTRIES); + ASSERT(idx < MAPCACHE_ENTRIES); } - while ( flags & _PAGE_PRESENT ); - for ( i = 0; i < (1U << order); i++ ) - cache[idx+i] = l1e_from_pfn(pfn+i, __PAGE_HYPERVISOR); + set_bit(idx, cache->inuse); + cache->cursor = idx + 1; + + spin_unlock(&cache->lock); - spin_unlock(&map_lock); + cache->l1tab[idx] = l1e_from_pfn(pfn, __PAGE_HYPERVISOR); + out: va = MAPCACHE_VIRT_START + (idx << PAGE_SHIFT); return (void *)va; } -void unmap_domain_pages(void *va, unsigned int order) +void unmap_domain_page(void *va) { - unsigned int idx, i; + unsigned int idx; + struct mapcache *cache = &current->domain->arch.mapcache; + unsigned long pfn; + struct vcpu_maphash_entry *hashent; + + ASSERT(!in_irq()); + ASSERT((void *)MAPCACHE_VIRT_START <= va); ASSERT(va < (void *)MAPCACHE_VIRT_END); + idx = ((unsigned long)va - MAPCACHE_VIRT_START) >> PAGE_SHIFT; - for ( i = 0; i < (1U << order); i++ ) - l1e_add_flags(mapcache[idx+i], READY_FOR_TLB_FLUSH); + pfn = l1e_get_pfn(cache->l1tab[idx]); + hashent = &cache->vcpu_maphash[current->vcpu_id].hash[MAPHASH_HASHFN(pfn)]; + + if ( hashent->idx == idx ) + { + ASSERT(hashent->pfn == pfn); + ASSERT(hashent->refcnt != 0); + hashent->refcnt--; + } + else if ( hashent->refcnt == 0 ) + { + if ( hashent->idx != MAPHASHENT_NOTINUSE ) + { + /* /First/, zap the PTE. */ + ASSERT(l1e_get_pfn(cache->l1tab[hashent->idx]) == hashent->pfn); + cache->l1tab[hashent->idx] = l1e_empty(); + /* /Second/, mark as garbage. */ + set_bit(hashent->idx, cache->garbage); + } + + /* Add newly-freed mapping to the maphash. */ + hashent->pfn = pfn; + hashent->idx = idx; + } + else + { + /* /First/, zap the PTE. */ + cache->l1tab[idx] = l1e_empty(); + /* /Second/, mark as garbage.
*/ + set_bit(idx, cache->garbage); + } +} + +void mapcache_init(struct domain *d) +{ + unsigned int i, j; + + d->arch.mapcache.l1tab = d->arch.mm_perdomain_pt + + (GDT_LDT_MBYTES << (20 - PAGE_SHIFT)); + spin_lock_init(&d->arch.mapcache.lock); + + /* Mark all maphash entries as not in use. */ + for ( i = 0; i < MAX_VIRT_CPUS; i++ ) + for ( j = 0; j < MAPHASH_ENTRIES; j++ ) + d->arch.mapcache.vcpu_maphash[i].hash[j].idx = + MAPHASHENT_NOTINUSE; +} + +#define GLOBALMAP_BITS (IOREMAP_MBYTES << (20 - PAGE_SHIFT)) +static unsigned long inuse[BITS_TO_LONGS(GLOBALMAP_BITS)]; +static unsigned long garbage[BITS_TO_LONGS(GLOBALMAP_BITS)]; +static unsigned int inuse_cursor; +static spinlock_t globalmap_lock = SPIN_LOCK_UNLOCKED; + +void *map_domain_page_global(unsigned long pfn) +{ + l2_pgentry_t *pl2e; + l1_pgentry_t *pl1e; + unsigned int idx, i; + unsigned long va; + + ASSERT(!in_irq() && local_irq_is_enabled()); + + spin_lock(&globalmap_lock); + + idx = find_next_zero_bit(inuse, GLOBALMAP_BITS, inuse_cursor); + va = IOREMAP_VIRT_START + (idx << PAGE_SHIFT); + if ( unlikely(va >= FIXADDR_START) ) + { + /* /First/, clean the garbage map and update the inuse list. */ + for ( i = 0; i < ARRAY_SIZE(garbage); i++ ) + { + unsigned long x = xchg(&garbage[i], 0); + inuse[i] &= ~x; + } + + /* /Second/, flush all TLBs to get rid of stale garbage mappings. */ + flush_tlb_all(); + + idx = find_first_zero_bit(inuse, GLOBALMAP_BITS); + va = IOREMAP_VIRT_START + (idx << PAGE_SHIFT); + ASSERT(va < FIXADDR_START); + } + + set_bit(idx, inuse); + inuse_cursor = idx + 1; + + spin_unlock(&globalmap_lock); + + pl2e = virt_to_xen_l2e(va); + pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(va); + *pl1e = l1e_from_pfn(pfn, __PAGE_HYPERVISOR); + + return (void *)va; +} + +void unmap_domain_page_global(void *va) +{ + unsigned long __va = (unsigned long)va; + l2_pgentry_t *pl2e; + l1_pgentry_t *pl1e; + unsigned int idx; + + /* /First/, we zap the PTE. */ + pl2e = virt_to_xen_l2e(__va); + pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(__va); + *pl1e = l1e_empty(); + + /* /Second/, we add to the garbage map. 
*/ + idx = (__va - IOREMAP_VIRT_START) >> PAGE_SHIFT; + set_bit(idx, garbage); } diff --git a/xen/arch/x86/x86_32/entry.S b/xen/arch/x86/x86_32/entry.S index b890103160..e178d7383e 100644 --- a/xen/arch/x86/x86_32/entry.S +++ b/xen/arch/x86/x86_32/entry.S @@ -326,7 +326,9 @@ test_all_events: shl $IRQSTAT_shift,%eax test %ecx,irq_stat(%eax,1) jnz process_softirqs -/*test_guest_events:*/ + btr $_VCPUF_nmi_pending,VCPU_flags(%ebx) + jc process_nmi +test_guest_events: movl VCPU_vcpu_info(%ebx),%eax testb $0xFF,VCPUINFO_upcall_mask(%eax) jnz restore_all_guest @@ -348,7 +350,24 @@ process_softirqs: sti call do_softirq jmp test_all_events - + + ALIGN +process_nmi: + movl VCPU_nmi_addr(%ebx),%eax + test %eax,%eax + jz test_all_events + bts $_VCPUF_nmi_masked,VCPU_flags(%ebx) + jc 1f + sti + leal VCPU_trap_bounce(%ebx),%edx + movl %eax,TRAPBOUNCE_eip(%edx) + movw $FLAT_KERNEL_CS,TRAPBOUNCE_cs(%edx) + movw $TBF_INTERRUPT,TRAPBOUNCE_flags(%edx) + call create_bounce_frame + jmp test_all_events +1: bts $_VCPUF_nmi_pending,VCPU_flags(%ebx) + jmp test_guest_events + /* CREATE A BASIC EXCEPTION FRAME ON GUEST OS (RING-1) STACK: */ /* {EIP, CS, EFLAGS, [ESP, SS]} */ /* %edx == trap_bounce, %ebx == struct vcpu */ @@ -620,9 +639,7 @@ ENTRY(nmi) jne defer_nmi continue_nmi: - movl $(__HYPERVISOR_DS),%edx - movl %edx,%ds - movl %edx,%es + SET_XEN_SEGMENTS(d) movl %esp,%edx pushl %edx call do_nmi @@ -660,42 +677,6 @@ do_arch_sched_op: movl %eax,UREGS_eax(%ecx) jmp do_sched_op -do_switch_vm86: - # Reset the stack pointer - GET_GUEST_REGS(%ecx) - movl %ecx,%esp - - # GS:ESI == Ring-1 stack activation - movl UREGS_esp(%esp),%esi -VFLT1: mov UREGS_ss(%esp),%gs - - # ES:EDI == Ring-0 stack activation - leal UREGS_eip(%esp),%edi - - # Restore the hypercall-number-clobbered EAX on our stack frame -VFLT2: movl %gs:(%esi),%eax - movl %eax,UREGS_eax(%esp) - addl $4,%esi - - # Copy the VM86 activation from the ring-1 stack to the ring-0 stack - movl $(UREGS_user_sizeof-UREGS_eip)/4,%ecx -VFLT3: movl %gs:(%esi),%eax - stosl - addl $4,%esi - loop VFLT3 - - # Fix up EFLAGS: IOPL=0, IF=1, VM=1 - andl $~X86_EFLAGS_IOPL,UREGS_eflags(%esp) - orl $X86_EFLAGS_IF|X86_EFLAGS_VM,UREGS_eflags(%esp) - - jmp test_all_events - -.section __ex_table,"a" - .long VFLT1,domain_crash_synchronous - .long VFLT2,domain_crash_synchronous - .long VFLT3,domain_crash_synchronous -.previous - .data ENTRY(exception_table) @@ -744,11 +725,12 @@ ENTRY(hypercall_table) .long do_grant_table_op /* 20 */ .long do_vm_assist .long do_update_va_mapping_otherdomain - .long do_switch_vm86 + .long do_iret .long do_vcpu_op .long do_ni_hypercall /* 25 */ .long do_mmuext_op - .long do_acm_op /* 27 */ + .long do_acm_op + .long do_nmi_op .rept NR_hypercalls-((.-hypercall_table)/4) .long do_ni_hypercall .endr @@ -777,11 +759,12 @@ ENTRY(hypercall_args_table) .byte 3 /* do_grant_table_op */ /* 20 */ .byte 2 /* do_vm_assist */ .byte 5 /* do_update_va_mapping_otherdomain */ - .byte 0 /* do_switch_vm86 */ + .byte 0 /* do_iret */ .byte 3 /* do_vcpu_op */ .byte 0 /* do_ni_hypercall */ /* 25 */ .byte 4 /* do_mmuext_op */ .byte 1 /* do_acm_op */ + .byte 2 /* do_nmi_op */ .rept NR_hypercalls-(.-hypercall_args_table) .byte 0 /* do_ni_hypercall */ .endr diff --git a/xen/arch/x86/x86_32/mm.c b/xen/arch/x86/x86_32/mm.c index 4be333f4cf..95def3f2b4 100644 --- a/xen/arch/x86/x86_32/mm.c +++ b/xen/arch/x86/x86_32/mm.c @@ -29,8 +29,6 @@ #include <asm/fixmap.h> #include <public/memory.h> -extern l1_pgentry_t *mapcache; - unsigned int PAGE_HYPERVISOR = __PAGE_HYPERVISOR; unsigned int 
PAGE_HYPERVISOR_NOCACHE = __PAGE_HYPERVISOR_NOCACHE; @@ -68,7 +66,7 @@ void __init paging_init(void) void *ioremap_pt; unsigned long v; struct pfn_info *pg; - int i, mapcache_order; + int i; #ifdef CONFIG_X86_PAE printk("PAE enabled, limit: %d GB\n", MACHPHYS_MBYTES); @@ -76,7 +74,7 @@ printk("PAE disabled.\n"); #endif - idle0_vcpu.arch.monitor_table = mk_pagetable(__pa(idle_pg_table)); + idle_vcpu[0]->arch.monitor_table = mk_pagetable(__pa(idle_pg_table)); if ( cpu_has_pge ) { @@ -121,14 +119,12 @@ void __init paging_init(void) l2e_from_page(virt_to_page(ioremap_pt), __PAGE_HYPERVISOR); } - /* Set up mapping cache for domain pages. */ - mapcache_order = get_order_from_bytes( - MAPCACHE_MBYTES << (20 - PAGETABLE_ORDER)); - mapcache = alloc_xenheap_pages(mapcache_order); - memset(mapcache, 0, PAGE_SIZE << mapcache_order); - for ( i = 0; i < (MAPCACHE_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ ) - idle_pg_table_l2[l2_linear_offset(MAPCACHE_VIRT_START) + i] = - l2e_from_page(virt_to_page(mapcache) + i, __PAGE_HYPERVISOR); + /* Install per-domain mappings for idle domain. */ + for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) + idle_pg_table_l2[l2_linear_offset(PERDOMAIN_VIRT_START) + i] = + l2e_from_page(virt_to_page(idle_vcpu[0]->domain-> + arch.mm_perdomain_pt) + i, + __PAGE_HYPERVISOR); } void __init zap_low_mappings(l2_pgentry_t *base) diff --git a/xen/arch/x86/x86_32/traps.c b/xen/arch/x86/x86_32/traps.c index cb2b7b9eaa..95b69a14bd 100644 --- a/xen/arch/x86/x86_32/traps.c +++ b/xen/arch/x86/x86_32/traps.c @@ -157,6 +157,64 @@ asmlinkage void do_double_fault(void) __asm__ __volatile__ ( "hlt" ); } +asmlinkage unsigned long do_iret(void) +{ + struct cpu_user_regs *regs = guest_cpu_user_regs(); + u32 eflags; + + /* Check worst-case stack frame for overlap with Xen protected area. */ + if ( unlikely(!access_ok(regs->esp, 40)) ) + domain_crash_synchronous(); + + /* Pop and restore EAX (clobbered by hypercall). */ + if ( unlikely(__copy_from_user(&regs->eax, (void __user *)regs->esp, 4)) ) + domain_crash_synchronous(); + regs->esp += 4; + + /* Pop and restore CS and EIP. */ + if ( unlikely(__copy_from_user(&regs->eip, (void __user *)regs->esp, 8)) ) + domain_crash_synchronous(); + regs->esp += 8; + + /* + * Pop, fix up and restore EFLAGS. We fix up in a local staging area + * to avoid firing the BUG_ON(IOPL) check in arch_getdomaininfo_ctxt. + */ + if ( unlikely(__copy_from_user(&eflags, (void __user *)regs->esp, 4)) ) + domain_crash_synchronous(); + regs->esp += 4; + regs->eflags = (eflags & ~X86_EFLAGS_IOPL) | X86_EFLAGS_IF; + + if ( VM86_MODE(regs) ) + { + /* Return to VM86 mode: pop and restore ESP,SS,ES,DS,FS and GS. */ + if ( __copy_from_user(&regs->esp, (void __user *)regs->esp, 24) ) + domain_crash_synchronous(); + } + else if ( unlikely(RING_0(regs)) ) + { + domain_crash_synchronous(); + } + else if ( !RING_1(regs) ) + { + /* Return to ring 2/3: pop and restore ESP and SS. */ + if ( __copy_from_user(&regs->esp, (void __user *)regs->esp, 8) ) + domain_crash_synchronous(); + } + + /* No longer in NMI context. */ + clear_bit(_VCPUF_nmi_masked, &current->vcpu_flags); + + /* Restore upcall mask from saved value. */ + current->vcpu_info->evtchn_upcall_mask = regs->saved_upcall_mask; + + /* + * The hypercall exit path will overwrite EAX with this return + * value.
+ */ + return regs->eax; +} + BUILD_SMP_INTERRUPT(deferred_nmi, TRAP_deferred_nmi) asmlinkage void smp_deferred_nmi(struct cpu_user_regs regs) { diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c index c7a3e6025c..0aa20ccabb 100644 --- a/xen/arch/x86/x86_64/asm-offsets.c +++ b/xen/arch/x86/x86_64/asm-offsets.c @@ -65,6 +65,10 @@ void __dummy__(void) arch.guest_context.syscall_callback_eip); OFFSET(VCPU_kernel_sp, struct vcpu, arch.guest_context.kernel_sp); + OFFSET(VCPU_flags, struct vcpu, vcpu_flags); + OFFSET(VCPU_nmi_addr, struct vcpu, nmi_addr); + DEFINE(_VCPUF_nmi_pending, _VCPUF_nmi_pending); + DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked); BLANK(); OFFSET(VCPUINFO_upcall_pending, vcpu_info_t, evtchn_upcall_pending); diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S index 3c5c344a1a..88fe273bab 100644 --- a/xen/arch/x86/x86_64/entry.S +++ b/xen/arch/x86/x86_64/entry.S @@ -171,7 +171,9 @@ test_all_events: leaq irq_stat(%rip),%rcx testl $~0,(%rcx,%rax,1) jnz process_softirqs -/*test_guest_events:*/ + btr $_VCPUF_nmi_pending,VCPU_flags(%rbx) + jc process_nmi +test_guest_events: movq VCPU_vcpu_info(%rbx),%rax testb $0xFF,VCPUINFO_upcall_mask(%rax) jnz restore_all_guest @@ -322,6 +324,23 @@ process_softirqs: call do_softirq jmp test_all_events + ALIGN +/* %rbx: struct vcpu */ +process_nmi: + movq VCPU_nmi_addr(%rbx),%rax + test %rax,%rax + jz test_all_events + bts $_VCPUF_nmi_masked,VCPU_flags(%rbx) + jc 1f + sti + leaq VCPU_trap_bounce(%rbx),%rdx + movq %rax,TRAPBOUNCE_eip(%rdx) + movw $(TBF_INTERRUPT|TBF_SLOW_IRET),TRAPBOUNCE_flags(%rdx) + call create_bounce_frame + jmp test_all_events +1: bts $_VCPUF_nmi_pending,VCPU_flags(%rbx) + jmp test_guest_events + /* CREATE A BASIC EXCEPTION FRAME ON GUEST OS STACK: */ /* { RCX, R11, [DS-GS,] [CR2,] [ERRCODE,] RIP, CS, RFLAGS, RSP, SS } */ /* %rdx: trap_bounce, %rbx: struct vcpu */ @@ -339,6 +358,9 @@ create_bounce_frame: 1: /* In kernel context already: push new frame at existing %rsp. */ movq UREGS_rsp+8(%rsp),%rsi andb $0xfc,UREGS_cs+8(%rsp) # Indicate kernel context to guest. + testw $(TBF_SLOW_IRET),TRAPBOUNCE_flags(%rdx) + jz 2f + orb $0x01,UREGS_cs+8(%rsp) 2: andq $~0xf,%rsi # Stack frames are 16-byte aligned. 
movq $HYPERVISOR_VIRT_START,%rax cmpq %rax,%rsi @@ -569,7 +591,7 @@ ENTRY(nmi) SAVE_ALL movq %rsp,%rdi call do_nmi - jmp restore_all_xen + jmp ret_from_intr do_arch_sched_op: # Ensure we return success even if we return via schedule_tail() @@ -626,11 +648,12 @@ ENTRY(hypercall_table) .quad do_grant_table_op /* 20 */ .quad do_vm_assist .quad do_update_va_mapping_otherdomain - .quad do_switch_to_user + .quad do_iret .quad do_vcpu_op .quad do_set_segment_base /* 25 */ .quad do_mmuext_op .quad do_acm_op + .quad do_nmi_op .rept NR_hypercalls-((.-hypercall_table)/4) .quad do_ni_hypercall .endr @@ -659,11 +682,12 @@ ENTRY(hypercall_args_table) .byte 3 /* do_grant_table_op */ /* 20 */ .byte 2 /* do_vm_assist */ .byte 4 /* do_update_va_mapping_otherdomain */ - .byte 0 /* do_switch_to_user */ + .byte 0 /* do_iret */ .byte 3 /* do_vcpu_op */ .byte 2 /* do_set_segment_base */ /* 25 */ .byte 4 /* do_mmuext_op */ .byte 1 /* do_acm_op */ + .byte 2 /* do_nmi_op */ .rept NR_hypercalls-(.-hypercall_args_table) .byte 0 /* do_ni_hypercall */ .endr diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index 08e0f88bb8..085fb4d22e 100644 --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -80,7 +80,7 @@ void __init paging_init(void) l2_pgentry_t *l2_ro_mpt; struct pfn_info *pg; - idle0_vcpu.arch.monitor_table = mk_pagetable(__pa(idle_pg_table)); + idle_vcpu[0]->arch.monitor_table = mk_pagetable(__pa(idle_pg_table)); /* Create user-accessible L2 directory to map the MPT for guests. */ l3_ro_mpt = alloc_xenheap_page(); @@ -119,6 +119,12 @@ void __init paging_init(void) /* Set up linear page table mapping. */ idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)] = l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR); + + /* Install per-domain mappings for idle domain. */ + idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)] = + l4e_from_page( + virt_to_page(idle_vcpu[0]->domain->arch.mm_perdomain_l3), + __PAGE_HYPERVISOR); } void __init zap_low_mappings(void) diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c index 4f7c822ef8..9756c54589 100644 --- a/xen/arch/x86/x86_64/traps.c +++ b/xen/arch/x86/x86_64/traps.c @@ -12,6 +12,7 @@ #include <asm/current.h> #include <asm/flushtlb.h> #include <asm/msr.h> +#include <asm/shadow.h> #include <asm/vmx.h> void show_registers(struct cpu_user_regs *regs) @@ -113,6 +114,52 @@ asmlinkage void do_double_fault(struct cpu_user_regs *regs) __asm__ __volatile__ ( "hlt" ); } +void toggle_guest_mode(struct vcpu *v) +{ + v->arch.flags ^= TF_kernel_mode; + __asm__ __volatile__ ( "swapgs" ); + update_pagetables(v); + write_ptbase(v); +} + +long do_iret(void) +{ + struct cpu_user_regs *regs = guest_cpu_user_regs(); + struct iret_context iret_saved; + struct vcpu *v = current; + + if ( unlikely(copy_from_user(&iret_saved, (void *)regs->rsp, + sizeof(iret_saved))) ) + domain_crash_synchronous(); + + /* Returning to user mode? */ + if ( (iret_saved.cs & 3) == 3 ) + { + if ( unlikely(pagetable_get_paddr(v->arch.guest_table_user) == 0) ) + return -EFAULT; + toggle_guest_mode(v); + } + + regs->rip = iret_saved.rip; + regs->cs = iret_saved.cs | 3; /* force guest privilege */ + regs->rflags = (iret_saved.rflags & ~(EF_IOPL|EF_VM)) | EF_IE; + regs->rsp = iret_saved.rsp; + regs->ss = iret_saved.ss | 3; /* force guest privilege */ + + if ( !(iret_saved.flags & VGCF_IN_SYSCALL) ) + { + regs->entry_vector = 0; + regs->r11 = iret_saved.r11; + regs->rcx = iret_saved.rcx; + } + + /* No longer in NMI context. 
*/ + clear_bit(_VCPUF_nmi_masked, &current->vcpu_flags); + + /* Saved %rax gets written back to regs->rax in entry.S. */ + return iret_saved.rax; +} + asmlinkage void syscall_enter(void); void __init percpu_traps_init(void) {
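
The reworked dom0 NMI path above (traps.c, the asm-offsets files and the two entry.S variants) drops the three per-reason softirq bits in favour of a reason word in shared_info plus a per-vcpu pending/masked flag pair tested on the hypervisor exit path. Below is a minimal user-space model of that protocol, assuming nothing beyond what the hunks show; the type and function names are ours, not Xen's.

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative model of the _VCPUF_nmi_pending/_VCPUF_nmi_masked pair. */
enum { NMI_PENDING, NMI_MASKED };

struct vcpu_model {
    atomic_uint  flags;       /* stands in for vcpu_flags             */
    atomic_ulong nmi_reason;  /* stands in for shared arch.nmi_reason */
};

/* Hypervisor side (cf. nmi_dom0_report): record reason, latch pending. */
static void nmi_report(struct vcpu_model *v, unsigned int reason_idx)
{
    atomic_fetch_or(&v->nmi_reason, 1UL << reason_idx);
    atomic_fetch_or(&v->flags, 1u << NMI_PENDING);
}

/* Exit path (cf. process_nmi): deliver unless masked or no handler. */
static int nmi_try_deliver(struct vcpu_model *v, int handler_registered)
{
    unsigned int old = atomic_fetch_and(&v->flags, ~(1u << NMI_PENDING));
    if ( !(old & (1u << NMI_PENDING)) || !handler_registered )
        return 0;
    old = atomic_fetch_or(&v->flags, 1u << NMI_MASKED);
    if ( old & (1u << NMI_MASKED) )
    {
        /* Guest still inside a previous NMI: re-latch, retry later. */
        atomic_fetch_or(&v->flags, 1u << NMI_PENDING);
        return 0;
    }
    return 1; /* the real code builds a bounce frame here */
}

/* Guest iret (cf. do_iret): unmask, letting the next NMI through. */
static void nmi_iret(struct vcpu_model *v)
{
    atomic_fetch_and(&v->flags, ~(1u << NMI_MASKED));
}

int main(void)
{
    struct vcpu_model v = { 0 };
    nmi_report(&v, 0);
    printf("first NMI delivered: %d\n", nmi_try_deliver(&v, 1));  /* 1 */
    nmi_report(&v, 2);
    printf("nested NMI delivered: %d\n", nmi_try_deliver(&v, 1)); /* 0 */
    nmi_iret(&v);
    printf("after iret delivered: %d\n", nmi_try_deliver(&v, 1)); /* 1 */
    return 0;
}

The masked bit mimics the hardware NMI latch: process_nmi's bts/jc pair re-latches a nested NMI as pending instead of delivering it, and the new do_iret clears the mask on the way out.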
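
The s/ac_timer/timer/ hunks above are mostly mechanical, but they also split removal into stop_timer() (disarm, still reusable) and kill_timer() (final teardown), and the call sites stop guarding removal with active_ac_timer(), which suggests both calls are safe on an inactive timer. The stub below sketches the lifecycle those call sites imply; the struct layout and exact semantics are inferred for illustration, not taken from the real timer implementation.

#include <stdio.h>

/* Illustrative stand-in for the timer object; fields are guesses. */
struct timer {
    unsigned long long expires;   /* absolute expiry, e.g. NOW() + period */
    void (*fn)(void *);
    void *data;
    int active;
    int initialised;
};

static void init_timer(struct timer *t, void (*fn)(void *), void *data)
{
    t->fn = fn; t->data = data;
    t->active = 0; t->initialised = 1;
}

static void set_timer(struct timer *t, unsigned long long expires)
{
    t->expires = expires;
    t->active = 1;                /* (re)arm in a single call */
}

static void stop_timer(struct timer *t)
{
    t->active = 0;                /* disarm; may be set_timer()ed again */
}

static void kill_timer(struct timer *t)
{
    t->active = 0;
    t->initialised = 0;           /* final: timer must not be reused */
}

static void tick(void *d) { printf("tick: %s\n", (const char *)d); }

int main(void)
{
    struct timer pit;
    init_timer(&pit, tick, "virtual PIT");
    set_timer(&pit, 1000);        /* vpit->scheduled = NOW() + period */
    if ( pit.active ) pit.fn(pit.data);
    stop_timer(&pit);             /* guest reprogrammed the PIT */
    set_timer(&pit, 2000);        /* rearmed with the new period */
    kill_timer(&pit);             /* domain teardown */
    return 0;
}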
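
vmx_get_hvm_info() above replaces the magic-number vcpu page with an ACPI-style table: an 8-byte "HVM INFO" signature plus an 8-bit checksum over the table's stated length. Below is a self-contained sketch of that validation together with the builder-side step that picks the checksum byte; the field layout is an assumption for illustration, since public/hvm/hvm_info_table.h itself is not part of this diff.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Assumed layout for illustration; the real header may differ. */
struct hvm_info_table {
    char     signature[8];   /* "HVM INFO" */
    uint32_t length;         /* bytes covered by the checksum */
    uint8_t  checksum;       /* chosen so the 8-bit sum is zero */
    uint8_t  apic_enabled;
    uint32_t nr_vcpus;
};

static int validate_hvm_info(const struct hvm_info_table *t)
{
    const uint8_t *p = (const uint8_t *)t;
    uint8_t sum = 0;
    uint32_t i;

    if ( memcmp(t->signature, "HVM INFO", 8) != 0 )
        return 0;
    for ( i = 0; i < t->length; i++ )
        sum += p[i];
    return sum == 0;
}

int main(void)
{
    static struct hvm_info_table t = { "HVM INFO", sizeof(t), 0, 1, 4 };
    uint8_t *p = (uint8_t *)&t, sum = 0;
    uint32_t i;

    /* Builder side: pick the byte that makes the 8-bit sum wrap to 0. */
    for ( i = 0; i < t.length; i++ )
        sum += p[i];
    t.checksum = (uint8_t)(0u - sum);

    printf("valid: %d\n", validate_hvm_info(&t));        /* 1 */
    t.nr_vcpus = 8;                                      /* corrupt a byte */
    printf("after tamper: %d\n", validate_hvm_info(&t)); /* 0 */
    return 0;
}

Because the stored checksum makes the byte sum wrap to zero, any single corrupted byte fails validation, exactly like the BIOS table checksums this mimics.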
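
arch_vmx_do_resume() above absorbs the deleted arch_vmx_do_relaunch() and keys off the new launch_cpu field: a VMCS last launched on another physical CPU must be VMCLEARed and reloaded before it can run locally, and must re-enter via the relaunch stub. A stub model of that decision, with the VMX operations replaced by prints:

#include <stdio.h>

/* Stub model; the real code issues VMCLEAR/VMPTRLD and jumps to the
 * asm resume/relaunch stubs. */
struct vmcs_model { int launch_cpu; };

static void vmpclear(struct vmcs_model *v)  { (void)v; puts("VMCLEAR"); }
static void load_vmcs(struct vmcs_model *v) { (void)v; puts("VMPTRLD"); }

static void do_resume(struct vmcs_model *v, int this_cpu)
{
    if ( v->launch_cpu == this_cpu )
    {
        load_vmcs(v);
        puts("vmx_asm_do_resume (VMRESUME)");
    }
    else
    {
        vmpclear(v);              /* unbind the VMCS from the old CPU  */
        load_vmcs(v);             /* rebind here; host state re-set too */
        v->launch_cpu = this_cpu;
        puts("vmx_asm_do_relaunch (VMLAUNCH)");
    }
}

int main(void)
{
    struct vmcs_model v = { .launch_cpu = 0 };
    do_resume(&v, 0);   /* same CPU: fast resume     */
    do_resume(&v, 1);   /* migrated: clear, relaunch */
    return 0;
}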
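
The rewritten map_domain_page()/unmap_domain_page() above add a per-vcpu hash that keeps recently unmapped pfns cached so a remap of the same pfn is lock-free. The scaled-down model below mirrors that flow: a hash hit bumps a refcount; an unmap either parks the slot in the hash or sends it to garbage. Sizes, the hash function and the empty-bucket check are simplifications, not Xen's values.

#include <assert.h>
#include <stdio.h>

#define MAPHASH_ENTRIES 8
#define MAPHASH_HASHFN(pfn) ((pfn) % MAPHASH_ENTRIES)
#define NOTINUSE (~0u)

struct maphash_entry {
    unsigned long pfn;
    unsigned int  idx;      /* mapcache slot cached for this pfn */
    unsigned int  refcnt;
};

static struct maphash_entry hash[MAPHASH_ENTRIES];
static unsigned int next_slot;  /* stands in for the inuse bitmap */

static unsigned int map_page(unsigned long pfn)
{
    struct maphash_entry *he = &hash[MAPHASH_HASHFN(pfn)];
    if ( he->idx != NOTINUSE && he->pfn == pfn )
    {
        he->refcnt++;           /* fast path: no lock, no new slot */
        return he->idx;
    }
    return next_slot++;         /* slow path: fresh mapcache slot */
}

static void unmap_page(unsigned long pfn, unsigned int idx)
{
    struct maphash_entry *he = &hash[MAPHASH_HASHFN(pfn)];
    if ( he->idx == idx )
    {
        assert(he->refcnt > 0); /* unmapping a hash-cached mapping */
        he->refcnt--;
    }
    else if ( he->refcnt == 0 )
    {
        if ( he->idx != NOTINUSE )
            printf("slot %u -> garbage (evicts pfn %lu)\n", he->idx, he->pfn);
        he->pfn = pfn;          /* cache the newly-freed mapping */
        he->idx = idx;
    }
    else
    {
        printf("slot %u -> garbage (bucket busy)\n", idx);
    }
}

int main(void)
{
    unsigned int i, s1, s2;

    for ( i = 0; i < MAPHASH_ENTRIES; i++ )
        hash[i].idx = NOTINUSE;

    s1 = map_page(5);  unmap_page(5, s1);   /* slot parked in the hash */
    s1 = map_page(5);                       /* hash hit: same slot     */
    s2 = map_page(13);                      /* 13 collides with 5      */
    unmap_page(13, s2);                     /* bucket busy: garbage    */
    unmap_page(5, s1);
    printf("pfn 5 kept slot %u across remap\n", s1);
    return 0;
}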
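
Both the per-domain mapcache and the new map_domain_page_global() above allocate slots from an inuse bitmap behind a cursor and defer TLB flushing: freed slots go to a garbage bitmap and are only folded back, with a single flush, when the cursor wraps. A tiny model of that allocator, using one word per bitmap and a print in place of flush_tlb_all() (the real code uses arrays of longs plus xchg() to pull the garbage bits over atomically):

#include <stdio.h>

#define SLOTS 16
static unsigned long inuse, garbage;
static unsigned int cursor;

static int alloc_slot(void)
{
    unsigned int i;

    for ( i = cursor; i < SLOTS; i++ )
        if ( !(inuse & (1UL << i)) )
            goto found;

    /* Wrapped: fold garbage back into the free set, then flush, so no
     * stale TLB entry can alias a recycled slot. */
    inuse &= ~garbage;
    garbage = 0;
    puts("flush TLBs");
    for ( i = 0; i < SLOTS; i++ )
        if ( !(inuse & (1UL << i)) )
            goto found;
    return -1;                  /* map genuinely full */

 found:
    inuse |= 1UL << i;
    cursor = i + 1;
    return (int)i;
}

static void free_slot(unsigned int i)
{
    /* Freed slots are only garbage for now: their old PTE may still
     * be cached in some TLB. */
    garbage |= 1UL << i;
}

int main(void)
{
    int k;
    for ( k = 0; k < SLOTS; k++ )
        alloc_slot();           /* exhaust the map */
    free_slot(3);
    free_slot(7);
    printf("recycled slot: %d\n", alloc_slot()); /* flushes, returns 3 */
    return 0;
}

Deferring the flush batches what would otherwise be one TLB flush per unmap into one per wrap of the map.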
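
The x86_32 do_iret() above consumes a software frame that the guest pushes before making the hypercall: EAX first (it is clobbered by the hypercall itself), then EIP and CS together, then EFLAGS, then a mode-dependent tail. Reconstructed from its __copy_from_user() sequence, the worst case is 40 bytes, matching the access_ok(regs->esp, 40) check; the struct name and grouping below are ours, for illustration only.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Our reconstruction of the frame, not a Xen structure. */
struct iret_frame32 {
    uint32_t eax;              /* +0: restores hypercall-clobbered EAX */
    uint32_t eip, cs;          /* +4, +8: copied as one 8-byte pop     */
    uint32_t eflags;           /* +12: IOPL cleared, IF forced on      */
    uint32_t esp, ss;          /* +16, +20: ring 2/3 and VM86 returns  */
    uint32_t es, ds, fs, gs;   /* +24..+36: VM86 returns only          */
};

int main(void)
{
    /* 40 bytes: matches the access_ok(regs->esp, 40) worst case. */
    printf("worst case: %zu bytes\n", sizeof(struct iret_frame32));
    /* Ring 2/3 pops 8 bytes from here; VM86 pops all 24. */
    printf("mode-dependent tail at +%zu\n",
           offsetof(struct iret_frame32, esp));
    return 0;
}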