Diffstat (limited to 'xen/arch')
-rw-r--r--  xen/arch/ia64/linux-xen/process-linux-xen.c |   2
-rw-r--r--  xen/arch/ia64/linux-xen/smpboot.c           |   6
-rw-r--r--  xen/arch/ia64/vmx/mmio.c                    |   2
-rw-r--r--  xen/arch/ia64/vmx/vlsapic.c                 |  22
-rw-r--r--  xen/arch/ia64/vmx/vmx_init.c                |   2
-rw-r--r--  xen/arch/ia64/vmx/vmx_process.c             |   2
-rw-r--r--  xen/arch/ia64/vmx/vmx_support.c             |   2
-rw-r--r--  xen/arch/ia64/xen/domain.c                  |  73
-rw-r--r--  xen/arch/ia64/xen/idle0_task.c              |  31
-rw-r--r--  xen/arch/ia64/xen/process.c                 |  30
-rw-r--r--  xen/arch/ia64/xen/vcpu.c                    |   2
-rw-r--r--  xen/arch/ia64/xen/xenmisc.c                 |   9
-rw-r--r--  xen/arch/ia64/xen/xensetup.c                |  38
-rw-r--r--  xen/arch/ia64/xen/xentime.c                 |   6
-rw-r--r--  xen/arch/x86/apic.c                         |   4
-rw-r--r--  xen/arch/x86/boot/x86_32.S                  |   4
-rw-r--r--  xen/arch/x86/dm/i8259.c                     |   2
-rw-r--r--  xen/arch/x86/dm/vmx_vioapic.c               |   2
-rw-r--r--  xen/arch/x86/dom0_ops.c                     |  23
-rw-r--r--  xen/arch/x86/domain.c                       | 209
-rw-r--r--  xen/arch/x86/domain_build.c                 |  17
-rw-r--r--  xen/arch/x86/idle0_task.c                   |  27
-rw-r--r--  xen/arch/x86/io_apic.c                      |  44
-rw-r--r--  xen/arch/x86/irq.c                          |  81
-rw-r--r--  xen/arch/x86/mm.c                           |  80
-rw-r--r--  xen/arch/x86/nmi.c                          |  76
-rw-r--r--  xen/arch/x86/setup.c                        |  49
-rw-r--r--  xen/arch/x86/shadow.c                       |  23
-rw-r--r--  xen/arch/x86/shadow32.c                     |  41
-rw-r--r--  xen/arch/x86/shadow_public.c                |  25
-rw-r--r--  xen/arch/x86/smpboot.c                      |  13
-rw-r--r--  xen/arch/x86/time.c                         |  24
-rw-r--r--  xen/arch/x86/traps.c                        |  85
-rw-r--r--  xen/arch/x86/vmx.c                          |  33
-rw-r--r--  xen/arch/x86/vmx_intercept.c                |  16
-rw-r--r--  xen/arch/x86/vmx_io.c                       |   4
-rw-r--r--  xen/arch/x86/vmx_platform.c                 |   2
-rw-r--r--  xen/arch/x86/vmx_vlapic.c                   |  12
-rw-r--r--  xen/arch/x86/vmx_vmcs.c                     |  96
-rw-r--r--  xen/arch/x86/x86_32/asm-offsets.c           |   4
-rw-r--r--  xen/arch/x86/x86_32/domain_page.c           | 232
-rw-r--r--  xen/arch/x86/x86_32/entry.S                 |  71
-rw-r--r--  xen/arch/x86/x86_32/mm.c                    |  20
-rw-r--r--  xen/arch/x86/x86_32/traps.c                 |  58
-rw-r--r--  xen/arch/x86/x86_64/asm-offsets.c           |   4
-rw-r--r--  xen/arch/x86/x86_64/entry.S                 |  32
-rw-r--r--  xen/arch/x86/x86_64/mm.c                    |   8
-rw-r--r--  xen/arch/x86/x86_64/traps.c                 |  47
48 files changed, 1009 insertions, 686 deletions
diff --git a/xen/arch/ia64/linux-xen/process-linux-xen.c b/xen/arch/ia64/linux-xen/process-linux-xen.c
index b02187ad8c..0f7b403dca 100644
--- a/xen/arch/ia64/linux-xen/process-linux-xen.c
+++ b/xen/arch/ia64/linux-xen/process-linux-xen.c
@@ -241,7 +241,7 @@ static inline void play_dead(void)
max_xtp();
local_irq_disable();
- idle_task_exit();
+ idle_domain_exit();
ia64_jump_to_sal(&sal_boot_rendez_state[this_cpu]);
/*
* The above is a point of no-return, the processor is
diff --git a/xen/arch/ia64/linux-xen/smpboot.c b/xen/arch/ia64/linux-xen/smpboot.c
index 89f6829648..c6970ffad9 100644
--- a/xen/arch/ia64/linux-xen/smpboot.c
+++ b/xen/arch/ia64/linux-xen/smpboot.c
@@ -482,10 +482,8 @@ do_rest:
struct vcpu *v;
void *stack;
- if ( (idle = do_createdomain(IDLE_DOMAIN_ID, cpu)) == NULL )
- panic("failed 'createdomain' for CPU %d", cpu);
- set_bit(_DOMF_idle_domain, &idle->domain_flags);
- v = idle->vcpu[0];
+ v = idle_vcpu[cpu] = alloc_vcpu(idle_vcpu[0]->domain, cpu, cpu);
+ BUG_ON(v == NULL);
printf ("do_boot_cpu: cpu=%d, domain=%p, vcpu=%p\n", cpu, idle, v);
diff --git a/xen/arch/ia64/vmx/mmio.c b/xen/arch/ia64/vmx/mmio.c
index b3668acb81..342fa87621 100644
--- a/xen/arch/ia64/vmx/mmio.c
+++ b/xen/arch/ia64/vmx/mmio.c
@@ -29,7 +29,7 @@
#include <asm/vmx_vcpu.h>
#include <asm/privop.h>
#include <asm/types.h>
-#include <public/io/ioreq.h>
+#include <public/hvm/ioreq.h>
#include <asm/mm.h>
#include <asm/vmx.h>
diff --git a/xen/arch/ia64/vmx/vlsapic.c b/xen/arch/ia64/vmx/vlsapic.c
index 032d3633b1..93dabc168b 100644
--- a/xen/arch/ia64/vmx/vlsapic.c
+++ b/xen/arch/ia64/vmx/vlsapic.c
@@ -119,7 +119,7 @@ void vtm_init(VCPU *vcpu)
itc_freq = local_cpu_data->itc_freq;
vtm->cfg_max_jump=itc_freq*MAX_JUMP_STEP/1000;
vtm->cfg_min_grun=itc_freq*MIN_GUEST_RUNNING_TIME/1000;
- init_ac_timer(&vtm->vtm_timer, vtm_timer_fn, vcpu, 0);
+ init_timer(&vtm->vtm_timer, vtm_timer_fn, vcpu, 0);
vtm_reset(vcpu);
}
@@ -163,20 +163,20 @@ void vtm_set_itv(VCPU *vcpu)
local_irq_save(spsr);
itv = VCPU(vcpu, itv);
if ( ITV_IRQ_MASK(itv) )
- rem_ac_timer(&vtm->vtm_timer);
+ stop_timer(&vtm->vtm_timer);
vtm_interruption_update(vcpu, vtm);
local_irq_restore(spsr);
}
/*
- * Update interrupt or hook the vtm ac_timer for fire
+ * Update interrupt or hook the vtm timer for fire
* At this point vtm_timer should be removed if itv is masked.
*/
/* Interrupt must be disabled at this point */
extern u64 cycle_to_ns(u64 cyle);
-#define TIMER_SLOP (50*1000) /* ns */ /* copy from ac_timer.c */
+#define TIMER_SLOP (50*1000) /* ns */ /* copy from timer.c */
void vtm_interruption_update(VCPU *vcpu, vtime_t* vtm)
{
uint64_t cur_itc,vitm,vitv;
@@ -198,7 +198,7 @@ void vtm_interruption_update(VCPU *vcpu, vtime_t* vtm)
if ( diff_last >= 0 ) {
// interrupt already fired.
- rem_ac_timer(&vtm->vtm_timer);
+ stop_timer(&vtm->vtm_timer);
}
else if ( diff_now >= 0 ) {
// ITV is fired.
@@ -207,30 +207,30 @@ void vtm_interruption_update(VCPU *vcpu, vtime_t* vtm)
/* Both last_itc & cur_itc < itm, wait for fire condition */
else {
expires = NOW() + cycle_to_ns(0-diff_now) + TIMER_SLOP;
- set_ac_timer(&vtm->vtm_timer, expires);
+ set_timer(&vtm->vtm_timer, expires);
}
local_irq_restore(spsr);
}
/*
* Action for vtm when the domain is scheduled out.
- * Remove the ac_timer for vtm.
+ * Remove the timer for vtm.
*/
void vtm_domain_out(VCPU *vcpu)
{
- if(!is_idle_task(vcpu->domain))
- rem_ac_timer(&vcpu->arch.arch_vmx.vtm.vtm_timer);
+ if(!is_idle_domain(vcpu->domain))
+ stop_timer(&vcpu->arch.arch_vmx.vtm.vtm_timer);
}
/*
* Action for vtm when the domain is scheduled in.
- * Fire vtm IRQ or add the ac_timer for vtm.
+ * Fire vtm IRQ or add the timer for vtm.
*/
void vtm_domain_in(VCPU *vcpu)
{
vtime_t *vtm;
- if(!is_idle_task(vcpu->domain)) {
+ if(!is_idle_domain(vcpu->domain)) {
vtm=&(vcpu->arch.arch_vmx.vtm);
vtm_interruption_update(vcpu, vtm);
}
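
The vlsapic.c hunks above convert the virtual timer code from the old ac_timer interface (init_ac_timer/set_ac_timer/rem_ac_timer) to the renamed timer interface. A minimal sketch of the new call pattern, using only the signatures visible in this patch (the <xen/timer.h> header name and the helper below are illustrative assumptions, not part of the changeset):

    #include <xen/timer.h>   /* assumed header for the renamed interface */

    /* Hypothetical helper: exercises the calls used by vtm_init(),
     * vtm_interruption_update() and vtm_set_itv() above. */
    static void example_vtm_timer(struct timer *t, void (*fn)(void *),
                                  void *data, s_time_t deadline)
    {
        init_timer(t, fn, data, 0);  /* bind callback and data to CPU 0 */
        set_timer(t, deadline);      /* arm for an absolute NOW()-based expiry */
        stop_timer(t);               /* disarm; replaces rem_ac_timer() */
    }

The same rename is applied mechanically in the nmi.c, apic.c and xentime.c hunks later in this changeset.
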
diff --git a/xen/arch/ia64/vmx/vmx_init.c b/xen/arch/ia64/vmx/vmx_init.c
index 0920b8c14c..788b7bc1a0 100644
--- a/xen/arch/ia64/vmx/vmx_init.c
+++ b/xen/arch/ia64/vmx/vmx_init.c
@@ -42,7 +42,7 @@
#include <xen/lib.h>
#include <asm/vmmu.h>
#include <public/arch-ia64.h>
-#include <public/io/ioreq.h>
+#include <public/hvm/ioreq.h>
#include <asm/vmx_phy_mode.h>
#include <asm/processor.h>
#include <asm/vmx.h>
diff --git a/xen/arch/ia64/vmx/vmx_process.c b/xen/arch/ia64/vmx/vmx_process.c
index a5fdcf87b3..99e4770091 100644
--- a/xen/arch/ia64/vmx/vmx_process.c
+++ b/xen/arch/ia64/vmx/vmx_process.c
@@ -231,7 +231,7 @@ void leave_hypervisor_tail(struct pt_regs *regs)
struct domain *d = current->domain;
struct vcpu *v = current;
// FIXME: Will this work properly if doing an RFI???
- if (!is_idle_task(d) ) { // always comes from guest
+ if (!is_idle_domain(d) ) { // always comes from guest
extern void vmx_dorfirfi(void);
struct pt_regs *user_regs = vcpu_regs(current);
if (local_softirq_pending())
diff --git a/xen/arch/ia64/vmx/vmx_support.c b/xen/arch/ia64/vmx/vmx_support.c
index 19ea7be6de..801eba6cf8 100644
--- a/xen/arch/ia64/vmx/vmx_support.c
+++ b/xen/arch/ia64/vmx/vmx_support.c
@@ -21,7 +21,7 @@
*/
#include <xen/config.h>
#include <xen/sched.h>
-#include <public/io/ioreq.h>
+#include <public/hvm/ioreq.h>
#include <asm/vmx.h>
#include <asm/vmx_vcpu.h>
diff --git a/xen/arch/ia64/xen/domain.c b/xen/arch/ia64/xen/domain.c
index 8c9dbada27..99b1646614 100644
--- a/xen/arch/ia64/xen/domain.c
+++ b/xen/arch/ia64/xen/domain.c
@@ -46,7 +46,7 @@
#include <asm/vmx_vcpu.h>
#include <asm/vmx_vpd.h>
#include <asm/pal.h>
-#include <public/io/ioreq.h>
+#include <public/hvm/ioreq.h>
#define CONFIG_DOMAIN0_CONTIGUOUS
unsigned long dom0_start = -1L;
@@ -87,7 +87,6 @@ static void continue_cpu_idle_loop(void)
int cpu = smp_processor_id();
for ( ; ; )
{
- printf ("idle%dD\n", cpu);
#ifdef IA64
// __IRQ_STAT(cpu, idle_timestamp) = jiffies
#else
@@ -146,15 +145,26 @@ struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
{
struct vcpu *v;
+ /* Still keep idle vcpu0 static allocated at compilation, due
+ * to some code from Linux still requires it in early phase.
+ */
+ if (is_idle_domain(d) && !vcpu_id)
+ return idle_vcpu[0];
+
if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
return NULL;
memset(v, 0, sizeof(*v));
- memcpy(&v->arch, &idle0_vcpu.arch, sizeof(v->arch));
- v->arch.privregs =
+ memcpy(&v->arch, &idle_vcpu[0]->arch, sizeof(v->arch));
+
+ if (!is_idle_domain(d)) {
+ v->arch.privregs =
alloc_xenheap_pages(get_order(sizeof(mapped_regs_t)));
+ BUG_ON(v->arch.privregs == NULL);
+ memset(v->arch.privregs, 0, PAGE_SIZE);
+ }
+
printf("arch_vcpu_info=%p\n", v->arch.privregs);
- memset(v->arch.privregs, 0, PAGE_SIZE);
return v;
}
@@ -191,6 +201,14 @@ int arch_do_createdomain(struct vcpu *v)
memset(ti, 0, sizeof(struct thread_info));
init_switch_stack(v);
+ // the following will eventually need to be negotiated dynamically
+ d->xen_vastart = XEN_START_ADDR;
+ d->xen_vaend = XEN_END_ADDR;
+ d->shared_info_va = SHAREDINFO_ADDR;
+
+ if (is_idle_vcpu(v))
+ return 0;
+
d->shared_info = (void *)alloc_xenheap_page();
if (!d->shared_info) {
printk("ERROR/HALTING: CAN'T ALLOC PAGE\n");
@@ -200,12 +218,7 @@ int arch_do_createdomain(struct vcpu *v)
if (v == d->vcpu[0])
memset(&d->shared_info->evtchn_mask[0], 0xff,
sizeof(d->shared_info->evtchn_mask));
-#if 0
- d->vcpu[0].arch.privregs =
- alloc_xenheap_pages(get_order(sizeof(mapped_regs_t)));
- printf("arch_vcpu_info=%p\n", d->vcpu[0].arch.privregs);
- memset(d->vcpu.arch.privregs, 0, PAGE_SIZE);
-#endif
+
v->vcpu_info = &(d->shared_info->vcpu_info[0]);
d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME
@@ -227,28 +240,21 @@ int arch_do_createdomain(struct vcpu *v)
BUG();
v->arch.starting_rid = d->arch.starting_rid;
v->arch.ending_rid = d->arch.ending_rid;
- // the following will eventually need to be negotiated dynamically
- d->xen_vastart = XEN_START_ADDR;
- d->xen_vaend = XEN_END_ADDR;
- d->shared_info_va = SHAREDINFO_ADDR;
d->arch.breakimm = 0x1000;
v->arch.breakimm = d->arch.breakimm;
d->arch.sys_pgnr = 0;
- if (d->domain_id != IDLE_DOMAIN_ID) {
- d->arch.mm = xmalloc(struct mm_struct);
- if (unlikely(!d->arch.mm)) {
- printk("Can't allocate mm_struct for domain %d\n",d->domain_id);
- return -ENOMEM;
- }
- memset(d->arch.mm, 0, sizeof(*d->arch.mm));
- d->arch.mm->pgd = pgd_alloc(d->arch.mm);
- if (unlikely(!d->arch.mm->pgd)) {
- printk("Can't allocate pgd for domain %d\n",d->domain_id);
- return -ENOMEM;
- }
- } else
- d->arch.mm = NULL;
+ d->arch.mm = xmalloc(struct mm_struct);
+ if (unlikely(!d->arch.mm)) {
+ printk("Can't allocate mm_struct for domain %d\n",d->domain_id);
+ return -ENOMEM;
+ }
+ memset(d->arch.mm, 0, sizeof(*d->arch.mm));
+ d->arch.mm->pgd = pgd_alloc(d->arch.mm);
+ if (unlikely(!d->arch.mm->pgd)) {
+ printk("Can't allocate pgd for domain %d\n",d->domain_id);
+ return -ENOMEM;
+ }
printf ("arch_do_create_domain: domain=%p\n", d);
return 0;
@@ -1070,15 +1076,6 @@ void domain_pend_keyboard_interrupt(int irq)
vcpu_pend_interrupt(dom0->vcpu[0],irq);
}
-void vcpu_migrate_cpu(struct vcpu *v, int newcpu)
-{
- if ( v->processor == newcpu )
- return;
-
- set_bit(_VCPUF_cpu_migrated, &v->vcpu_flags);
- v->processor = newcpu;
-}
-
void sync_vcpu_execstate(struct vcpu *v)
{
ia64_save_fpu(v->arch._thread.fph);
diff --git a/xen/arch/ia64/xen/idle0_task.c b/xen/arch/ia64/xen/idle0_task.c
index bfb49f7591..bc7aeed28a 100644
--- a/xen/arch/ia64/xen/idle0_task.c
+++ b/xen/arch/ia64/xen/idle0_task.c
@@ -11,31 +11,15 @@
.mmlist = LIST_HEAD_INIT(name.mmlist), \
}
-#define IDLE0_EXEC_DOMAIN(_ed,_d) \
+#define IDLE_VCPU(_v) \
{ \
processor: 0, \
- mm: 0, \
- thread: INIT_THREAD, \
- domain: (_d) \
-}
-
-#define IDLE0_DOMAIN(_t) \
-{ \
- domain_id: IDLE_DOMAIN_ID, \
- domain_flags:DOMF_idle_domain, \
- refcnt: ATOMIC_INIT(1) \
+ domain: 0 \
}
struct mm_struct init_mm = INIT_MM(init_mm);
EXPORT_SYMBOL(init_mm);
-struct domain idle0_domain = IDLE0_DOMAIN(idle0_domain);
-#if 0
-struct vcpu idle0_vcpu = IDLE0_EXEC_DOMAIN(idle0_vcpu,
- &idle0_domain);
-#endif
-
-
/*
* Initial task structure.
*
@@ -44,15 +28,12 @@ struct vcpu idle0_vcpu = IDLE0_EXEC_DOMAIN(idle0_vcpu,
*/
union {
struct {
- struct domain task;
+ struct vcpu task;
} s;
unsigned long stack[KERNEL_STACK_SIZE/sizeof (unsigned long)];
-} init_task_mem asm ("init_task") __attribute__((section(".data.init_task")));
-// = {{
- ;
-//.task = IDLE0_EXEC_DOMAIN(init_task_mem.s.task,&idle0_domain),
-//};
-//};
+} init_task_mem asm ("init_task") __attribute__((section(".data.init_task"))) = {{
+ .task = IDLE_VCPU(init_task_mem.s.task)
+}};
EXPORT_SYMBOL(init_task);
diff --git a/xen/arch/ia64/xen/process.c b/xen/arch/ia64/xen/process.c
index ad5be5ba81..e1da875cdb 100644
--- a/xen/arch/ia64/xen/process.c
+++ b/xen/arch/ia64/xen/process.c
@@ -65,26 +65,16 @@ long do_iopl(domid_t domain, unsigned int new_io_pl)
extern struct schedule_data schedule_data[NR_CPUS];
-void schedule_tail(struct vcpu *next)
+void schedule_tail(struct vcpu *prev)
{
- unsigned long rr7;
- //printk("current=%lx,shared_info=%lx\n",current,current->vcpu_info);
- //printk("next=%lx,shared_info=%lx\n",next,next->vcpu_info);
-
- // TG: Real HACK FIXME.
- // This is currently necessary because when a new domain is started,
- // the context_switch function of xen/common/schedule.c(__enter_scheduler)
- // never returns. Therefore, the lock must be released.
- // schedule_tail is only called when a domain is started.
- spin_unlock_irq(&schedule_data[current->processor].schedule_lock);
-
- /* rr7 will be postponed to last point when resuming back to guest */
- if(VMX_DOMAIN(current)){
- vmx_load_all_rr(current);
- }else{
- load_region_regs(current);
- vcpu_load_kernel_regs(current);
- }
+ context_saved(prev);
+
+ if (VMX_DOMAIN(current)) {
+ vmx_load_all_rr(current);
+ } else {
+ load_region_regs(current);
+ vcpu_load_kernel_regs(current);
+ }
}
void tdpfoo(void) { }
@@ -252,7 +242,7 @@ void deliver_pending_interrupt(struct pt_regs *regs)
struct domain *d = current->domain;
struct vcpu *v = current;
// FIXME: Will this work properly if doing an RFI???
- if (!is_idle_task(d) && user_mode(regs)) {
+ if (!is_idle_domain(d) && user_mode(regs)) {
//vcpu_poke_timer(v);
if (vcpu_deliverable_interrupts(v))
reflect_extint(regs);
diff --git a/xen/arch/ia64/xen/vcpu.c b/xen/arch/ia64/xen/vcpu.c
index 2d62bdf86e..4e56524dd9 100644
--- a/xen/arch/ia64/xen/vcpu.c
+++ b/xen/arch/ia64/xen/vcpu.c
@@ -1085,7 +1085,7 @@ void vcpu_set_next_timer(VCPU *vcpu)
/* gloss over the wraparound problem for now... we know it exists
* but it doesn't matter right now */
- if (is_idle_task(vcpu->domain)) {
+ if (is_idle_domain(vcpu->domain)) {
// printf("****** vcpu_set_next_timer called during idle!!\n");
vcpu_safe_set_itm(s);
return;
diff --git a/xen/arch/ia64/xen/xenmisc.c b/xen/arch/ia64/xen/xenmisc.c
index cb8349e1d3..eaddbee41c 100644
--- a/xen/arch/ia64/xen/xenmisc.c
+++ b/xen/arch/ia64/xen/xenmisc.c
@@ -75,7 +75,7 @@ struct pt_regs *guest_cpu_user_regs(void) { return vcpu_regs(current); }
void raise_actimer_softirq(void)
{
- raise_softirq(AC_TIMER_SOFTIRQ);
+ raise_softirq(TIMER_SOFTIRQ);
}
unsigned long
@@ -320,18 +320,15 @@ if (!i--) { printk("+",id); i = 1000000; }
ia64_set_iva(&ia64_ivt);
ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
VHPT_ENABLED);
- if (!is_idle_task(current->domain)) {
+ if (!is_idle_domain(current->domain)) {
load_region_regs(current);
vcpu_load_kernel_regs(current);
if (vcpu_timer_expired(current)) vcpu_pend_timer(current);
}
if (vcpu_timer_expired(current)) vcpu_pend_timer(current);
}
-}
-void context_switch_finalise(struct vcpu *next)
-{
- /* nothing to do */
+ context_saved(prev);
}
void continue_running(struct vcpu *same)
diff --git a/xen/arch/ia64/xen/xensetup.c b/xen/arch/ia64/xen/xensetup.c
index 79d33cdc53..1b9dc72b75 100644
--- a/xen/arch/ia64/xen/xensetup.c
+++ b/xen/arch/ia64/xen/xensetup.c
@@ -27,7 +27,7 @@ unsigned long xenheap_phys_end;
char saved_command_line[COMMAND_LINE_SIZE];
-struct vcpu *idle_task[NR_CPUS] = { &idle0_vcpu };
+struct vcpu *idle_vcpu[NR_CPUS];
cpumask_t cpu_present_map;
@@ -157,16 +157,12 @@ void start_kernel(void)
unsigned long dom0_memory_start, dom0_memory_size;
unsigned long dom0_initrd_start, dom0_initrd_size;
unsigned long initial_images_start, initial_images_end;
+ struct domain *idle_domain;
running_on_sim = is_platform_hp_ski();
/* Kernel may be relocated by EFI loader */
xen_pstart = ia64_tpa(KERNEL_START);
- /* Must do this early -- e.g., spinlocks rely on get_current(). */
- //set_current(&idle0_vcpu);
- ia64_r13 = (void *)&idle0_vcpu;
- idle0_vcpu.domain = &idle0_domain;
-
early_setup_arch(&cmdline);
/* We initialise the serial devices very early so we can get debugging. */
@@ -282,18 +278,22 @@ void start_kernel(void)
(xenheap_phys_end-__pa(heap_start)) >> 20,
(xenheap_phys_end-__pa(heap_start)) >> 10);
+printk("About to call scheduler_init()\n");
+ scheduler_init();
+ idle_vcpu[0] = (struct vcpu*) ia64_r13;
+ idle_domain = do_createdomain(IDLE_DOMAIN_ID, 0);
+ BUG_ON(idle_domain == NULL);
+
late_setup_arch(&cmdline);
setup_per_cpu_areas();
mem_init();
-printk("About to call scheduler_init()\n");
- scheduler_init();
local_irq_disable();
init_IRQ ();
printk("About to call init_xen_time()\n");
init_xen_time(); /* initialise the time */
-printk("About to call ac_timer_init()\n");
- ac_timer_init();
+printk("About to call timer_init()\n");
+ timer_init();
#ifdef CONFIG_XEN_CONSOLE_INPUT /* CONFIG_SERIAL_8250_CONSOLE=n in dom0! */
initialize_keytable();
@@ -309,14 +309,10 @@ printk("About to call ac_timer_init()\n");
}
smp_prepare_cpus(max_cpus);
-
/* We aren't hotplug-capable yet. */
- //BUG_ON(!cpus_empty(cpu_present_map));
for_each_cpu ( i )
cpu_set(i, cpu_present_map);
- //BUG_ON(!local_irq_is_enabled());
-
/* Enable IRQ to receive IPI (needed for ITC sync). */
local_irq_enable();
@@ -345,12 +341,7 @@ printk("About to call sort_main_extable()\n");
/* Create initial domain 0. */
printk("About to call do_createdomain()\n");
dom0 = do_createdomain(0, 0);
- init_task.domain = &idle0_domain;
- init_task.processor = 0;
-// init_task.mm = &init_mm;
- init_task.domain->arch.mm = &init_mm;
-// init_task.thread = INIT_THREAD;
- //arch_do_createdomain(current);
+
#ifdef CLONE_DOMAIN0
{
int i;
@@ -383,8 +374,7 @@ printk("About to call do_createdomain()\n");
panic("Could not set up DOM0 guest OS\n");
/* PIN domain0 on CPU 0. */
- dom0->vcpu[0]->cpumap=1;
- set_bit(_VCPUF_cpu_pinned, &dom0->vcpu[0]->vcpu_flags);
+ dom0->vcpu[0]->cpu_affinity = cpumask_of_cpu(0);
#ifdef CLONE_DOMAIN0
{
@@ -433,8 +423,8 @@ printk("About to call init_trace_bufs()\n");
local_irq_enable();
- printf("About to call schedulers_start dom0=%p, idle0_dom=%p\n",
- dom0, &idle0_domain);
+ printf("About to call schedulers_start dom0=%p, idle_dom=%p\n",
+ dom0, &idle_domain);
schedulers_start();
domain_unpause_by_systemcontroller(dom0);
diff --git a/xen/arch/ia64/xen/xentime.c b/xen/arch/ia64/xen/xentime.c
index 1b15fb12c7..f407509f3c 100644
--- a/xen/arch/ia64/xen/xentime.c
+++ b/xen/arch/ia64/xen/xentime.c
@@ -127,7 +127,7 @@ xen_timer_interrupt (int irq, void *dev_id, struct pt_regs *regs)
vcpu_wake(dom0->vcpu[0]);
}
}
- if (!is_idle_task(current->domain)) {
+ if (!is_idle_domain(current->domain)) {
if (vcpu_timer_expired(current)) {
vcpu_pend_timer(current);
// ensure another timer interrupt happens even if domain doesn't
@@ -196,7 +196,7 @@ xen_timer_interrupt (int irq, void *dev_id, struct pt_regs *regs)
//#endif
/* double check, in case we got hit by a (slow) PMI: */
} while (time_after_eq(ia64_get_itc(), new_itm));
- raise_softirq(AC_TIMER_SOFTIRQ);
+ raise_softirq(TIMER_SOFTIRQ);
return IRQ_HANDLED;
}
@@ -235,7 +235,7 @@ int __init init_xen_time()
return 0;
}
-int reprogram_ac_timer(s_time_t timeout)
+int reprogram_timer(s_time_t timeout)
{
struct vcpu *v = current;
s_time_t expire;
diff --git a/xen/arch/x86/apic.c b/xen/arch/x86/apic.c
index 1a3d5f591e..7eb6000b37 100644
--- a/xen/arch/x86/apic.c
+++ b/xen/arch/x86/apic.c
@@ -870,7 +870,7 @@ void enable_APIC_timer(void)
* returns 1 on success
* returns 0 if the timeout value is too small or in the past.
*/
-int reprogram_ac_timer(s_time_t timeout)
+int reprogram_timer(s_time_t timeout)
{
s_time_t now;
s_time_t expire;
@@ -931,7 +931,7 @@ void smp_apic_timer_interrupt(struct cpu_user_regs * regs)
{
ack_APIC_irq();
perfc_incrc(apic_timer);
- raise_softirq(AC_TIMER_SOFTIRQ);
+ raise_softirq(TIMER_SOFTIRQ);
}
/*
diff --git a/xen/arch/x86/boot/x86_32.S b/xen/arch/x86/boot/x86_32.S
index b98e1c72bc..5534b2621b 100644
--- a/xen/arch/x86/boot/x86_32.S
+++ b/xen/arch/x86/boot/x86_32.S
@@ -100,7 +100,7 @@ __start:
1: stosl /* low mappings cover as much physmem as possible */
add $4,%edi
add $(1<<L2_PAGETABLE_SHIFT),%eax
- cmp $__HYPERVISOR_VIRT_START+0xe3,%eax
+ cmp $HYPERVISOR_VIRT_START+0xe3,%eax
jne 1b
#else
/* Initialize low and high mappings of all memory with 4MB pages */
@@ -113,7 +113,7 @@ __start:
jne 1b
1: stosl /* low mappings cover as much physmem as possible */
add $(1<<L2_PAGETABLE_SHIFT),%eax
- cmp $__HYPERVISOR_VIRT_START+0xe3,%eax
+ cmp $HYPERVISOR_VIRT_START+0xe3,%eax
jne 1b
#endif
diff --git a/xen/arch/x86/dm/i8259.c b/xen/arch/x86/dm/i8259.c
index c0d735dc52..8a27835e9f 100644
--- a/xen/arch/x86/dm/i8259.c
+++ b/xen/arch/x86/dm/i8259.c
@@ -29,7 +29,7 @@
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
-#include <public/io/ioreq.h>
+#include <public/hvm/ioreq.h>
#include <asm/vmx.h>
#include <asm/vmx_vpic.h>
#include <asm/current.h>
diff --git a/xen/arch/x86/dm/vmx_vioapic.c b/xen/arch/x86/dm/vmx_vioapic.c
index 769eb59f22..201788e858 100644
--- a/xen/arch/x86/dm/vmx_vioapic.c
+++ b/xen/arch/x86/dm/vmx_vioapic.c
@@ -37,7 +37,7 @@
#include <xen/lib.h>
#include <xen/errno.h>
#include <xen/sched.h>
-#include <public/io/ioreq.h>
+#include <public/hvm/ioreq.h>
#include <asm/vmx.h>
#include <asm/vmx_vpic.h>
#include <asm/current.h>
diff --git a/xen/arch/x86/dom0_ops.c b/xen/arch/x86/dom0_ops.c
index 5a4f493ce0..1ee7efd37b 100644
--- a/xen/arch/x86/dom0_ops.c
+++ b/xen/arch/x86/dom0_ops.c
@@ -36,13 +36,13 @@ static unsigned long msr_hi;
static void write_msr_for(void *unused)
{
- if ( ((1 << current->processor) & msr_cpu_mask) )
+ if ( ((1 << smp_processor_id()) & msr_cpu_mask) )
(void)wrmsr_user(msr_addr, msr_lo, msr_hi);
}
static void read_msr_for(void *unused)
{
- if ( ((1 << current->processor) & msr_cpu_mask) )
+ if ( ((1 << smp_processor_id()) & msr_cpu_mask) )
(void)rdmsr_user(msr_addr, msr_lo, msr_hi);
}
@@ -103,12 +103,27 @@ long arch_do_dom0_op(dom0_op_t *op, dom0_op_t *u_dom0_op)
op->u.add_memtype.nr_pfns,
op->u.add_memtype.type,
1);
+ if (ret > 0)
+ {
+ (void)__put_user(0, &u_dom0_op->u.add_memtype.handle);
+ (void)__put_user(ret, &u_dom0_op->u.add_memtype.reg);
+ ret = 0;
+ }
}
break;
case DOM0_DEL_MEMTYPE:
{
- ret = mtrr_del_page(op->u.del_memtype.reg, 0, 0);
+ if (op->u.del_memtype.handle == 0
+ /* mtrr/main.c otherwise does a lookup */
+ && (int)op->u.del_memtype.reg >= 0)
+ {
+ ret = mtrr_del_page(op->u.del_memtype.reg, 0, 0);
+ if (ret > 0)
+ ret = 0;
+ }
+ else
+ ret = -EINVAL;
}
break;
@@ -179,7 +194,7 @@ long arch_do_dom0_op(dom0_op_t *op, dom0_op_t *u_dom0_op)
memcpy(pi->hw_cap, boot_cpu_data.x86_capability, NCAPINTS*4);
ret = 0;
if ( copy_to_user(u_dom0_op, op, sizeof(*op)) )
- ret = -EFAULT;
+ ret = -EFAULT;
}
break;
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index d905f9dfbf..19c29d084c 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -46,17 +46,16 @@ boolean_param("noreboot", opt_noreboot);
struct percpu_ctxt {
struct vcpu *curr_vcpu;
- unsigned int context_not_finalised;
unsigned int dirty_segment_mask;
} __cacheline_aligned;
static struct percpu_ctxt percpu_ctxt[NR_CPUS];
-static void continue_idle_task(struct vcpu *v)
+static void continue_idle_domain(struct vcpu *v)
{
reset_stack_and_jump(idle_loop);
}
-static void continue_nonidle_task(struct vcpu *v)
+static void continue_nonidle_domain(struct vcpu *v)
{
reset_stack_and_jump(ret_from_intr);
}
@@ -92,10 +91,9 @@ void startup_cpu_idle_loop(void)
{
struct vcpu *v = current;
- ASSERT(is_idle_task(v->domain));
- percpu_ctxt[smp_processor_id()].curr_vcpu = v;
- cpu_set(smp_processor_id(), v->domain->cpumask);
- v->arch.schedule_tail = continue_idle_task;
+ ASSERT(is_idle_vcpu(v));
+ cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
+ cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
reset_stack_and_jump(idle_loop);
}
@@ -217,14 +215,20 @@ struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
memset(v, 0, sizeof(*v));
- memcpy(&v->arch, &idle0_vcpu.arch, sizeof(v->arch));
+ memcpy(&v->arch, &idle_vcpu[0]->arch, sizeof(v->arch));
v->arch.flags = TF_kernel_mode;
+ if ( is_idle_domain(d) )
+ {
+ percpu_ctxt[vcpu_id].curr_vcpu = v;
+ v->arch.schedule_tail = continue_idle_domain;
+ }
+
if ( (v->vcpu_id = vcpu_id) != 0 )
{
v->arch.schedule_tail = d->vcpu[0]->arch.schedule_tail;
v->arch.perdomain_ptes =
- d->arch.mm_perdomain_pt + (vcpu_id << PDPT_VCPU_SHIFT);
+ d->arch.mm_perdomain_pt + (vcpu_id << GDT_LDT_VCPU_SHIFT);
}
return v;
@@ -259,32 +263,11 @@ int arch_do_createdomain(struct vcpu *v)
int i;
#endif
- if ( is_idle_task(d) )
- return 0;
-
- d->arch.ioport_caps =
- rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
- if ( d->arch.ioport_caps == NULL )
- return -ENOMEM;
-
- if ( (d->shared_info = alloc_xenheap_page()) == NULL )
- return -ENOMEM;
-
- if ( (rc = ptwr_init(d)) != 0 )
- {
- free_xenheap_page(d->shared_info);
- return rc;
- }
-
- v->arch.schedule_tail = continue_nonidle_task;
-
- memset(d->shared_info, 0, PAGE_SIZE);
- v->vcpu_info = &d->shared_info->vcpu_info[v->vcpu_id];
- v->cpumap = CPUMAP_RUNANYWHERE;
- SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
-
pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
+ if ( d->arch.mm_perdomain_pt == NULL )
+ goto fail_nomem;
+
memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
v->arch.perdomain_ptes = d->arch.mm_perdomain_pt;
@@ -297,49 +280,73 @@ int arch_do_createdomain(struct vcpu *v)
*/
gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
- d->arch.mm_perdomain_pt[
- (vcpuid << PDPT_VCPU_SHIFT) + FIRST_RESERVED_GDT_PAGE] = gdt_l1e;
+ d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
+ FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
v->arch.guest_vtable = __linear_l2_table;
v->arch.shadow_vtable = __shadow_linear_l2_table;
-#ifdef __x86_64__
+#if defined(__i386__)
+
+ mapcache_init(d);
+
+#else /* __x86_64__ */
+
v->arch.guest_vl3table = __linear_l3_table;
v->arch.guest_vl4table = __linear_l4_table;
d->arch.mm_perdomain_l2 = alloc_xenheap_page();
+ d->arch.mm_perdomain_l3 = alloc_xenheap_page();
+ if ( (d->arch.mm_perdomain_l2 == NULL) ||
+ (d->arch.mm_perdomain_l3 == NULL) )
+ goto fail_nomem;
+
memset(d->arch.mm_perdomain_l2, 0, PAGE_SIZE);
for ( i = 0; i < (1 << pdpt_order); i++ )
d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+i] =
l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt)+i,
__PAGE_HYPERVISOR);
- d->arch.mm_perdomain_l3 = alloc_xenheap_page();
memset(d->arch.mm_perdomain_l3, 0, PAGE_SIZE);
d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
__PAGE_HYPERVISOR);
-#endif
+
+#endif /* __x86_64__ */
shadow_lock_init(d);
INIT_LIST_HEAD(&d->arch.free_shadow_frames);
- return 0;
-}
+ if ( !is_idle_domain(d) )
+ {
+ d->arch.ioport_caps =
+ rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
+ if ( d->arch.ioport_caps == NULL )
+ goto fail_nomem;
-void vcpu_migrate_cpu(struct vcpu *v, int newcpu)
-{
- if ( v->processor == newcpu )
- return;
+ if ( (d->shared_info = alloc_xenheap_page()) == NULL )
+ goto fail_nomem;
- set_bit(_VCPUF_cpu_migrated, &v->vcpu_flags);
- v->processor = newcpu;
+ if ( (rc = ptwr_init(d)) != 0 )
+ goto fail_nomem;
- if ( VMX_DOMAIN(v) )
- {
- __vmpclear(virt_to_phys(v->arch.arch_vmx.vmcs));
- v->arch.schedule_tail = arch_vmx_do_relaunch;
+ memset(d->shared_info, 0, PAGE_SIZE);
+ v->vcpu_info = &d->shared_info->vcpu_info[v->vcpu_id];
+ SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
+
+ v->arch.schedule_tail = continue_nonidle_domain;
}
+
+ return 0;
+
+ fail_nomem:
+ free_xenheap_page(d->shared_info);
+#ifdef __x86_64__
+ free_xenheap_page(d->arch.mm_perdomain_l2);
+ free_xenheap_page(d->arch.mm_perdomain_l3);
+#endif
+ free_xenheap_pages(d->arch.mm_perdomain_pt, pdpt_order);
+ return -ENOMEM;
}
/* This is called by arch_final_setup_guest and do_boot_vcpu */
@@ -473,14 +480,6 @@ void new_thread(struct vcpu *d,
#ifdef __x86_64__
-void toggle_guest_mode(struct vcpu *v)
-{
- v->arch.flags ^= TF_kernel_mode;
- __asm__ __volatile__ ( "swapgs" );
- update_pagetables(v);
- write_ptbase(v);
-}
-
#define loadsegment(seg,value) ({ \
int __r = 1; \
__asm__ __volatile__ ( \
@@ -650,35 +649,6 @@ static void save_segments(struct vcpu *v)
percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask;
}
-long do_switch_to_user(void)
-{
- struct cpu_user_regs *regs = guest_cpu_user_regs();
- struct switch_to_user stu;
- struct vcpu *v = current;
-
- if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) ||
- unlikely(pagetable_get_paddr(v->arch.guest_table_user) == 0) )
- return -EFAULT;
-
- toggle_guest_mode(v);
-
- regs->rip = stu.rip;
- regs->cs = stu.cs | 3; /* force guest privilege */
- regs->rflags = (stu.rflags & ~(EF_IOPL|EF_VM)) | EF_IE;
- regs->rsp = stu.rsp;
- regs->ss = stu.ss | 3; /* force guest privilege */
-
- if ( !(stu.flags & VGCF_IN_SYSCALL) )
- {
- regs->entry_vector = 0;
- regs->r11 = stu.r11;
- regs->rcx = stu.rcx;
- }
-
- /* Saved %rax gets written back to regs->rax in entry.S. */
- return stu.rax;
-}
-
#define switch_kernel_stack(_n,_c) ((void)0)
#elif defined(__i386__)
@@ -705,7 +675,10 @@ static void __context_switch(void)
struct vcpu *p = percpu_ctxt[cpu].curr_vcpu;
struct vcpu *n = current;
- if ( !is_idle_task(p->domain) )
+ ASSERT(p != n);
+ ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
+
+ if ( !is_idle_vcpu(p) )
{
memcpy(&p->arch.guest_context.user_regs,
stack_regs,
@@ -714,7 +687,7 @@ static void __context_switch(void)
save_segments(p);
}
- if ( !is_idle_task(n->domain) )
+ if ( !is_idle_vcpu(n) )
{
memcpy(stack_regs,
&n->arch.guest_context.user_regs,
@@ -740,7 +713,8 @@ static void __context_switch(void)
}
if ( p->domain != n->domain )
- cpu_set(cpu, n->domain->cpumask);
+ cpu_set(cpu, n->domain->domain_dirty_cpumask);
+ cpu_set(cpu, n->vcpu_dirty_cpumask);
write_ptbase(n);
@@ -753,7 +727,8 @@ static void __context_switch(void)
}
if ( p->domain != n->domain )
- cpu_clear(cpu, p->domain->cpumask);
+ cpu_clear(cpu, p->domain->domain_dirty_cpumask);
+ cpu_clear(cpu, p->vcpu_dirty_cpumask);
percpu_ctxt[cpu].curr_vcpu = n;
}
@@ -762,29 +737,32 @@ static void __context_switch(void)
void context_switch(struct vcpu *prev, struct vcpu *next)
{
unsigned int cpu = smp_processor_id();
+ cpumask_t dirty_mask = next->vcpu_dirty_cpumask;
- ASSERT(!local_irq_is_enabled());
-
- set_current(next);
+ ASSERT(local_irq_is_enabled());
- if ( (percpu_ctxt[cpu].curr_vcpu != next) && !is_idle_task(next->domain) )
+ /* Allow at most one CPU at a time to be dirty. */
+ ASSERT(cpus_weight(dirty_mask) <= 1);
+ if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
{
- __context_switch();
- percpu_ctxt[cpu].context_not_finalised = 1;
+ /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
+ flush_tlb_mask(dirty_mask);
}
-}
-void context_switch_finalise(struct vcpu *next)
-{
- unsigned int cpu = smp_processor_id();
+ local_irq_disable();
- ASSERT(local_irq_is_enabled());
+ set_current(next);
- if ( percpu_ctxt[cpu].context_not_finalised )
+ if ( (percpu_ctxt[cpu].curr_vcpu == next) || is_idle_vcpu(next) )
+ {
+ local_irq_enable();
+ }
+ else
{
- percpu_ctxt[cpu].context_not_finalised = 0;
+ __context_switch();
- BUG_ON(percpu_ctxt[cpu].curr_vcpu != next);
+ /* Re-enable interrupts before restoring state which may fault. */
+ local_irq_enable();
if ( VMX_DOMAIN(next) )
{
@@ -798,6 +776,8 @@ void context_switch_finalise(struct vcpu *next)
}
}
+ context_saved(prev);
+
schedule_tail(next);
BUG();
}
@@ -827,20 +807,11 @@ int __sync_lazy_execstate(void)
void sync_vcpu_execstate(struct vcpu *v)
{
- unsigned int cpu = v->processor;
-
- if ( !cpu_isset(cpu, v->domain->cpumask) )
- return;
-
- if ( cpu == smp_processor_id() )
- {
+ if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
(void)__sync_lazy_execstate();
- }
- else
- {
- /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
- flush_tlb_mask(cpumask_of_cpu(cpu));
- }
+
+ /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
+ flush_tlb_mask(v->vcpu_dirty_cpumask);
}
unsigned long __hypercall_create_continuation(
@@ -966,7 +937,7 @@ void domain_relinquish_resources(struct domain *d)
struct vcpu *v;
unsigned long pfn;
- BUG_ON(!cpus_empty(d->cpumask));
+ BUG_ON(!cpus_empty(d->domain_dirty_cpumask));
ptwr_destroy(d);
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index d08f2c12fb..84d84a66cf 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -366,27 +366,20 @@ int construct_dom0(struct domain *d,
l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
}
- {
- unsigned long va;
- for (va = PERDOMAIN_VIRT_START; va < PERDOMAIN_VIRT_END;
- va += (1 << L2_PAGETABLE_SHIFT)) {
- l2tab[va >> L2_PAGETABLE_SHIFT] =
- l2e_from_paddr(__pa(d->arch.mm_perdomain_pt) +
- (va-PERDOMAIN_VIRT_START),
- __PAGE_HYPERVISOR);
- }
- }
v->arch.guest_table = mk_pagetable((unsigned long)l3start);
#else
l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
l2e_from_paddr((unsigned long)l2start, __PAGE_HYPERVISOR);
- l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
- l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
v->arch.guest_table = mk_pagetable((unsigned long)l2start);
#endif
+ for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
+ l2tab[l2_linear_offset(PERDOMAIN_VIRT_START) + i] =
+ l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
+ __PAGE_HYPERVISOR);
+
l2tab += l2_linear_offset(dsi.v_start);
mfn = alloc_spfn;
for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
diff --git a/xen/arch/x86/idle0_task.c b/xen/arch/x86/idle0_task.c
deleted file mode 100644
index b876c619ef..0000000000
--- a/xen/arch/x86/idle0_task.c
+++ /dev/null
@@ -1,27 +0,0 @@
-
-#include <xen/config.h>
-#include <xen/sched.h>
-#include <asm/desc.h>
-
-struct domain idle0_domain = {
- domain_id: IDLE_DOMAIN_ID,
- domain_flags:DOMF_idle_domain,
- refcnt: ATOMIC_INIT(1)
-};
-
-struct vcpu idle0_vcpu = {
- processor: 0,
- domain: &idle0_domain
-};
-
-struct tss_struct init_tss[NR_CPUS];
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
index 7dd6bd590a..841bd10a03 100644
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -1807,3 +1807,47 @@ int ioapic_guest_write(int apicid, int address, u32 val)
return 0;
}
+
+void dump_ioapic_irq_info(void)
+{
+ struct irq_pin_list *entry;
+ struct IO_APIC_route_entry rte;
+ unsigned int irq, pin, printed = 0;
+ unsigned long flags;
+
+ for ( irq = 0; irq < NR_IRQS; irq++ )
+ {
+ entry = &irq_2_pin[irq];
+ if ( entry->pin == -1 )
+ continue;
+
+ if ( !printed++ )
+ printk("IO-APIC interrupt information:\n");
+
+ printk(" IRQ%3d Vec%3d:\n", irq, irq_to_vector(irq));
+
+ for ( ; ; )
+ {
+ pin = entry->pin;
+
+ printk(" Apic 0x%02x, Pin %2d: ", entry->apic, pin);
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ *(((int *)&rte) + 0) = io_apic_read(entry->apic, 0x10 + 2 * pin);
+ *(((int *)&rte) + 1) = io_apic_read(entry->apic, 0x11 + 2 * pin);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ printk("vector=%u, delivery_mode=%u, dest_mode=%s, "
+ "delivery_status=%d, polarity=%d, irr=%d, "
+ "trigger=%s, mask=%d\n",
+ rte.vector, rte.delivery_mode,
+ rte.dest_mode ? "logical" : "physical",
+ rte.delivery_status, rte.polarity, rte.irr,
+ rte.trigger ? "level" : "edge", rte.mask);
+
+ if ( entry->next == 0 )
+ break;
+ entry = &irq_2_pin[entry->next];
+ }
+ }
+}
diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c
index a1aee360c3..d81d8749a6 100644
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -12,6 +12,7 @@
#include <xen/irq.h>
#include <xen/perfc.h>
#include <xen/sched.h>
+#include <xen/keyhandler.h>
#include <asm/current.h>
#include <asm/smpboot.h>
@@ -198,15 +199,21 @@ int pirq_guest_unmask(struct domain *d)
int pirq_guest_bind(struct vcpu *v, int irq, int will_share)
{
- unsigned int vector = irq_to_vector(irq);
- irq_desc_t *desc = &irq_desc[vector];
+ unsigned int vector;
+ irq_desc_t *desc;
irq_guest_action_t *action;
unsigned long flags;
int rc = 0;
cpumask_t cpumask = CPU_MASK_NONE;
+ if ( (irq < 0) || (irq >= NR_IRQS) )
+ return -EINVAL;
+
+ vector = irq_to_vector(irq);
if ( vector == 0 )
- return -EBUSY;
+ return -EINVAL;
+
+ desc = &irq_desc[vector];
spin_lock_irqsave(&desc->lock, flags);
@@ -305,3 +312,71 @@ int pirq_guest_unbind(struct domain *d, int irq)
spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
+
+extern void dump_ioapic_irq_info(void);
+
+static void dump_irqs(unsigned char key)
+{
+ int i, irq, vector;
+ irq_desc_t *desc;
+ irq_guest_action_t *action;
+ struct domain *d;
+ unsigned long flags;
+
+ printk("Guest interrupt information:\n");
+
+ for ( irq = 0; irq < NR_IRQS; irq++ )
+ {
+ vector = irq_to_vector(irq);
+ if ( vector == 0 )
+ continue;
+
+ desc = &irq_desc[vector];
+
+ spin_lock_irqsave(&desc->lock, flags);
+
+ if ( desc->status & IRQ_GUEST )
+ {
+ action = (irq_guest_action_t *)desc->action;
+
+ printk(" IRQ%3d Vec%3d: type=%-15s status=%08x "
+ "in-flight=%d domain-list=",
+ irq, vector, desc->handler->typename,
+ desc->status, action->in_flight);
+
+ for ( i = 0; i < action->nr_guests; i++ )
+ {
+ d = action->guest[i];
+ printk("%u(%c%c%c%c)",
+ d->domain_id,
+ (test_bit(d->pirq_to_evtchn[irq],
+ &d->shared_info->evtchn_pending[0]) ?
+ 'P' : '-'),
+ (test_bit(d->pirq_to_evtchn[irq]/BITS_PER_LONG,
+ &d->shared_info->vcpu_info[0].
+ evtchn_pending_sel) ?
+ 'S' : '-'),
+ (test_bit(d->pirq_to_evtchn[irq],
+ &d->shared_info->evtchn_mask[0]) ?
+ 'M' : '-'),
+ (test_bit(irq, &d->pirq_mask) ?
+ 'M' : '-'));
+ if ( i != action->nr_guests )
+ printk(",");
+ }
+
+ printk("\n");
+ }
+
+ spin_unlock_irqrestore(&desc->lock, flags);
+ }
+
+ dump_ioapic_irq_info();
+}
+
+static int __init setup_dump_irqs(void)
+{
+ register_keyhandler('i', dump_irqs, "dump interrupt bindings");
+ return 0;
+}
+__initcall(setup_dump_irqs);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 683c4b7534..79da37d3ea 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -297,7 +297,6 @@ int map_ldt_shadow_page(unsigned int off)
#if defined(__x86_64__)
/* If in user mode, switch to kernel mode just to read LDT mapping. */
- extern void toggle_guest_mode(struct vcpu *);
int user_mode = !(v->arch.flags & TF_kernel_mode);
#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
#elif defined(__i386__)
@@ -841,10 +840,11 @@ static int alloc_l2_table(struct pfn_info *page, unsigned long type)
L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
l2e_from_pfn(pfn, __PAGE_HYPERVISOR);
- pl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
- l2e_from_page(
- virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt),
- __PAGE_HYPERVISOR);
+ for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
+ pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+ l2e_from_page(
+ virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
+ __PAGE_HYPERVISOR);
#endif
unmap_domain_page(pl2e);
@@ -1457,7 +1457,8 @@ int get_page_type(struct pfn_info *page, unsigned long type)
* was GDT/LDT) but those circumstances should be
* very rare.
*/
- cpumask_t mask = page_get_owner(page)->cpumask;
+ cpumask_t mask =
+ page_get_owner(page)->domain_dirty_cpumask;
tlbflush_filter(mask, page->tlbflush_timestamp);
if ( unlikely(!cpus_empty(mask)) )
@@ -1619,7 +1620,7 @@ static void process_deferred_ops(unsigned int cpu)
if ( shadow_mode_enabled(d) )
shadow_sync_all(d);
if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
- flush_tlb_mask(d->cpumask);
+ flush_tlb_mask(d->domain_dirty_cpumask);
else
local_flush_tlb();
}
@@ -1691,7 +1692,7 @@ static inline cpumask_t vcpumask_to_pcpumask(
struct domain *d, unsigned long vmask)
{
unsigned int vcpu_id;
- cpumask_t pmask;
+ cpumask_t pmask = CPU_MASK_NONE;
struct vcpu *v;
while ( vmask != 0 )
@@ -1700,7 +1701,7 @@ static inline cpumask_t vcpumask_to_pcpumask(
vmask &= ~(1UL << vcpu_id);
if ( (vcpu_id < MAX_VIRT_CPUS) &&
((v = d->vcpu[vcpu_id]) != NULL) )
- cpu_set(v->processor, pmask);
+ cpus_or(pmask, pmask, v->vcpu_dirty_cpumask);
}
return pmask;
@@ -1869,7 +1870,6 @@ int do_mmuext_op(
break;
}
pmask = vcpumask_to_pcpumask(d, vmask);
- cpus_and(pmask, pmask, d->cpumask);
if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
flush_tlb_mask(pmask);
else
@@ -1878,11 +1878,11 @@ int do_mmuext_op(
}
case MMUEXT_TLB_FLUSH_ALL:
- flush_tlb_mask(d->cpumask);
+ flush_tlb_mask(d->domain_dirty_cpumask);
break;
case MMUEXT_INVLPG_ALL:
- flush_tlb_one_mask(d->cpumask, op.arg1.linear_addr);
+ flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr);
break;
case MMUEXT_FLUSH_CACHE:
@@ -2497,7 +2497,7 @@ int do_update_va_mapping(unsigned long va, u64 val64,
l1_pgentry_t val = l1e_from_intpte(val64);
struct vcpu *v = current;
struct domain *d = v->domain;
- unsigned int cpu = v->processor;
+ unsigned int cpu = smp_processor_id();
unsigned long vmask, bmap_ptr;
cpumask_t pmask;
int rc = 0;
@@ -2548,13 +2548,12 @@ int do_update_va_mapping(unsigned long va, u64 val64,
local_flush_tlb();
break;
case UVMF_ALL:
- flush_tlb_mask(d->cpumask);
+ flush_tlb_mask(d->domain_dirty_cpumask);
break;
default:
if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
rc = -EFAULT;
pmask = vcpumask_to_pcpumask(d, vmask);
- cpus_and(pmask, pmask, d->cpumask);
flush_tlb_mask(pmask);
break;
}
@@ -2569,13 +2568,12 @@ int do_update_va_mapping(unsigned long va, u64 val64,
local_flush_tlb_one(va);
break;
case UVMF_ALL:
- flush_tlb_one_mask(d->cpumask, va);
+ flush_tlb_one_mask(d->domain_dirty_cpumask, va);
break;
default:
if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) )
rc = -EFAULT;
pmask = vcpumask_to_pcpumask(d, vmask);
- cpus_and(pmask, pmask, d->cpumask);
flush_tlb_one_mask(pmask, va);
break;
}
@@ -2972,7 +2970,6 @@ void ptwr_flush(struct domain *d, const int which)
#ifdef CONFIG_X86_64
struct vcpu *v = current;
- extern void toggle_guest_mode(struct vcpu *);
int user_mode = !(v->arch.flags & TF_kernel_mode);
#endif
@@ -3002,7 +2999,7 @@ void ptwr_flush(struct domain *d, const int which)
BUG();
}
PTWR_PRINTK("[%c] disconnected_l1va at %p is %"PRIpte"\n",
- PTWR_PRINT_WHICH, ptep, pte.l1);
+ PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
l1e_remove_flags(pte, _PAGE_RW);
/* Write-protect the p.t. page in the guest page table. */
@@ -3018,20 +3015,33 @@ void ptwr_flush(struct domain *d, const int which)
/* Ensure that there are no stale writable mappings in any TLB. */
/* NB. INVLPG is a serialising instruction: flushes pending updates. */
- flush_tlb_one_mask(d->cpumask, l1va);
+ flush_tlb_one_mask(d->domain_dirty_cpumask, l1va);
PTWR_PRINTK("[%c] disconnected_l1va at %p now %"PRIpte"\n",
- PTWR_PRINT_WHICH, ptep, pte.l1);
+ PTWR_PRINT_WHICH, ptep, l1e_get_intpte(pte));
/*
* STEP 2. Validate any modified PTEs.
*/
- pl1e = d->arch.ptwr[which].pl1e;
- modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
- unmap_domain_page(pl1e);
- perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
- ptwr_eip_stat_update(d->arch.ptwr[which].eip, d->domain_id, modified);
- d->arch.ptwr[which].prev_nr_updates = modified;
+ if ( likely(d == current->domain) )
+ {
+ pl1e = map_domain_page(l1e_get_pfn(pte));
+ modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
+ unmap_domain_page(pl1e);
+ perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
+ ptwr_eip_stat_update(d->arch.ptwr[which].eip, d->domain_id, modified);
+ d->arch.ptwr[which].prev_nr_updates = modified;
+ }
+ else
+ {
+ /*
+ * Must make a temporary global mapping, since we are running in the
+ * wrong address space, so no access to our own mapcache.
+ */
+ pl1e = map_domain_page_global(l1e_get_pfn(pte));
+ modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
+ unmap_domain_page_global(pl1e);
+ }
/*
* STEP 3. Reattach the L1 p.t. page into the current address space.
@@ -3209,7 +3219,7 @@ int ptwr_do_page_fault(struct domain *d, unsigned long addr,
{
unsigned long pfn;
struct pfn_info *page;
- l1_pgentry_t pte;
+ l1_pgentry_t *pl1e, pte;
l2_pgentry_t *pl2e, l2e;
int which, flags;
unsigned long l2_idx;
@@ -3342,15 +3352,14 @@ int ptwr_do_page_fault(struct domain *d, unsigned long addr,
if ( which == PTWR_PT_ACTIVE )
{
l2e_remove_flags(*pl2e, _PAGE_PRESENT);
- flush_tlb_mask(d->cpumask);
+ flush_tlb_mask(d->domain_dirty_cpumask);
}
/* Temporarily map the L1 page, and make a copy of it. */
- d->arch.ptwr[which].pl1e = map_domain_page(pfn);
- memcpy(d->arch.ptwr[which].page,
- d->arch.ptwr[which].pl1e,
- L1_PAGETABLE_ENTRIES * sizeof(l1_pgentry_t));
-
+ pl1e = map_domain_page(pfn);
+ memcpy(d->arch.ptwr[which].page, pl1e, PAGE_SIZE);
+ unmap_domain_page(pl1e);
+
/* Finally, make the p.t. page writable by the guest OS. */
l1e_add_flags(pte, _PAGE_RW);
if ( unlikely(__put_user(pte.l1,
@@ -3359,7 +3368,6 @@ int ptwr_do_page_fault(struct domain *d, unsigned long addr,
MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *)
&linear_pg_table[l1_linear_offset(addr)]);
/* Toss the writable pagetable state and crash. */
- unmap_domain_page(d->arch.ptwr[which].pl1e);
d->arch.ptwr[which].l1va = 0;
domain_crash(d);
return 0;
@@ -3369,7 +3377,7 @@ int ptwr_do_page_fault(struct domain *d, unsigned long addr,
emulate:
if ( x86_emulate_memop(guest_cpu_user_regs(), addr,
- &ptwr_mem_emulator, BITS_PER_LONG/8) )
+ &ptwr_mem_emulator, X86EMUL_MODE_HOST) )
return 0;
perfc_incrc(ptwr_emulations);
return EXCRET_fault_fixed;
diff --git a/xen/arch/x86/nmi.c b/xen/arch/x86/nmi.c
index b63036ac54..96c55572cd 100644
--- a/xen/arch/x86/nmi.c
+++ b/xen/arch/x86/nmi.c
@@ -23,18 +23,20 @@
#include <xen/sched.h>
#include <xen/console.h>
#include <xen/smp.h>
+#include <xen/keyhandler.h>
#include <asm/current.h>
#include <asm/mc146818rtc.h>
#include <asm/msr.h>
#include <asm/mpspec.h>
#include <asm/debugger.h>
#include <asm/div64.h>
+#include <asm/apic.h>
unsigned int nmi_watchdog = NMI_NONE;
static unsigned int nmi_hz = HZ;
static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
static unsigned int nmi_p4_cccr_val;
-static struct ac_timer nmi_timer[NR_CPUS];
+static struct timer nmi_timer[NR_CPUS];
static unsigned int nmi_timer_ticks[NR_CPUS];
/*
@@ -132,7 +134,7 @@ static void nmi_timer_fn(void *unused)
{
int cpu = smp_processor_id();
nmi_timer_ticks[cpu]++;
- set_ac_timer(&nmi_timer[cpu], NOW() + MILLISECS(1000));
+ set_timer(&nmi_timer[cpu], NOW() + MILLISECS(1000));
}
static void disable_lapic_nmi_watchdog(void)
@@ -308,8 +310,6 @@ static int __pminit setup_p4_watchdog(void)
void __pminit setup_apic_nmi_watchdog(void)
{
- int cpu = smp_processor_id();
-
if (!nmi_watchdog)
return;
@@ -344,49 +344,37 @@ void __pminit setup_apic_nmi_watchdog(void)
lapic_nmi_owner = LAPIC_NMI_WATCHDOG;
nmi_active = 1;
-
- init_ac_timer(&nmi_timer[cpu], nmi_timer_fn, NULL, cpu);
}
static unsigned int
last_irq_sums [NR_CPUS],
alert_counter [NR_CPUS];
-static spinlock_t watchdog_lock = SPIN_LOCK_UNLOCKED;
-static unsigned int watchdog_disable_count = 1;
-static unsigned int watchdog_on;
+static atomic_t watchdog_disable_count = ATOMIC_INIT(1);
void watchdog_disable(void)
{
- unsigned long flags;
-
- spin_lock_irqsave(&watchdog_lock, flags);
-
- if ( watchdog_disable_count++ == 0 )
- watchdog_on = 0;
-
- spin_unlock_irqrestore(&watchdog_lock, flags);
+ atomic_inc(&watchdog_disable_count);
}
void watchdog_enable(void)
{
- unsigned int cpu;
- unsigned long flags;
+ static unsigned long heartbeat_initialised;
+ unsigned int cpu;
- spin_lock_irqsave(&watchdog_lock, flags);
+ if ( !atomic_dec_and_test(&watchdog_disable_count) ||
+ test_and_set_bit(0, &heartbeat_initialised) )
+ return;
- if ( --watchdog_disable_count == 0 )
+ /*
+ * Activate periodic heartbeats. We cannot do this earlier during
+ * setup because the timer infrastructure is not available.
+ */
+ for_each_online_cpu ( cpu )
{
- watchdog_on = 1;
- /*
- * Ensure periodic heartbeats are active. We cannot do this earlier
- * during setup because the timer infrastructure is not available.
- */
- for_each_online_cpu ( cpu )
- set_ac_timer(&nmi_timer[cpu], NOW());
+ init_timer(&nmi_timer[cpu], nmi_timer_fn, NULL, cpu);
+ set_timer(&nmi_timer[cpu], NOW());
}
-
- spin_unlock_irqrestore(&watchdog_lock, flags);
}
void nmi_watchdog_tick(struct cpu_user_regs * regs)
@@ -395,7 +383,7 @@ void nmi_watchdog_tick(struct cpu_user_regs * regs)
sum = nmi_timer_ticks[cpu];
- if ( (last_irq_sums[cpu] == sum) && watchdog_on )
+ if ( (last_irq_sums[cpu] == sum) && !atomic_read(&watchdog_disable_count) )
{
/*
* Ayiee, looks like this CPU is stuck ... wait a few IRQs (5 seconds)
@@ -440,3 +428,29 @@ void nmi_watchdog_tick(struct cpu_user_regs * regs)
write_watchdog_counter(NULL);
}
}
+
+/*
+ * For some reason the destination shorthand for self is not valid
+ * when used with the NMI delivery mode. This is documented in Tables
+ * 8-3 and 8-4 in IA32 Reference Manual Volume 3. We send the IPI to
+ * our own APIC ID explicitly which is valid.
+ */
+static void do_nmi_trigger(unsigned char key)
+{
+ u32 id = apic_read(APIC_ID);
+
+ printk("Triggering NMI on APIC ID %x\n", id);
+
+ local_irq_disable();
+ apic_wait_icr_idle();
+ apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(id));
+ apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_INT_ASSERT);
+ local_irq_enable();
+}
+
+static __init int register_nmi_trigger(void)
+{
+ register_keyhandler('n', do_nmi_trigger, "trigger an NMI");
+ return 0;
+}
+__initcall(register_nmi_trigger);
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index f27806f8f6..39bf4a523d 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -81,6 +81,10 @@ extern void early_time_init(void);
extern void initialize_keytable(void);
extern void early_cpu_init(void);
+struct tss_struct init_tss[NR_CPUS];
+
+struct vcpu *idle_vcpu[NR_CPUS];
+
extern unsigned long cpu0_stack[];
struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
@@ -92,8 +96,6 @@ unsigned long mmu_cr4_features = X86_CR4_PSE;
#endif
EXPORT_SYMBOL(mmu_cr4_features);
-struct vcpu *idle_task[NR_CPUS] = { &idle0_vcpu };
-
int acpi_disabled;
int acpi_force;
@@ -144,8 +146,8 @@ static struct e820entry e820_raw[E820MAX];
void __init __start_xen(multiboot_info_t *mbi)
{
- unsigned long vgdt, gdt_pfn;
char *cmdline;
+ struct domain *idle_domain;
unsigned long _initrd_start = 0, _initrd_len = 0;
unsigned int initrdidx = 1;
module_t *mod = (module_t *)__va(mbi->mods_addr);
@@ -163,9 +165,8 @@ void __init __start_xen(multiboot_info_t *mbi)
if ( (mbi->flags & MBI_CMDLINE) && (mbi->cmdline != 0) )
cmdline_parse(__va(mbi->cmdline));
- /* Must do this early -- e.g., spinlocks rely on get_current(). */
- set_current(&idle0_vcpu);
- set_processor_id(0);
+ set_current((struct vcpu *)0xfffff000); /* debug sanity */
+ set_processor_id(0); /* needed early, for smp_processor_id() */
smp_prepare_boot_cpu();
@@ -343,6 +344,12 @@ void __init __start_xen(multiboot_info_t *mbi)
BUG_ON(sizeof(shared_info_t) > PAGE_SIZE);
BUG_ON(sizeof(vcpu_info_t) != 64);
+ /* __foo are defined in public headers. Check they match internal defs. */
+ BUG_ON(__HYPERVISOR_VIRT_START != HYPERVISOR_VIRT_START);
+#ifdef HYPERVISOR_VIRT_END
+ BUG_ON(__HYPERVISOR_VIRT_END != HYPERVISOR_VIRT_END);
+#endif
+
init_frametable();
end_boot_allocator();
@@ -376,6 +383,14 @@ void __init __start_xen(multiboot_info_t *mbi)
early_cpu_init();
+ scheduler_init();
+
+ idle_domain = do_createdomain(IDLE_DOMAIN_ID, 0);
+ BUG_ON(idle_domain == NULL);
+
+ set_current(idle_domain->vcpu[0]);
+ idle_vcpu[0] = current;
+
paging_init();
/* Unmap the first page of CPU0's stack. */
@@ -388,21 +403,6 @@ void __init __start_xen(multiboot_info_t *mbi)
sort_exception_tables();
- if ( arch_do_createdomain(current) != 0 )
- BUG();
-
- /*
- * Map default GDT into its final positions in the idle page table. As
- * noted in arch_do_createdomain(), we must map for every possible VCPU#.
- */
- vgdt = GDT_VIRT_START(current) + FIRST_RESERVED_GDT_BYTE;
- gdt_pfn = virt_to_phys(gdt_table) >> PAGE_SHIFT;
- for ( i = 0; i < MAX_VIRT_CPUS; i++ )
- {
- map_pages_to_xen(vgdt, gdt_pfn, 1, PAGE_HYPERVISOR);
- vgdt += 1 << PDPT_VCPU_VA_SHIFT;
- }
-
find_smp_config();
smp_alloc_memory();
@@ -423,14 +423,12 @@ void __init __start_xen(multiboot_info_t *mbi)
trap_init();
- ac_timer_init();
+ timer_init();
early_time_init();
arch_init_memory();
- scheduler_init();
-
identify_cpu(&boot_cpu_data);
if ( cpu_has_fxsr )
set_in_cr4(X86_CR4_OSFXSR);
@@ -480,7 +478,8 @@ void __init __start_xen(multiboot_info_t *mbi)
schedulers_start();
- watchdog_enable();
+ if ( opt_watchdog )
+ watchdog_enable();
shadow_mode_init();
diff --git a/xen/arch/x86/shadow.c b/xen/arch/x86/shadow.c
index 41b76842fd..b2fd143452 100644
--- a/xen/arch/x86/shadow.c
+++ b/xen/arch/x86/shadow.c
@@ -469,6 +469,7 @@ static unsigned long shadow_l2_table(
{
unsigned long smfn;
l2_pgentry_t *spl2e;
+ int i;
SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
@@ -503,9 +504,11 @@ static unsigned long shadow_l2_table(
spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
- spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
- l2e_from_paddr(__pa(page_get_owner(pfn_to_page(gmfn))->arch.mm_perdomain_pt),
- __PAGE_HYPERVISOR);
+ for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
+ spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+ l2e_from_page(virt_to_page(page_get_owner(pfn_to_page(gmfn))->
+ arch.mm_perdomain_pt) + i,
+ __PAGE_HYPERVISOR);
if ( shadow_mode_translate(d) ) // NB: not external
{
@@ -1800,7 +1803,7 @@ static void sync_all(struct domain *d)
}
/* Other VCPUs mustn't use the revoked writable mappings. */
- other_vcpus_mask = d->cpumask;
+ other_vcpus_mask = d->domain_dirty_cpumask;
cpu_clear(smp_processor_id(), other_vcpus_mask);
flush_tlb_mask(other_vcpus_mask);
@@ -2150,8 +2153,8 @@ static void shadow_update_pagetables(struct vcpu *v)
if ( max_mode & (SHM_enable | SHM_external) )
{
if ( likely(v->arch.guest_vtable != NULL) )
- unmap_domain_page(v->arch.guest_vtable);
- v->arch.guest_vtable = map_domain_page(gmfn);
+ unmap_domain_page_global(v->arch.guest_vtable);
+ v->arch.guest_vtable = map_domain_page_global(gmfn);
}
/*
@@ -2187,8 +2190,8 @@ static void shadow_update_pagetables(struct vcpu *v)
)
{
if ( v->arch.shadow_vtable )
- unmap_domain_page(v->arch.shadow_vtable);
- v->arch.shadow_vtable = map_domain_page(smfn);
+ unmap_domain_page_global(v->arch.shadow_vtable);
+ v->arch.shadow_vtable = map_domain_page_global(smfn);
}
#if CONFIG_PAGING_LEVELS == 2
@@ -2204,8 +2207,8 @@ static void shadow_update_pagetables(struct vcpu *v)
if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
if ( v->arch.hl2_vtable )
- unmap_domain_page(v->arch.hl2_vtable);
- v->arch.hl2_vtable = map_domain_page(hl2mfn);
+ unmap_domain_page_global(v->arch.hl2_vtable);
+ v->arch.hl2_vtable = map_domain_page_global(hl2mfn);
}
/*
diff --git a/xen/arch/x86/shadow32.c b/xen/arch/x86/shadow32.c
index 872c73f545..eb09ea92c5 100644
--- a/xen/arch/x86/shadow32.c
+++ b/xen/arch/x86/shadow32.c
@@ -726,6 +726,7 @@ static void alloc_monitor_pagetable(struct vcpu *v)
l2_pgentry_t *mpl2e;
struct pfn_info *mmfn_info;
struct domain *d = v->domain;
+ int i;
ASSERT(pagetable_get_paddr(v->arch.monitor_table) == 0);
@@ -733,16 +734,17 @@ static void alloc_monitor_pagetable(struct vcpu *v)
ASSERT(mmfn_info != NULL);
mmfn = page_to_pfn(mmfn_info);
- mpl2e = (l2_pgentry_t *)map_domain_page(mmfn);
+ mpl2e = (l2_pgentry_t *)map_domain_page_global(mmfn);
memset(mpl2e, 0, PAGE_SIZE);
memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
&idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
- mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
- l2e_from_paddr(__pa(d->arch.mm_perdomain_pt),
- __PAGE_HYPERVISOR);
+ for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
+ mpl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+ l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
+ __PAGE_HYPERVISOR);
// map the phys_to_machine map into the Read-Only MPT space for this domain
mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
@@ -794,7 +796,7 @@ void free_monitor_pagetable(struct vcpu *v)
* Then free monitor_table.
*/
mfn = pagetable_get_pfn(v->arch.monitor_table);
- unmap_domain_page(v->arch.monitor_vtable);
+ unmap_domain_page_global(v->arch.monitor_vtable);
free_domheap_page(pfn_to_page(mfn));
v->arch.monitor_table = mk_pagetable(0);
@@ -929,7 +931,7 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode)
if ( v->arch.guest_vtable &&
(v->arch.guest_vtable != __linear_l2_table) )
{
- unmap_domain_page(v->arch.guest_vtable);
+ unmap_domain_page_global(v->arch.guest_vtable);
}
if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
v->arch.guest_vtable = __linear_l2_table;
@@ -942,7 +944,7 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode)
if ( v->arch.shadow_vtable &&
(v->arch.shadow_vtable != __shadow_linear_l2_table) )
{
- unmap_domain_page(v->arch.shadow_vtable);
+ unmap_domain_page_global(v->arch.shadow_vtable);
}
if ( !(mode & SHM_external) )
v->arch.shadow_vtable = __shadow_linear_l2_table;
@@ -955,7 +957,7 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode)
if ( v->arch.hl2_vtable &&
(v->arch.hl2_vtable != __linear_hl2_table) )
{
- unmap_domain_page(v->arch.hl2_vtable);
+ unmap_domain_page_global(v->arch.hl2_vtable);
}
if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
v->arch.hl2_vtable = __linear_hl2_table;
@@ -1508,6 +1510,7 @@ static unsigned long shadow_l2_table(
{
unsigned long smfn;
l2_pgentry_t *spl2e;
+ int i;
SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
@@ -1542,9 +1545,11 @@ static unsigned long shadow_l2_table(
spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
- spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
- l2e_from_paddr(__pa(page_get_owner(pfn_to_page(gmfn))->arch.mm_perdomain_pt),
- __PAGE_HYPERVISOR);
+ for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
+ spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+ l2e_from_page(virt_to_page(page_get_owner(pfn_to_page(gmfn))->
+ arch.mm_perdomain_pt) + i,
+ __PAGE_HYPERVISOR);
if ( shadow_mode_translate(d) ) // NB: not external
{
@@ -2586,7 +2591,7 @@ void __shadow_sync_all(struct domain *d)
}
/* Other VCPUs mustn't use the revoked writable mappings. */
- other_vcpus_mask = d->cpumask;
+ other_vcpus_mask = d->domain_dirty_cpumask;
cpu_clear(smp_processor_id(), other_vcpus_mask);
flush_tlb_mask(other_vcpus_mask);
@@ -2906,8 +2911,8 @@ void __update_pagetables(struct vcpu *v)
if ( max_mode & (SHM_enable | SHM_external) )
{
if ( likely(v->arch.guest_vtable != NULL) )
- unmap_domain_page(v->arch.guest_vtable);
- v->arch.guest_vtable = map_domain_page(gmfn);
+ unmap_domain_page_global(v->arch.guest_vtable);
+ v->arch.guest_vtable = map_domain_page_global(gmfn);
}
/*
@@ -2932,8 +2937,8 @@ void __update_pagetables(struct vcpu *v)
if ( max_mode == SHM_external )
{
if ( v->arch.shadow_vtable )
- unmap_domain_page(v->arch.shadow_vtable);
- v->arch.shadow_vtable = map_domain_page(smfn);
+ unmap_domain_page_global(v->arch.shadow_vtable);
+ v->arch.shadow_vtable = map_domain_page_global(smfn);
}
/*
@@ -2948,8 +2953,8 @@ void __update_pagetables(struct vcpu *v)
if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
if ( v->arch.hl2_vtable )
- unmap_domain_page(v->arch.hl2_vtable);
- v->arch.hl2_vtable = map_domain_page(hl2mfn);
+ unmap_domain_page_global(v->arch.hl2_vtable);
+ v->arch.hl2_vtable = map_domain_page_global(hl2mfn);
}
/*
diff --git a/xen/arch/x86/shadow_public.c b/xen/arch/x86/shadow_public.c
index 931a31f83f..bb376bb737 100644
--- a/xen/arch/x86/shadow_public.c
+++ b/xen/arch/x86/shadow_public.c
@@ -151,6 +151,8 @@ free_shadow_fl1_table(struct domain *d, unsigned long smfn)
for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
put_page_from_l1e(pl1e[i], d);
+
+ unmap_domain_page(pl1e);
}
/*
@@ -254,6 +256,7 @@ static pagetable_t page_table_convert(struct domain *d)
pae_l3 = map_domain_page(pagetable_get_pfn(d->arch.phys_table));
for (i = 0; i < PDP_ENTRIES; i++)
l3[i] = l3e_from_pfn(l3e_get_pfn(pae_l3[i]), __PAGE_HYPERVISOR);
+ unmap_domain_page(pae_l3);
unmap_domain_page(l4);
unmap_domain_page(l3);
@@ -275,7 +278,7 @@ static void alloc_monitor_pagetable(struct vcpu *v)
ASSERT( mmfn_info );
mmfn = page_to_pfn(mmfn_info);
- mpl4e = (l4_pgentry_t *) map_domain_page(mmfn);
+ mpl4e = (l4_pgentry_t *) map_domain_page_global(mmfn);
memcpy(mpl4e, &idle_pg_table[0], PAGE_SIZE);
mpl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
@@ -298,7 +301,7 @@ void free_monitor_pagetable(struct vcpu *v)
* free monitor_table.
*/
mfn = pagetable_get_pfn(v->arch.monitor_table);
- unmap_domain_page(v->arch.monitor_vtable);
+ unmap_domain_page_global(v->arch.monitor_vtable);
free_domheap_page(pfn_to_page(mfn));
v->arch.monitor_table = mk_pagetable(0);
@@ -325,6 +328,7 @@ static void alloc_monitor_pagetable(struct vcpu *v)
l2_pgentry_t *mpl2e;
struct pfn_info *mmfn_info;
struct domain *d = v->domain;
+ int i;
ASSERT(pagetable_get_paddr(v->arch.monitor_table) == 0);
@@ -332,16 +336,17 @@ static void alloc_monitor_pagetable(struct vcpu *v)
ASSERT(mmfn_info != NULL);
mmfn = page_to_pfn(mmfn_info);
- mpl2e = (l2_pgentry_t *)map_domain_page(mmfn);
+ mpl2e = (l2_pgentry_t *)map_domain_page_global(mmfn);
memset(mpl2e, 0, PAGE_SIZE);
memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
&idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
- mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
- l2e_from_paddr(__pa(d->arch.mm_perdomain_pt),
- __PAGE_HYPERVISOR);
+ for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
+ mpl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+ l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
+ __PAGE_HYPERVISOR);
// map the phys_to_machine map into the Read-Only MPT space for this domain
mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
@@ -393,7 +398,7 @@ void free_monitor_pagetable(struct vcpu *v)
* Then free monitor_table.
*/
mfn = pagetable_get_pfn(v->arch.monitor_table);
- unmap_domain_page(v->arch.monitor_vtable);
+ unmap_domain_page_global(v->arch.monitor_vtable);
free_domheap_page(pfn_to_page(mfn));
v->arch.monitor_table = mk_pagetable(0);
@@ -977,7 +982,7 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode)
if ( v->arch.guest_vtable &&
(v->arch.guest_vtable != __linear_l2_table) )
{
- unmap_domain_page(v->arch.guest_vtable);
+ unmap_domain_page_global(v->arch.guest_vtable);
}
if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
v->arch.guest_vtable = __linear_l2_table;
@@ -990,7 +995,7 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode)
if ( v->arch.shadow_vtable &&
(v->arch.shadow_vtable != __shadow_linear_l2_table) )
{
- unmap_domain_page(v->arch.shadow_vtable);
+ unmap_domain_page_global(v->arch.shadow_vtable);
}
if ( !(mode & SHM_external) && d->arch.ops->guest_paging_levels == 2)
v->arch.shadow_vtable = __shadow_linear_l2_table;
@@ -1004,7 +1009,7 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode)
if ( v->arch.hl2_vtable &&
(v->arch.hl2_vtable != __linear_hl2_table) )
{
- unmap_domain_page(v->arch.hl2_vtable);
+ unmap_domain_page_global(v->arch.hl2_vtable);
}
if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
v->arch.hl2_vtable = __linear_hl2_table;
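Why a single PERDOMAIN_VIRT_START slot is no longer enough: the per-domain region now holds more than the GDT/LDT pages, so PDPT_L2_ENTRIES consecutive L2 slots are installed, each backed by one page of mm_perdomain_pt. A rough layout sketch inferred from the constants used elsewhere in this patch (exact sizes are assumptions):

    /*
     * PERDOMAIN_VIRT_START
     *   +- per-vcpu GDT/LDT area (GDT_LDT_MBYTES); faults here are handled
     *   |  by handle_gdt_ldt_mapping_fault() in traps.c
     *   +- per-domain mapcache (MAPCACHE_VIRT_START .. MAPCACHE_VIRT_END),
     *      backed by the l1tab slice set up in mapcache_init()
     *
     * mm_perdomain_pt supplies the L1 tables for the whole region, hence the
     * PDPT_L2_ENTRIES-entry loops above.
     */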
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 30ca4864b2..b3cc714bcd 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -435,7 +435,7 @@ void __init start_secondary(void *unused)
extern void percpu_traps_init(void);
- set_current(idle_task[cpu]);
+ set_current(idle_vcpu[cpu]);
set_processor_id(cpu);
percpu_traps_init();
@@ -761,7 +761,6 @@ static int __init do_boot_cpu(int apicid)
* Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu.
*/
{
- struct domain *idle;
struct vcpu *v;
unsigned long boot_error;
int timeout, cpu;
@@ -770,14 +769,10 @@ static int __init do_boot_cpu(int apicid)
cpu = ++cpucount;
- if ( (idle = do_createdomain(IDLE_DOMAIN_ID, cpu)) == NULL )
- panic("failed 'createdomain' for CPU %d", cpu);
+ v = idle_vcpu[cpu] = alloc_vcpu(idle_vcpu[0]->domain, cpu, cpu);
+ BUG_ON(v == NULL);
- v = idle_task[cpu] = idle->vcpu[0];
-
- set_bit(_DOMF_idle_domain, &idle->domain_flags);
-
- v->arch.monitor_table = mk_pagetable(__pa(idle_pg_table));
+ v->arch.monitor_table = mk_pagetable(__pa(idle_pg_table));
/* start_eip had better be page-aligned! */
start_eip = setup_trampoline();
diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c
index 7e7c40fca1..1bd15c6702 100644
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -17,7 +17,7 @@
#include <xen/config.h>
#include <xen/init.h>
#include <xen/time.h>
-#include <xen/ac_timer.h>
+#include <xen/timer.h>
#include <xen/smp.h>
#include <xen/irq.h>
#include <xen/softirq.h>
@@ -56,7 +56,7 @@ struct cpu_time {
s_time_t stime_local_stamp;
s_time_t stime_master_stamp;
struct time_scale tsc_scale;
- struct ac_timer calibration_timer;
+ struct timer calibration_timer;
} __cacheline_aligned;
static struct cpu_time cpu_time[NR_CPUS];
@@ -163,7 +163,7 @@ void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs)
/* Rough hack to allow accurate timers to sort-of-work with no APIC. */
if ( !cpu_has_apic )
- raise_softirq(AC_TIMER_SOFTIRQ);
+ raise_softirq(TIMER_SOFTIRQ);
if ( using_pit )
pit_overflow();
@@ -342,7 +342,7 @@ static void init_pit(void)
/* Protected by platform_timer_lock. */
static u64 hpet_counter64, hpet_overflow_period;
static u32 hpet_stamp;
-static struct ac_timer hpet_overflow_timer;
+static struct timer hpet_overflow_timer;
static void hpet_overflow(void *unused)
{
@@ -354,7 +354,7 @@ static void hpet_overflow(void *unused)
hpet_stamp = counter;
spin_unlock_irq(&platform_timer_lock);
- set_ac_timer(&hpet_overflow_timer, NOW() + hpet_overflow_period);
+ set_timer(&hpet_overflow_timer, NOW() + hpet_overflow_period);
}
static u64 read_hpet_count(void)
@@ -430,7 +430,7 @@ static int init_hpet(void)
(void)do_div(hpet_overflow_period, (u32)hpet_rate);
}
- init_ac_timer(&hpet_overflow_timer, hpet_overflow, NULL, 0);
+ init_timer(&hpet_overflow_timer, hpet_overflow, NULL, 0);
hpet_overflow(NULL);
platform_timer_stamp = hpet_counter64;
@@ -459,7 +459,7 @@ int use_cyclone;
/* Protected by platform_timer_lock. */
static u64 cyclone_counter64;
static u32 cyclone_stamp;
-static struct ac_timer cyclone_overflow_timer;
+static struct timer cyclone_overflow_timer;
static volatile u32 *cyclone_timer; /* Cyclone MPMC0 register */
static void cyclone_overflow(void *unused)
@@ -472,7 +472,7 @@ static void cyclone_overflow(void *unused)
cyclone_stamp = counter;
spin_unlock_irq(&platform_timer_lock);
- set_ac_timer(&cyclone_overflow_timer, NOW() + MILLISECS(20000));
+ set_timer(&cyclone_overflow_timer, NOW() + MILLISECS(20000));
}
static u64 read_cyclone_count(void)
@@ -510,7 +510,7 @@ static int init_cyclone(void)
read_platform_count = read_cyclone_count;
- init_ac_timer(&cyclone_overflow_timer, cyclone_overflow, NULL, 0);
+ init_timer(&cyclone_overflow_timer, cyclone_overflow, NULL, 0);
cyclone_overflow(NULL);
platform_timer_stamp = cyclone_counter64;
set_time_scale(&platform_timer_scale, CYCLONE_TIMER_FREQ);
@@ -876,7 +876,7 @@ static void local_time_calibration(void *unused)
cpu_time[cpu].stime_master_stamp = curr_master_stime;
out:
- set_ac_timer(&cpu_time[cpu].calibration_timer, NOW() + EPOCH);
+ set_timer(&cpu_time[cpu].calibration_timer, NOW() + EPOCH);
if ( cpu == 0 )
platform_time_calibration();
@@ -896,9 +896,9 @@ void init_percpu_time(void)
cpu_time[cpu].stime_master_stamp = now;
cpu_time[cpu].stime_local_stamp = now;
- init_ac_timer(&cpu_time[cpu].calibration_timer,
+ init_timer(&cpu_time[cpu].calibration_timer,
local_time_calibration, NULL, cpu);
- set_ac_timer(&cpu_time[cpu].calibration_timer, NOW() + EPOCH);
+ set_timer(&cpu_time[cpu].calibration_timer, NOW() + EPOCH);
}
/* Late init function (after all CPUs are booted). */
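The time.c changes (and the matching ones in the vmx files below) are a mechanical rename of the old ac_timer interface. A sketch of the correspondence as used in this patch; callback_fn, data and cpu are placeholders taken from the shape of the call sites above:

    struct timer t;                          /* was: struct ac_timer        */

    init_timer(&t, callback_fn, data, cpu);  /* was: init_ac_timer()        */
    set_timer(&t, NOW() + MILLISECS(10));    /* was: set_ac_timer()         */
    if ( active_timer(&t) )                  /* was: active_ac_timer()      */
        stop_timer(&t);                      /* was: rem_ac_timer()         */
    kill_timer(&t);                          /* final teardown, as used in
                                                vmx_relinquish_resources()  */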
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index a4be3db3b3..0a7280fb70 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -130,9 +130,19 @@ unsigned long kernel_text_end(void)
static void show_guest_stack(struct cpu_user_regs *regs)
{
int i;
- unsigned long *stack = (unsigned long *)regs->esp, addr;
+ unsigned long *stack, addr;
- printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
+ if ( VM86_MODE(regs) )
+ {
+ stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff));
+ printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ",
+ regs->ss, (uint16_t)(regs->esp & 0xffff));
+ }
+ else
+ {
+ stack = (unsigned long *)regs->esp;
+ printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
+ }
for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
{
@@ -427,7 +437,7 @@ void propagate_page_fault(unsigned long addr, u16 error_code)
tb->flags |= TBF_INTERRUPT;
}
-static int handle_perdomain_mapping_fault(
+static int handle_gdt_ldt_mapping_fault(
unsigned long offset, struct cpu_user_regs *regs)
{
extern int map_ldt_shadow_page(unsigned int);
@@ -437,14 +447,14 @@ static int handle_perdomain_mapping_fault(
int ret;
/* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
- unsigned int is_ldt_area = (offset >> (PDPT_VCPU_VA_SHIFT-1)) & 1;
- unsigned int vcpu_area = (offset >> PDPT_VCPU_VA_SHIFT);
+ unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
+ unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
/* Should never fault in another vcpu's area. */
BUG_ON(vcpu_area != current->vcpu_id);
/* Byte offset within the gdt/ldt sub-area. */
- offset &= (1UL << (PDPT_VCPU_VA_SHIFT-1)) - 1UL;
+ offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
if ( likely(is_ldt_area) )
{
@@ -490,9 +500,9 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
{
if ( shadow_mode_external(d) && GUEST_CONTEXT(v, regs) )
return shadow_fault(addr, regs);
- if ( (addr >= PERDOMAIN_VIRT_START) && (addr < PERDOMAIN_VIRT_END) )
- return handle_perdomain_mapping_fault(
- addr - PERDOMAIN_VIRT_START, regs);
+ if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
+ return handle_gdt_ldt_mapping_fault(
+ addr - GDT_LDT_VIRT_START, regs);
}
else if ( unlikely(shadow_mode_enabled(d)) )
{
@@ -596,7 +606,6 @@ static inline int guest_io_okay(
u16 x;
#if defined(__x86_64__)
/* If in user mode, switch to kernel mode just to read I/O bitmap. */
- extern void toggle_guest_mode(struct vcpu *);
int user_mode = !(v->arch.flags & TF_kernel_mode);
#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
#elif defined(__i386__)
@@ -964,16 +973,26 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
case 0x30: /* WRMSR */
/* Ignore the instruction if unprivileged. */
if ( !IS_PRIV(v->domain) )
- DPRINTK("Non-priv domain attempted WRMSR(%p,%08lx,%08lx).\n",
- _p(regs->ecx), (long)regs->eax, (long)regs->edx);
+ {
+ u32 l, h;
+ if ( (rdmsr_user(regs->ecx, l, h) != 0) ||
+ (regs->ecx != MSR_EFER) ||
+ (regs->eax != l) || (regs->edx != h) )
+ DPRINTK("Non-priv domain attempted WRMSR %p from "
+ "%08x:%08x to %08lx:%08lx.\n",
+ _p(regs->ecx), h, l, (long)regs->edx, (long)regs->eax);
+ }
else if ( wrmsr_user(regs->ecx, regs->eax, regs->edx) )
goto fail;
break;
case 0x32: /* RDMSR */
if ( !IS_PRIV(v->domain) )
- DPRINTK("Non-priv domain attempted RDMSR(%p,%08lx,%08lx).\n",
- _p(regs->ecx), (long)regs->eax, (long)regs->edx);
+ {
+ if ( regs->ecx != MSR_EFER )
+ DPRINTK("Non-priv domain attempted RDMSR %p.\n",
+ _p(regs->ecx));
+ }
/* Everyone can read the MSR space. */
if ( rdmsr_user(regs->ecx, regs->eax, regs->edx) )
goto fail;
@@ -1080,26 +1099,23 @@ asmlinkage int do_general_protection(struct cpu_user_regs *regs)
return 0;
}
+static void nmi_softirq(void)
+{
+ /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
+ evtchn_notify(dom0->vcpu[0]);
+}
-/* Defer dom0 notification to softirq context (unsafe in NMI context). */
-static unsigned long nmi_dom0_softirq_reason;
-#define NMI_DOM0_PARITY_ERR 0
-#define NMI_DOM0_IO_ERR 1
-#define NMI_DOM0_UNKNOWN 2
-
-static void nmi_dom0_softirq(void)
+static void nmi_dom0_report(unsigned int reason_idx)
{
- if ( dom0 == NULL )
- return;
+ struct domain *d;
- if ( test_and_clear_bit(NMI_DOM0_PARITY_ERR, &nmi_dom0_softirq_reason) )
- send_guest_virq(dom0->vcpu[0], VIRQ_PARITY_ERR);
+ if ( (d = dom0) == NULL )
+ return;
- if ( test_and_clear_bit(NMI_DOM0_IO_ERR, &nmi_dom0_softirq_reason) )
- send_guest_virq(dom0->vcpu[0], VIRQ_IO_ERR);
+ set_bit(reason_idx, &d->shared_info->arch.nmi_reason);
- if ( test_and_clear_bit(NMI_DOM0_UNKNOWN, &nmi_dom0_softirq_reason) )
- send_guest_virq(dom0->vcpu[0], VIRQ_NMI);
+ if ( test_and_set_bit(_VCPUF_nmi_pending, &d->vcpu[0]->vcpu_flags) )
+ raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
}
asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
@@ -1107,8 +1123,7 @@ asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
switch ( opt_nmi[0] )
{
case 'd': /* 'dom0' */
- set_bit(NMI_DOM0_PARITY_ERR, &nmi_dom0_softirq_reason);
- raise_softirq(NMI_DOM0_SOFTIRQ);
+ nmi_dom0_report(_XEN_NMIREASON_parity_error);
case 'i': /* 'ignore' */
break;
default: /* 'fatal' */
@@ -1127,8 +1142,7 @@ asmlinkage void io_check_error(struct cpu_user_regs *regs)
switch ( opt_nmi[0] )
{
case 'd': /* 'dom0' */
- set_bit(NMI_DOM0_IO_ERR, &nmi_dom0_softirq_reason);
- raise_softirq(NMI_DOM0_SOFTIRQ);
+ nmi_dom0_report(_XEN_NMIREASON_io_error);
case 'i': /* 'ignore' */
break;
default: /* 'fatal' */
@@ -1147,8 +1161,7 @@ static void unknown_nmi_error(unsigned char reason)
switch ( opt_nmi[0] )
{
case 'd': /* 'dom0' */
- set_bit(NMI_DOM0_UNKNOWN, &nmi_dom0_softirq_reason);
- raise_softirq(NMI_DOM0_SOFTIRQ);
+ nmi_dom0_report(_XEN_NMIREASON_unknown);
case 'i': /* 'ignore' */
break;
default: /* 'fatal' */
@@ -1347,7 +1360,7 @@ void __init trap_init(void)
cpu_init();
- open_softirq(NMI_DOM0_SOFTIRQ, nmi_dom0_softirq);
+ open_softirq(NMI_SOFTIRQ, nmi_softirq);
}
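nmi_dom0_report() now records the cause in shared_info->arch.nmi_reason and only raises NMI_SOFTIRQ to defer the vcpu wakeup, since waking a vcpu directly in NMI context is unsafe. How dom0 consumes those bits is outside this patch; a hedged guest-side sketch using the same _XEN_NMIREASON_* bit indices, where HYPERVISOR_shared_info stands for the guest's mapping of its shared-info page and the report_* handlers are placeholders:

    unsigned long reason = xchg(&HYPERVISOR_shared_info->arch.nmi_reason, 0);

    if ( test_bit(_XEN_NMIREASON_parity_error, &reason) )
        report_parity_error();
    if ( test_bit(_XEN_NMIREASON_io_error, &reason) )
        report_io_check_error();
    if ( test_bit(_XEN_NMIREASON_unknown, &reason) )
        report_unknown_nmi();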
diff --git a/xen/arch/x86/vmx.c b/xen/arch/x86/vmx.c
index 3cb18be4c2..6d6fa51764 100644
--- a/xen/arch/x86/vmx.c
+++ b/xen/arch/x86/vmx.c
@@ -42,7 +42,7 @@
#include <asm/shadow_64.h>
#endif
#include <public/sched.h>
-#include <public/io/ioreq.h>
+#include <public/hvm/ioreq.h>
#include <asm/vmx_vpic.h>
#include <asm/vmx_vlapic.h>
@@ -53,7 +53,7 @@ unsigned int opt_vmx_debug_level = 0;
integer_param("vmx_debug", opt_vmx_debug_level);
static unsigned long trace_values[NR_CPUS][4];
-#define TRACE_VMEXIT(index,value) trace_values[current->processor][index]=value
+#define TRACE_VMEXIT(index,value) trace_values[smp_processor_id()][index]=value
static int vmx_switch_on;
@@ -66,11 +66,6 @@ void vmx_final_setup_guest(struct vcpu *v)
struct domain *d = v->domain;
struct vcpu *vc;
- d->arch.vmx_platform.lapic_enable = v->arch.guest_context.user_regs.ecx;
- v->arch.guest_context.user_regs.ecx = 0;
- VMX_DBG_LOG(DBG_LEVEL_VLAPIC, "lapic enable is %d.\n",
- d->arch.vmx_platform.lapic_enable);
-
/* Initialize monitor page table */
for_each_vcpu(d, vc)
vc->arch.monitor_table = mk_pagetable(0);
@@ -95,7 +90,7 @@ void vmx_final_setup_guest(struct vcpu *v)
void vmx_relinquish_resources(struct vcpu *v)
{
struct vmx_virpit *vpit;
-
+
if ( !VMX_DOMAIN(v) )
return;
@@ -103,19 +98,18 @@ void vmx_relinquish_resources(struct vcpu *v)
/* unmap IO shared page */
struct domain *d = v->domain;
if ( d->arch.vmx_platform.shared_page_va )
- unmap_domain_page((void *)d->arch.vmx_platform.shared_page_va);
+ unmap_domain_page_global(
+ (void *)d->arch.vmx_platform.shared_page_va);
}
destroy_vmcs(&v->arch.arch_vmx);
free_monitor_pagetable(v);
vpit = &v->domain->arch.vmx_platform.vmx_pit;
- if ( active_ac_timer(&(vpit->pit_timer)) )
- rem_ac_timer(&vpit->pit_timer);
- if ( active_ac_timer(&v->arch.arch_vmx.hlt_timer) )
- rem_ac_timer(&v->arch.arch_vmx.hlt_timer);
+ kill_timer(&vpit->pit_timer);
+ kill_timer(&v->arch.arch_vmx.hlt_timer);
if ( vmx_apic_support(v->domain) && (VLAPIC(v) != NULL) )
{
- rem_ac_timer(&VLAPIC(v)->vlapic_timer);
+ kill_timer(&VLAPIC(v)->vlapic_timer);
xfree(VLAPIC(v));
}
}
@@ -1604,7 +1598,7 @@ void vmx_vmexit_do_hlt(void)
next_wakeup = next_pit;
}
if ( next_wakeup != - 1 )
- set_ac_timer(&current->arch.arch_vmx.hlt_timer, next_wakeup);
+ set_timer(&current->arch.arch_vmx.hlt_timer, next_wakeup);
do_block();
}
@@ -1955,9 +1949,12 @@ asmlinkage void load_cr2(void)
asmlinkage void trace_vmentry (void)
{
- TRACE_5D(TRC_VMENTRY,trace_values[current->processor][0],
- trace_values[current->processor][1],trace_values[current->processor][2],
- trace_values[current->processor][3],trace_values[current->processor][4]);
+ TRACE_5D(TRC_VMENTRY,
+ trace_values[smp_processor_id()][0],
+ trace_values[smp_processor_id()][1],
+ trace_values[smp_processor_id()][2],
+ trace_values[smp_processor_id()][3],
+ trace_values[smp_processor_id()][4]);
TRACE_VMEXIT(0,9);
TRACE_VMEXIT(1,9);
TRACE_VMEXIT(2,9);
diff --git a/xen/arch/x86/vmx_intercept.c b/xen/arch/x86/vmx_intercept.c
index 8bac8a8e5c..419960842c 100644
--- a/xen/arch/x86/vmx_intercept.c
+++ b/xen/arch/x86/vmx_intercept.c
@@ -24,7 +24,7 @@
#include <asm/vmx_vpit.h>
#include <asm/vmx_intercept.h>
#include <asm/vmx_vlapic.h>
-#include <public/io/ioreq.h>
+#include <public/hvm/ioreq.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <asm/current.h>
@@ -356,19 +356,19 @@ static void pit_timer_fn(void *data)
vpit->pending_intr_nr++;
if ( test_bit(_VCPUF_running, &v->vcpu_flags) ) {
vpit->scheduled += vpit->period;
- set_ac_timer(&vpit->pit_timer, vpit->scheduled);
+ set_timer(&vpit->pit_timer, vpit->scheduled);
}
}
void pickup_deactive_ticks(struct vmx_virpit *vpit)
{
- if ( !active_ac_timer(&(vpit->pit_timer)) ) {
+ if ( !active_timer(&(vpit->pit_timer)) ) {
/* pick up missed timer tick */
missed_ticks(vpit);
vpit->scheduled += vpit->period;
- set_ac_timer(&vpit->pit_timer, vpit->scheduled);
+ set_timer(&vpit->pit_timer, vpit->scheduled);
}
}
@@ -385,14 +385,14 @@ void vmx_hooks_assist(struct vcpu *v)
/* load init count*/
if (p->state == STATE_IORESP_HOOK) {
/* set up actimer, handle re-init */
- if ( active_ac_timer(&(vpit->pit_timer)) ) {
+ if ( active_timer(&(vpit->pit_timer)) ) {
VMX_DBG_LOG(DBG_LEVEL_1, "VMX_PIT: guest reset PIT with channel %lx!\n", (unsigned long) ((p->u.data >> 24) & 0x3) );
- rem_ac_timer(&(vpit->pit_timer));
+ stop_timer(&(vpit->pit_timer));
reinit = 1;
}
else {
- init_ac_timer(&vpit->pit_timer, pit_timer_fn, v, v->processor);
+ init_timer(&vpit->pit_timer, pit_timer_fn, v, v->processor);
}
/* init count for this channel */
@@ -431,7 +431,7 @@ void vmx_hooks_assist(struct vcpu *v)
}
vpit->scheduled = NOW() + vpit->period;
- set_ac_timer(&vpit->pit_timer, vpit->scheduled);
+ set_timer(&vpit->pit_timer, vpit->scheduled);
/*restore the state*/
p->state = STATE_IORESP_READY;
diff --git a/xen/arch/x86/vmx_io.c b/xen/arch/x86/vmx_io.c
index b7689228bf..c979a8d741 100644
--- a/xen/arch/x86/vmx_io.c
+++ b/xen/arch/x86/vmx_io.c
@@ -37,7 +37,7 @@
#include <asm/shadow.h>
#include <asm/vmx_vpic.h>
#include <asm/vmx_vlapic.h>
-#include <public/io/ioreq.h>
+#include <public/hvm/ioreq.h>
#ifdef CONFIG_VMX
#if defined (__i386__)
@@ -819,7 +819,7 @@ interrupt_post_injection(struct vcpu * v, int vector, int type)
if ( !vpit->first_injected ) {
vpit->pending_intr_nr = 0;
vpit->scheduled = NOW() + vpit->period;
- set_ac_timer(&vpit->pit_timer, vpit->scheduled);
+ set_timer(&vpit->pit_timer, vpit->scheduled);
vpit->first_injected = 1;
} else {
vpit->pending_intr_nr--;
diff --git a/xen/arch/x86/vmx_platform.c b/xen/arch/x86/vmx_platform.c
index 2ee14c65ec..45d1e0052b 100644
--- a/xen/arch/x86/vmx_platform.c
+++ b/xen/arch/x86/vmx_platform.c
@@ -27,7 +27,7 @@
#include <xen/trace.h>
#include <asm/vmx.h>
#include <asm/vmx_platform.h>
-#include <public/io/ioreq.h>
+#include <public/hvm/ioreq.h>
#include <xen/lib.h>
#include <xen/sched.h>
diff --git a/xen/arch/x86/vmx_vlapic.c b/xen/arch/x86/vmx_vlapic.c
index fa1dc2118d..d487f9739e 100644
--- a/xen/arch/x86/vmx_vlapic.c
+++ b/xen/arch/x86/vmx_vlapic.c
@@ -32,7 +32,7 @@
#include <xen/lib.h>
#include <xen/sched.h>
#include <asm/current.h>
-#include <public/io/ioreq.h>
+#include <public/hvm/ioreq.h>
#ifdef CONFIG_VMX
@@ -62,7 +62,7 @@ int vlapic_find_highest_irr(struct vlapic *vlapic)
int vmx_apic_support(struct domain *d)
{
- return d->arch.vmx_platform.lapic_enable;
+ return d->arch.vmx_platform.apic_enabled;
}
s_time_t get_apictime_scheduled(struct vcpu *v)
@@ -391,7 +391,7 @@ static void vlapic_begin_timer(struct vlapic *vlapic)
(262144 / get_apic_bus_scale()) * vlapic->timer_divide_counter;
vlapic->vlapic_timer.expires = cur + offset;
- set_ac_timer(&(vlapic->vlapic_timer), vlapic->vlapic_timer.expires );
+ set_timer(&(vlapic->vlapic_timer), vlapic->vlapic_timer.expires );
VMX_DBG_LOG(DBG_LEVEL_VLAPIC, "vlapic_begin_timer: "
"bus_scale %x now %08x%08x expire %08x%08x "
@@ -739,7 +739,7 @@ static void vlapic_write(struct vcpu *v, unsigned long address,
case APIC_TMICT:
if (vlapic_timer_active(vlapic))
- rem_ac_timer(&(vlapic->vlapic_timer));
+ stop_timer(&(vlapic->vlapic_timer));
vlapic->timer_initial = val;
vlapic->timer_current = val;
@@ -846,7 +846,7 @@ void vlapic_timer_fn(void *data)
vlapic->timer_current = vlapic->timer_initial;
offset = vlapic->timer_current * (262144/get_apic_bus_scale()) * vlapic->timer_divide_counter;
vlapic->vlapic_timer.expires = NOW() + offset;
- set_ac_timer(&(vlapic->vlapic_timer), vlapic->vlapic_timer.expires);
+ set_timer(&(vlapic->vlapic_timer), vlapic->vlapic_timer.expires);
}else {
vlapic->timer_current = 0;
}
@@ -986,7 +986,7 @@ static int vlapic_reset(struct vlapic *vlapic)
vmx_vioapic_add_lapic(vlapic, v);
- init_ac_timer(&vlapic->vlapic_timer,
+ init_timer(&vlapic->vlapic_timer,
vlapic_timer_fn, vlapic, v->processor);
#ifdef VLAPIC_NO_BIOS
diff --git a/xen/arch/x86/vmx_vmcs.c b/xen/arch/x86/vmx_vmcs.c
index 17eb2caad3..9b7c9d41d2 100644
--- a/xen/arch/x86/vmx_vmcs.c
+++ b/xen/arch/x86/vmx_vmcs.c
@@ -32,7 +32,7 @@
#include <asm/flushtlb.h>
#include <xen/event.h>
#include <xen/kernel.h>
-#include <public/io/ioreq.h>
+#include <public/hvm/hvm_info_table.h>
#if CONFIG_PAGING_LEVELS >= 4
#include <asm/shadow_64.h>
#endif
@@ -193,7 +193,7 @@ static void vmx_map_io_shared_page(struct domain *d)
domain_crash_synchronous();
}
- p = map_domain_page(mpfn);
+ p = map_domain_page_global(mpfn);
if (p == NULL) {
printk("Can not map io request shared page for VMX domain.\n");
domain_crash_synchronous();
@@ -206,35 +206,55 @@ static void vmx_map_io_shared_page(struct domain *d)
&d->shared_info->evtchn_mask[0]);
}
-#define VCPU_NR_PAGE 0x0009F000
-#define VCPU_NR_OFFSET 0x00000800
-#define VCPU_MAGIC 0x76637075 /* "vcpu" */
+static int validate_hvm_info(struct hvm_info_table *t)
+{
+ char signature[] = "HVM INFO";
+ uint8_t *ptr = (uint8_t *)t;
+ uint8_t sum = 0;
+ int i;
+
+ /* strncmp(t->signature, "HVM INFO", 8) */
+ for ( i = 0; i < 8; i++ ) {
+ if ( signature[i] != t->signature[i] ) {
+ printk("Bad hvm info signature\n");
+ return 0;
+ }
+ }
+
+ for ( i = 0; i < t->length; i++ )
+ sum += ptr[i];
-static void vmx_set_vcpu_nr(struct domain *d)
+ return (sum == 0);
+}
+
+static void vmx_get_hvm_info(struct domain *d)
{
unsigned char *p;
unsigned long mpfn;
- unsigned int *vcpus;
+ struct hvm_info_table *t;
- mpfn = get_mfn_from_pfn(VCPU_NR_PAGE >> PAGE_SHIFT);
- if (mpfn == INVALID_MFN) {
- printk("Can not get vcpu number page mfn for VMX domain.\n");
+ mpfn = get_mfn_from_pfn(HVM_INFO_PFN);
+ if ( mpfn == INVALID_MFN ) {
+ printk("Can not get hvm info page mfn for VMX domain.\n");
domain_crash_synchronous();
}
p = map_domain_page(mpfn);
- if (p == NULL) {
- printk("Can not map vcpu number page for VMX domain.\n");
+ if ( p == NULL ) {
+ printk("Can not map hvm info page for VMX domain.\n");
domain_crash_synchronous();
}
- vcpus = (unsigned int *)(p + VCPU_NR_OFFSET);
- if (vcpus[0] != VCPU_MAGIC) {
- printk("Bad vcpus magic, set vcpu number to 1 by default.\n");
- d->arch.vmx_platform.nr_vcpu = 1;
- }
+ t = (struct hvm_info_table *)(p + HVM_INFO_OFFSET);
- d->arch.vmx_platform.nr_vcpu = vcpus[1];
+ if ( validate_hvm_info(t) ) {
+ d->arch.vmx_platform.nr_vcpus = t->nr_vcpus;
+ d->arch.vmx_platform.apic_enabled = t->apic_enabled;
+ } else {
+ printk("Bad hvm info table\n");
+ d->arch.vmx_platform.nr_vcpus = 1;
+ d->arch.vmx_platform.apic_enabled = 0;
+ }
unmap_domain_page(p);
}
@@ -244,10 +264,10 @@ static void vmx_setup_platform(struct domain* d)
struct vmx_platform *platform;
vmx_map_io_shared_page(d);
- vmx_set_vcpu_nr(d);
+ vmx_get_hvm_info(d);
platform = &d->arch.vmx_platform;
- pic_init(&platform->vmx_pic, pic_irq_request,
+ pic_init(&platform->vmx_pic, pic_irq_request,
&platform->interrupt_request);
register_pic_io_hook();
@@ -321,7 +341,7 @@ static void vmx_do_launch(struct vcpu *v)
vlapic_init(v);
vmx_set_host_env(v);
- init_ac_timer(&v->arch.arch_vmx.hlt_timer, hlt_timer_fn, v, v->processor);
+ init_timer(&v->arch.arch_vmx.hlt_timer, hlt_timer_fn, v, v->processor);
error |= __vmwrite(GUEST_LDTR_SELECTOR, 0);
error |= __vmwrite(GUEST_LDTR_BASE, 0);
@@ -335,6 +355,8 @@ static void vmx_do_launch(struct vcpu *v)
__vmwrite(HOST_RSP, (unsigned long)get_stack_bottom());
v->arch.schedule_tail = arch_vmx_do_resume;
+ v->arch.arch_vmx.launch_cpu = smp_processor_id();
+
/* init guest tsc to start from 0 */
rdtscll(host_tsc);
v->arch.arch_vmx.tsc_offset = 0 - host_tsc;
@@ -617,11 +639,21 @@ void vm_resume_fail(unsigned long eflags)
void arch_vmx_do_resume(struct vcpu *v)
{
- u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs);
-
- load_vmcs(&v->arch.arch_vmx, vmcs_phys_ptr);
- vmx_do_resume(v);
- reset_stack_and_jump(vmx_asm_do_resume);
+ if ( v->arch.arch_vmx.launch_cpu == smp_processor_id() )
+ {
+ load_vmcs(&v->arch.arch_vmx, virt_to_phys(v->arch.arch_vmx.vmcs));
+ vmx_do_resume(v);
+ reset_stack_and_jump(vmx_asm_do_resume);
+ }
+ else
+ {
+ __vmpclear(virt_to_phys(v->arch.arch_vmx.vmcs));
+ load_vmcs(&v->arch.arch_vmx, virt_to_phys(v->arch.arch_vmx.vmcs));
+ vmx_do_resume(v);
+ vmx_set_host_env(v);
+ v->arch.arch_vmx.launch_cpu = smp_processor_id();
+ reset_stack_and_jump(vmx_asm_do_relaunch);
+ }
}
void arch_vmx_do_launch(struct vcpu *v)
@@ -643,18 +675,6 @@ void arch_vmx_do_launch(struct vcpu *v)
reset_stack_and_jump(vmx_asm_do_launch);
}
-void arch_vmx_do_relaunch(struct vcpu *v)
-{
- u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs);
-
- load_vmcs(&v->arch.arch_vmx, vmcs_phys_ptr);
- vmx_do_resume(v);
- vmx_set_host_env(v);
- v->arch.schedule_tail = arch_vmx_do_resume;
-
- reset_stack_and_jump(vmx_asm_do_relaunch);
-}
-
#endif /* CONFIG_VMX */
/*
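validate_hvm_info() accepts the table only when the "HVM INFO" signature matches and the byte-sum over t->length is zero, so whatever populates the page at HVM_INFO_PFN (domain builder or guest firmware) has to fix up a checksum byte. A producer-side sketch under that assumption; signature, length, nr_vcpus and apic_enabled are the fields this patch reads, the checksum field name is assumed:

    struct hvm_info_table *t = (void *)(info_page + HVM_INFO_OFFSET);
    uint8_t sum = 0;
    int i;

    memset(t, 0, sizeof(*t));
    memcpy(t->signature, "HVM INFO", 8);
    t->length       = sizeof(*t);
    t->nr_vcpus     = nr_vcpus;          /* placeholder inputs */
    t->apic_enabled = apic_enabled;

    for ( i = 0; i < t->length; i++ )
        sum += ((uint8_t *)t)[i];
    t->checksum = -sum;                  /* whole-table byte-sum becomes zero */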
diff --git a/xen/arch/x86/x86_32/asm-offsets.c b/xen/arch/x86/x86_32/asm-offsets.c
index 3a5c3ef9f8..42bef57240 100644
--- a/xen/arch/x86/x86_32/asm-offsets.c
+++ b/xen/arch/x86/x86_32/asm-offsets.c
@@ -65,6 +65,10 @@ void __dummy__(void)
arch.guest_context.kernel_ss);
OFFSET(VCPU_kernel_sp, struct vcpu,
arch.guest_context.kernel_sp);
+ OFFSET(VCPU_flags, struct vcpu, vcpu_flags);
+ OFFSET(VCPU_nmi_addr, struct vcpu, nmi_addr);
+ DEFINE(_VCPUF_nmi_pending, _VCPUF_nmi_pending);
+ DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked);
BLANK();
OFFSET(VCPUINFO_upcall_pending, vcpu_info_t, evtchn_upcall_pending);
diff --git a/xen/arch/x86/x86_32/domain_page.c b/xen/arch/x86/x86_32/domain_page.c
index f7c194b775..222e813693 100644
--- a/xen/arch/x86/x86_32/domain_page.c
+++ b/xen/arch/x86/x86_32/domain_page.c
@@ -1,14 +1,9 @@
/******************************************************************************
* domain_page.h
*
- * Allow temporary mapping of domain pages. Based on ideas from the
- * Linux PKMAP code -- the copyrights and credits are retained below.
- */
-
-/*
- * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
- * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de *
- * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ * Allow temporary mapping of domain pages.
+ *
+ * Copyright (c) 2003-2006, Keir Fraser <keir@xensource.com>
*/
#include <xen/config.h>
@@ -20,80 +15,203 @@
#include <asm/flushtlb.h>
#include <asm/hardirq.h>
-#define MAPCACHE_ORDER 10
-#define MAPCACHE_ENTRIES (1 << MAPCACHE_ORDER)
-
-l1_pgentry_t *mapcache;
-static unsigned int map_idx, epoch, shadow_epoch[NR_CPUS];
-static spinlock_t map_lock = SPIN_LOCK_UNLOCKED;
-
-/* Use a spare PTE bit to mark entries ready for recycling. */
-#define READY_FOR_TLB_FLUSH (1<<10)
-
-static void flush_all_ready_maps(void)
-{
- l1_pgentry_t *cache = mapcache;
- unsigned int i;
-
- for ( i = 0; i < MAPCACHE_ENTRIES; i++ )
- if ( (l1e_get_flags(cache[i]) & READY_FOR_TLB_FLUSH) )
- cache[i] = l1e_empty();
-}
-
-void *map_domain_pages(unsigned long pfn, unsigned int order)
+void *map_domain_page(unsigned long pfn)
{
unsigned long va;
- unsigned int idx, i, flags, cpu = smp_processor_id();
- l1_pgentry_t *cache = mapcache;
-#ifndef NDEBUG
- unsigned int flush_count = 0;
-#endif
+ unsigned int idx, i, vcpu = current->vcpu_id;
+ struct domain *d;
+ struct mapcache *cache;
+ struct vcpu_maphash_entry *hashent;
ASSERT(!in_irq());
+
perfc_incrc(map_domain_page_count);
- spin_lock(&map_lock);
+ /* If we are the idle domain, ensure that we run on our own page tables. */
+ d = current->domain;
+ if ( unlikely(is_idle_domain(d)) )
+ __sync_lazy_execstate();
- /* Has some other CPU caused a wrap? We must flush if so. */
- if ( epoch != shadow_epoch[cpu] )
+ cache = &d->arch.mapcache;
+
+ hashent = &cache->vcpu_maphash[vcpu].hash[MAPHASH_HASHFN(pfn)];
+ if ( hashent->pfn == pfn )
{
- perfc_incrc(domain_page_tlb_flush);
- local_flush_tlb();
- shadow_epoch[cpu] = epoch;
+ idx = hashent->idx;
+ hashent->refcnt++;
+ ASSERT(hashent->refcnt != 0);
+ ASSERT(l1e_get_pfn(cache->l1tab[idx]) == pfn);
+ goto out;
}
- do {
- idx = map_idx = (map_idx + 1) & (MAPCACHE_ENTRIES - 1);
- if ( unlikely(idx == 0) )
+ spin_lock(&cache->lock);
+
+ /* Has some other CPU caused a wrap? We must flush if so. */
+ if ( unlikely(cache->epoch != cache->shadow_epoch[vcpu]) )
+ {
+ cache->shadow_epoch[vcpu] = cache->epoch;
+ if ( NEED_FLUSH(tlbflush_time[smp_processor_id()],
+ cache->tlbflush_timestamp) )
{
- ASSERT(flush_count++ == 0);
- flush_all_ready_maps();
perfc_incrc(domain_page_tlb_flush);
local_flush_tlb();
- shadow_epoch[cpu] = ++epoch;
+ }
+ }
+
+ idx = find_next_zero_bit(cache->inuse, MAPCACHE_ENTRIES, cache->cursor);
+ if ( unlikely(idx >= MAPCACHE_ENTRIES) )
+ {
+ /* /First/, clean the garbage map and update the inuse list. */
+ for ( i = 0; i < ARRAY_SIZE(cache->garbage); i++ )
+ {
+ unsigned long x = xchg(&cache->garbage[i], 0);
+ cache->inuse[i] &= ~x;
}
- flags = 0;
- for ( i = 0; i < (1U << order); i++ )
- flags |= l1e_get_flags(cache[idx+i]);
+ /* /Second/, flush TLBs. */
+ perfc_incrc(domain_page_tlb_flush);
+ local_flush_tlb();
+ cache->shadow_epoch[vcpu] = ++cache->epoch;
+ cache->tlbflush_timestamp = tlbflush_current_time();
+
+ idx = find_first_zero_bit(cache->inuse, MAPCACHE_ENTRIES);
+ ASSERT(idx < MAPCACHE_ENTRIES);
}
- while ( flags & _PAGE_PRESENT );
- for ( i = 0; i < (1U << order); i++ )
- cache[idx+i] = l1e_from_pfn(pfn+i, __PAGE_HYPERVISOR);
+ set_bit(idx, cache->inuse);
+ cache->cursor = idx + 1;
+
+ spin_unlock(&cache->lock);
- spin_unlock(&map_lock);
+ cache->l1tab[idx] = l1e_from_pfn(pfn, __PAGE_HYPERVISOR);
+ out:
va = MAPCACHE_VIRT_START + (idx << PAGE_SHIFT);
return (void *)va;
}
-void unmap_domain_pages(void *va, unsigned int order)
+void unmap_domain_page(void *va)
{
- unsigned int idx, i;
+ unsigned int idx;
+ struct mapcache *cache = &current->domain->arch.mapcache;
+ unsigned long pfn;
+ struct vcpu_maphash_entry *hashent;
+
+ ASSERT(!in_irq());
+
ASSERT((void *)MAPCACHE_VIRT_START <= va);
ASSERT(va < (void *)MAPCACHE_VIRT_END);
+
idx = ((unsigned long)va - MAPCACHE_VIRT_START) >> PAGE_SHIFT;
- for ( i = 0; i < (1U << order); i++ )
- l1e_add_flags(mapcache[idx+i], READY_FOR_TLB_FLUSH);
+ pfn = l1e_get_pfn(cache->l1tab[idx]);
+ hashent = &cache->vcpu_maphash[current->vcpu_id].hash[MAPHASH_HASHFN(pfn)];
+
+ if ( hashent->idx == idx )
+ {
+ ASSERT(hashent->pfn == pfn);
+ ASSERT(hashent->refcnt != 0);
+ hashent->refcnt--;
+ }
+ else if ( hashent->refcnt == 0 )
+ {
+ if ( hashent->idx != MAPHASHENT_NOTINUSE )
+ {
+ /* /First/, zap the PTE. */
+ ASSERT(l1e_get_pfn(cache->l1tab[hashent->idx]) == hashent->pfn);
+ cache->l1tab[hashent->idx] = l1e_empty();
+ /* /Second/, mark as garbage. */
+ set_bit(hashent->idx, cache->garbage);
+ }
+
+ /* Add newly-freed mapping to the maphash. */
+ hashent->pfn = pfn;
+ hashent->idx = idx;
+ }
+ else
+ {
+ /* /First/, zap the PTE. */
+ cache->l1tab[idx] = l1e_empty();
+ /* /Second/, mark as garbage. */
+ set_bit(idx, cache->garbage);
+ }
+}
+
+void mapcache_init(struct domain *d)
+{
+ unsigned int i, j;
+
+ d->arch.mapcache.l1tab = d->arch.mm_perdomain_pt +
+ (GDT_LDT_MBYTES << (20 - PAGE_SHIFT));
+ spin_lock_init(&d->arch.mapcache.lock);
+
+ /* Mark all maphash entries as not in use. */
+ for ( i = 0; i < MAX_VIRT_CPUS; i++ )
+ for ( j = 0; j < MAPHASH_ENTRIES; j++ )
+ d->arch.mapcache.vcpu_maphash[i].hash[j].idx =
+ MAPHASHENT_NOTINUSE;
+}
+
+#define GLOBALMAP_BITS (IOREMAP_MBYTES << (20 - PAGE_SHIFT))
+static unsigned long inuse[BITS_TO_LONGS(GLOBALMAP_BITS)];
+static unsigned long garbage[BITS_TO_LONGS(GLOBALMAP_BITS)];
+static unsigned int inuse_cursor;
+static spinlock_t globalmap_lock = SPIN_LOCK_UNLOCKED;
+
+void *map_domain_page_global(unsigned long pfn)
+{
+ l2_pgentry_t *pl2e;
+ l1_pgentry_t *pl1e;
+ unsigned int idx, i;
+ unsigned long va;
+
+ ASSERT(!in_irq() && local_irq_is_enabled());
+
+ spin_lock(&globalmap_lock);
+
+ idx = find_next_zero_bit(inuse, GLOBALMAP_BITS, inuse_cursor);
+ va = IOREMAP_VIRT_START + (idx << PAGE_SHIFT);
+ if ( unlikely(va >= FIXADDR_START) )
+ {
+ /* /First/, clean the garbage map and update the inuse list. */
+ for ( i = 0; i < ARRAY_SIZE(garbage); i++ )
+ {
+ unsigned long x = xchg(&garbage[i], 0);
+ inuse[i] &= ~x;
+ }
+
+ /* /Second/, flush all TLBs to get rid of stale garbage mappings. */
+ flush_tlb_all();
+
+ idx = find_first_zero_bit(inuse, GLOBALMAP_BITS);
+ va = IOREMAP_VIRT_START + (idx << PAGE_SHIFT);
+ ASSERT(va < FIXADDR_START);
+ }
+
+ set_bit(idx, inuse);
+ inuse_cursor = idx + 1;
+
+ spin_unlock(&globalmap_lock);
+
+ pl2e = virt_to_xen_l2e(va);
+ pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(va);
+ *pl1e = l1e_from_pfn(pfn, __PAGE_HYPERVISOR);
+
+ return (void *)va;
+}
+
+void unmap_domain_page_global(void *va)
+{
+ unsigned long __va = (unsigned long)va;
+ l2_pgentry_t *pl2e;
+ l1_pgentry_t *pl1e;
+ unsigned int idx;
+
+ /* /First/, we zap the PTE. */
+ pl2e = virt_to_xen_l2e(__va);
+ pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(__va);
+ *pl1e = l1e_empty();
+
+ /* /Second/, we add to the garbage map. */
+ idx = (__va - IOREMAP_VIRT_START) >> PAGE_SHIFT;
+ set_bit(idx, garbage);
}
diff --git a/xen/arch/x86/x86_32/entry.S b/xen/arch/x86/x86_32/entry.S
index b890103160..e178d7383e 100644
--- a/xen/arch/x86/x86_32/entry.S
+++ b/xen/arch/x86/x86_32/entry.S
@@ -326,7 +326,9 @@ test_all_events:
shl $IRQSTAT_shift,%eax
test %ecx,irq_stat(%eax,1)
jnz process_softirqs
-/*test_guest_events:*/
+ btr $_VCPUF_nmi_pending,VCPU_flags(%ebx)
+ jc process_nmi
+test_guest_events:
movl VCPU_vcpu_info(%ebx),%eax
testb $0xFF,VCPUINFO_upcall_mask(%eax)
jnz restore_all_guest
@@ -348,7 +350,24 @@ process_softirqs:
sti
call do_softirq
jmp test_all_events
-
+
+ ALIGN
+process_nmi:
+ movl VCPU_nmi_addr(%ebx),%eax
+ test %eax,%eax
+ jz test_all_events
+ bts $_VCPUF_nmi_masked,VCPU_flags(%ebx)
+ jc 1f
+ sti
+ leal VCPU_trap_bounce(%ebx),%edx
+ movl %eax,TRAPBOUNCE_eip(%edx)
+ movw $FLAT_KERNEL_CS,TRAPBOUNCE_cs(%edx)
+ movw $TBF_INTERRUPT,TRAPBOUNCE_flags(%edx)
+ call create_bounce_frame
+ jmp test_all_events
+1: bts $_VCPUF_nmi_pending,VCPU_flags(%ebx)
+ jmp test_guest_events
+
/* CREATE A BASIC EXCEPTION FRAME ON GUEST OS (RING-1) STACK: */
/* {EIP, CS, EFLAGS, [ESP, SS]} */
/* %edx == trap_bounce, %ebx == struct vcpu */
@@ -620,9 +639,7 @@ ENTRY(nmi)
jne defer_nmi
continue_nmi:
- movl $(__HYPERVISOR_DS),%edx
- movl %edx,%ds
- movl %edx,%es
+ SET_XEN_SEGMENTS(d)
movl %esp,%edx
pushl %edx
call do_nmi
@@ -660,42 +677,6 @@ do_arch_sched_op:
movl %eax,UREGS_eax(%ecx)
jmp do_sched_op
-do_switch_vm86:
- # Reset the stack pointer
- GET_GUEST_REGS(%ecx)
- movl %ecx,%esp
-
- # GS:ESI == Ring-1 stack activation
- movl UREGS_esp(%esp),%esi
-VFLT1: mov UREGS_ss(%esp),%gs
-
- # ES:EDI == Ring-0 stack activation
- leal UREGS_eip(%esp),%edi
-
- # Restore the hypercall-number-clobbered EAX on our stack frame
-VFLT2: movl %gs:(%esi),%eax
- movl %eax,UREGS_eax(%esp)
- addl $4,%esi
-
- # Copy the VM86 activation from the ring-1 stack to the ring-0 stack
- movl $(UREGS_user_sizeof-UREGS_eip)/4,%ecx
-VFLT3: movl %gs:(%esi),%eax
- stosl
- addl $4,%esi
- loop VFLT3
-
- # Fix up EFLAGS: IOPL=0, IF=1, VM=1
- andl $~X86_EFLAGS_IOPL,UREGS_eflags(%esp)
- orl $X86_EFLAGS_IF|X86_EFLAGS_VM,UREGS_eflags(%esp)
-
- jmp test_all_events
-
-.section __ex_table,"a"
- .long VFLT1,domain_crash_synchronous
- .long VFLT2,domain_crash_synchronous
- .long VFLT3,domain_crash_synchronous
-.previous
-
.data
ENTRY(exception_table)
@@ -744,11 +725,12 @@ ENTRY(hypercall_table)
.long do_grant_table_op /* 20 */
.long do_vm_assist
.long do_update_va_mapping_otherdomain
- .long do_switch_vm86
+ .long do_iret
.long do_vcpu_op
.long do_ni_hypercall /* 25 */
.long do_mmuext_op
- .long do_acm_op /* 27 */
+ .long do_acm_op
+ .long do_nmi_op
.rept NR_hypercalls-((.-hypercall_table)/4)
.long do_ni_hypercall
.endr
@@ -777,11 +759,12 @@ ENTRY(hypercall_args_table)
.byte 3 /* do_grant_table_op */ /* 20 */
.byte 2 /* do_vm_assist */
.byte 5 /* do_update_va_mapping_otherdomain */
- .byte 0 /* do_switch_vm86 */
+ .byte 0 /* do_iret */
.byte 3 /* do_vcpu_op */
.byte 0 /* do_ni_hypercall */ /* 25 */
.byte 4 /* do_mmuext_op */
.byte 1 /* do_acm_op */
+ .byte 2 /* do_nmi_op */
.rept NR_hypercalls-(.-hypercall_args_table)
.byte 0 /* do_ni_hypercall */
.endr
diff --git a/xen/arch/x86/x86_32/mm.c b/xen/arch/x86/x86_32/mm.c
index 4be333f4cf..95def3f2b4 100644
--- a/xen/arch/x86/x86_32/mm.c
+++ b/xen/arch/x86/x86_32/mm.c
@@ -29,8 +29,6 @@
#include <asm/fixmap.h>
#include <public/memory.h>
-extern l1_pgentry_t *mapcache;
-
unsigned int PAGE_HYPERVISOR = __PAGE_HYPERVISOR;
unsigned int PAGE_HYPERVISOR_NOCACHE = __PAGE_HYPERVISOR_NOCACHE;
@@ -68,7 +66,7 @@ void __init paging_init(void)
void *ioremap_pt;
unsigned long v;
struct pfn_info *pg;
- int i, mapcache_order;
+ int i;
#ifdef CONFIG_X86_PAE
printk("PAE enabled, limit: %d GB\n", MACHPHYS_MBYTES);
@@ -76,7 +74,7 @@ void __init paging_init(void)
printk("PAE disabled.\n");
#endif
- idle0_vcpu.arch.monitor_table = mk_pagetable(__pa(idle_pg_table));
+ idle_vcpu[0]->arch.monitor_table = mk_pagetable(__pa(idle_pg_table));
if ( cpu_has_pge )
{
@@ -121,14 +119,12 @@ void __init paging_init(void)
l2e_from_page(virt_to_page(ioremap_pt), __PAGE_HYPERVISOR);
}
- /* Set up mapping cache for domain pages. */
- mapcache_order = get_order_from_bytes(
- MAPCACHE_MBYTES << (20 - PAGETABLE_ORDER));
- mapcache = alloc_xenheap_pages(mapcache_order);
- memset(mapcache, 0, PAGE_SIZE << mapcache_order);
- for ( i = 0; i < (MAPCACHE_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
- idle_pg_table_l2[l2_linear_offset(MAPCACHE_VIRT_START) + i] =
- l2e_from_page(virt_to_page(mapcache) + i, __PAGE_HYPERVISOR);
+ /* Install per-domain mappings for idle domain. */
+ for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
+ idle_pg_table_l2[l2_linear_offset(PERDOMAIN_VIRT_START) + i] =
+ l2e_from_page(virt_to_page(idle_vcpu[0]->domain->
+ arch.mm_perdomain_pt) + i,
+ __PAGE_HYPERVISOR);
}
void __init zap_low_mappings(l2_pgentry_t *base)
diff --git a/xen/arch/x86/x86_32/traps.c b/xen/arch/x86/x86_32/traps.c
index cb2b7b9eaa..95b69a14bd 100644
--- a/xen/arch/x86/x86_32/traps.c
+++ b/xen/arch/x86/x86_32/traps.c
@@ -157,6 +157,64 @@ asmlinkage void do_double_fault(void)
__asm__ __volatile__ ( "hlt" );
}
+asmlinkage unsigned long do_iret(void)
+{
+ struct cpu_user_regs *regs = guest_cpu_user_regs();
+ u32 eflags;
+
+ /* Check worst-case stack frame for overlap with Xen protected area. */
+ if ( unlikely(!access_ok(regs->esp, 40)) )
+ domain_crash_synchronous();
+
+ /* Pop and restore EAX (clobbered by hypercall). */
+ if ( unlikely(__copy_from_user(&regs->eax, (void __user *)regs->esp, 4)) )
+ domain_crash_synchronous();
+ regs->esp += 4;
+
+ /* Pop and restore CS and EIP. */
+ if ( unlikely(__copy_from_user(&regs->eip, (void __user *)regs->esp, 8)) )
+ domain_crash_synchronous();
+ regs->esp += 8;
+
+ /*
+ * Pop, fix up and restore EFLAGS. We fix up in a local staging area
+ * to avoid firing the BUG_ON(IOPL) check in arch_getdomaininfo_ctxt.
+ */
+ if ( unlikely(__copy_from_user(&eflags, (void __user *)regs->esp, 4)) )
+ domain_crash_synchronous();
+ regs->esp += 4;
+ regs->eflags = (eflags & ~X86_EFLAGS_IOPL) | X86_EFLAGS_IF;
+
+ if ( VM86_MODE(regs) )
+ {
+ /* Return to VM86 mode: pop and restore ESP,SS,ES,DS,FS and GS. */
+ if ( __copy_from_user(&regs->esp, (void __user *)regs->esp, 24) )
+ domain_crash_synchronous();
+ }
+ else if ( unlikely(RING_0(regs)) )
+ {
+ domain_crash_synchronous();
+ }
+ else if ( !RING_1(regs) )
+ {
+ /* Return to ring 2/3: pop and restore ESP and SS. */
+ if ( __copy_from_user(&regs->esp, (void __user *)regs->esp, 8) )
+ domain_crash_synchronous();
+ }
+
+ /* No longer in NMI context. */
+ clear_bit(_VCPUF_nmi_masked, &current->vcpu_flags);
+
+ /* Restore upcall mask from saved value. */
+ current->vcpu_info->evtchn_upcall_mask = regs->saved_upcall_mask;
+
+ /*
+ * The hypercall exit path will overwrite EAX with this return
+ * value.
+ */
+ return regs->eax;
+}
+
BUILD_SMP_INTERRUPT(deferred_nmi, TRAP_deferred_nmi)
asmlinkage void smp_deferred_nmi(struct cpu_user_regs regs)
{
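The new do_iret() takes over hypercall slot 23 from do_switch_vm86 and is what the guest uses to return from the bounce frames built in entry.S (including the new NMI callback path). The stack frame it consumes, reconstructed from the pops above with the lowest address first; the struct name and field grouping are illustrative only:

    struct iret_frame_32 {       /* located at the guest %esp on entry         */
        uint32_t eax;            /* restored because the hypercall clobbers it */
        uint32_t eip, cs;        /* return point                               */
        uint32_t eflags;         /* IOPL stripped, IF forced on                */
        uint32_t esp, ss;        /* present when returning to ring 2/3 or VM86 */
        uint32_t es, ds, fs, gs; /* present only when returning to VM86 mode   */
    };                           /* worst case 40 bytes, matching access_ok()  */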
diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
index c7a3e6025c..0aa20ccabb 100644
--- a/xen/arch/x86/x86_64/asm-offsets.c
+++ b/xen/arch/x86/x86_64/asm-offsets.c
@@ -65,6 +65,10 @@ void __dummy__(void)
arch.guest_context.syscall_callback_eip);
OFFSET(VCPU_kernel_sp, struct vcpu,
arch.guest_context.kernel_sp);
+ OFFSET(VCPU_flags, struct vcpu, vcpu_flags);
+ OFFSET(VCPU_nmi_addr, struct vcpu, nmi_addr);
+ DEFINE(_VCPUF_nmi_pending, _VCPUF_nmi_pending);
+ DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked);
BLANK();
OFFSET(VCPUINFO_upcall_pending, vcpu_info_t, evtchn_upcall_pending);
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index 3c5c344a1a..88fe273bab 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -171,7 +171,9 @@ test_all_events:
leaq irq_stat(%rip),%rcx
testl $~0,(%rcx,%rax,1)
jnz process_softirqs
-/*test_guest_events:*/
+ btr $_VCPUF_nmi_pending,VCPU_flags(%rbx)
+ jc process_nmi
+test_guest_events:
movq VCPU_vcpu_info(%rbx),%rax
testb $0xFF,VCPUINFO_upcall_mask(%rax)
jnz restore_all_guest
@@ -322,6 +324,23 @@ process_softirqs:
call do_softirq
jmp test_all_events
+ ALIGN
+/* %rbx: struct vcpu */
+process_nmi:
+ movq VCPU_nmi_addr(%rbx),%rax
+ test %rax,%rax
+ jz test_all_events
+ bts $_VCPUF_nmi_masked,VCPU_flags(%rbx)
+ jc 1f
+ sti
+ leaq VCPU_trap_bounce(%rbx),%rdx
+ movq %rax,TRAPBOUNCE_eip(%rdx)
+ movw $(TBF_INTERRUPT|TBF_SLOW_IRET),TRAPBOUNCE_flags(%rdx)
+ call create_bounce_frame
+ jmp test_all_events
+1: bts $_VCPUF_nmi_pending,VCPU_flags(%rbx)
+ jmp test_guest_events
+
/* CREATE A BASIC EXCEPTION FRAME ON GUEST OS STACK: */
/* { RCX, R11, [DS-GS,] [CR2,] [ERRCODE,] RIP, CS, RFLAGS, RSP, SS } */
/* %rdx: trap_bounce, %rbx: struct vcpu */
@@ -339,6 +358,9 @@ create_bounce_frame:
1: /* In kernel context already: push new frame at existing %rsp. */
movq UREGS_rsp+8(%rsp),%rsi
andb $0xfc,UREGS_cs+8(%rsp) # Indicate kernel context to guest.
+ testw $(TBF_SLOW_IRET),TRAPBOUNCE_flags(%rdx)
+ jz 2f
+ orb $0x01,UREGS_cs+8(%rsp)
2: andq $~0xf,%rsi # Stack frames are 16-byte aligned.
movq $HYPERVISOR_VIRT_START,%rax
cmpq %rax,%rsi
@@ -569,7 +591,7 @@ ENTRY(nmi)
SAVE_ALL
movq %rsp,%rdi
call do_nmi
- jmp restore_all_xen
+ jmp ret_from_intr
do_arch_sched_op:
# Ensure we return success even if we return via schedule_tail()
@@ -626,11 +648,12 @@ ENTRY(hypercall_table)
.quad do_grant_table_op /* 20 */
.quad do_vm_assist
.quad do_update_va_mapping_otherdomain
- .quad do_switch_to_user
+ .quad do_iret
.quad do_vcpu_op
.quad do_set_segment_base /* 25 */
.quad do_mmuext_op
.quad do_acm_op
+ .quad do_nmi_op
.rept NR_hypercalls-((.-hypercall_table)/4)
.quad do_ni_hypercall
.endr
@@ -659,11 +682,12 @@ ENTRY(hypercall_args_table)
.byte 3 /* do_grant_table_op */ /* 20 */
.byte 2 /* do_vm_assist */
.byte 4 /* do_update_va_mapping_otherdomain */
- .byte 0 /* do_switch_to_user */
+ .byte 0 /* do_iret */
.byte 3 /* do_vcpu_op */
.byte 2 /* do_set_segment_base */ /* 25 */
.byte 4 /* do_mmuext_op */
.byte 1 /* do_acm_op */
+ .byte 2 /* do_nmi_op */
.rept NR_hypercalls-(.-hypercall_args_table)
.byte 0 /* do_ni_hypercall */
.endr
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 08e0f88bb8..085fb4d22e 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -80,7 +80,7 @@ void __init paging_init(void)
l2_pgentry_t *l2_ro_mpt;
struct pfn_info *pg;
- idle0_vcpu.arch.monitor_table = mk_pagetable(__pa(idle_pg_table));
+ idle_vcpu[0]->arch.monitor_table = mk_pagetable(__pa(idle_pg_table));
/* Create user-accessible L2 directory to map the MPT for guests. */
l3_ro_mpt = alloc_xenheap_page();
@@ -119,6 +119,12 @@ void __init paging_init(void)
/* Set up linear page table mapping. */
idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)] =
l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR);
+
+ /* Install per-domain mappings for idle domain. */
+ idle_pg_table[l4_table_offset(PERDOMAIN_VIRT_START)] =
+ l4e_from_page(
+ virt_to_page(idle_vcpu[0]->domain->arch.mm_perdomain_l3),
+ __PAGE_HYPERVISOR);
}
void __init zap_low_mappings(void)
diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
index 4f7c822ef8..9756c54589 100644
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -12,6 +12,7 @@
#include <asm/current.h>
#include <asm/flushtlb.h>
#include <asm/msr.h>
+#include <asm/shadow.h>
#include <asm/vmx.h>
void show_registers(struct cpu_user_regs *regs)
@@ -113,6 +114,52 @@ asmlinkage void do_double_fault(struct cpu_user_regs *regs)
__asm__ __volatile__ ( "hlt" );
}
+void toggle_guest_mode(struct vcpu *v)
+{
+ v->arch.flags ^= TF_kernel_mode;
+ __asm__ __volatile__ ( "swapgs" );
+ update_pagetables(v);
+ write_ptbase(v);
+}
+
+long do_iret(void)
+{
+ struct cpu_user_regs *regs = guest_cpu_user_regs();
+ struct iret_context iret_saved;
+ struct vcpu *v = current;
+
+ if ( unlikely(copy_from_user(&iret_saved, (void *)regs->rsp,
+ sizeof(iret_saved))) )
+ domain_crash_synchronous();
+
+ /* Returning to user mode? */
+ if ( (iret_saved.cs & 3) == 3 )
+ {
+ if ( unlikely(pagetable_get_paddr(v->arch.guest_table_user) == 0) )
+ return -EFAULT;
+ toggle_guest_mode(v);
+ }
+
+ regs->rip = iret_saved.rip;
+ regs->cs = iret_saved.cs | 3; /* force guest privilege */
+ regs->rflags = (iret_saved.rflags & ~(EF_IOPL|EF_VM)) | EF_IE;
+ regs->rsp = iret_saved.rsp;
+ regs->ss = iret_saved.ss | 3; /* force guest privilege */
+
+ if ( !(iret_saved.flags & VGCF_IN_SYSCALL) )
+ {
+ regs->entry_vector = 0;
+ regs->r11 = iret_saved.r11;
+ regs->rcx = iret_saved.rcx;
+ }
+
+ /* No longer in NMI context. */
+ clear_bit(_VCPUF_nmi_masked, &current->vcpu_flags);
+
+ /* Saved %rax gets written back to regs->rax in entry.S. */
+ return iret_saved.rax;
+}
+
asmlinkage void syscall_enter(void);
void __init percpu_traps_init(void)
{