-rw-r--r--  tools/xc/lib/xc_linux_build.c                     |  4
-rw-r--r--  tools/xc/lib/xc_netbsd_build.c                    |  6
-rw-r--r--  xen/arch/i386/entry.S                             | 13
-rw-r--r--  xen/arch/i386/traps.c                             | 10
-rw-r--r--  xen/common/domain.c                               |  4
-rw-r--r--  xen/common/keyhandler.c                           |  5
-rw-r--r--  xen/common/schedule.c                             |  2
-rw-r--r--  xen/include/hypervisor-ifs/hypervisor-if.h        | 41
-rw-r--r--  xen/include/xen/event.h                           | 28
-rw-r--r--  xen/include/xen/sched.h                           |  8
-rw-r--r--  xenolinux-2.4.25-sparse/arch/xen/kernel/entry.S   | 22
-rw-r--r--  xenolinux-2.4.25-sparse/arch/xen/kernel/evtchn.c  |  4
-rw-r--r--  xenolinux-2.4.25-sparse/include/asm-xen/evtchn.h  | 10
-rw-r--r--  xenolinux-2.4.25-sparse/include/asm-xen/system.h  | 39
14 files changed, 119 insertions(+), 77 deletions(-)
diff --git a/tools/xc/lib/xc_linux_build.c b/tools/xc/lib/xc_linux_build.c
index 92fff33a6a..3acfc2173d 100644
--- a/tools/xc/lib/xc_linux_build.c
+++ b/tools/xc/lib/xc_linux_build.c
@@ -284,7 +284,9 @@ static int setup_guestos(int xc_handle,
     /* shared_info page starts its life empty. */
     shared_info = map_pfn_writeable(pm_handle, shared_info_frame);
     memset(shared_info, 0, PAGE_SIZE);
-    shared_info->evtchn_upcall_mask = ~0UL; /* mask all upcalls */
+    /* Mask all upcalls... */
+    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
+        shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
     unmap_pfn(pm_handle, shared_info);
 
     /* Send the page update requests down to the hypervisor. */
diff --git a/tools/xc/lib/xc_netbsd_build.c b/tools/xc/lib/xc_netbsd_build.c
index ae7ebecc6e..a53018297c 100644
--- a/tools/xc/lib/xc_netbsd_build.c
+++ b/tools/xc/lib/xc_netbsd_build.c
@@ -75,7 +75,7 @@ static int setup_guestos(int xc_handle,
     shared_info_t *shared_info;
     unsigned long ksize;
     mmu_t *mmu = NULL;
-    int pm_handle;
+    int pm_handle, i;
 
     memset(builddomain, 0, sizeof(*builddomain));
 
@@ -183,7 +183,9 @@ static int setup_guestos(int xc_handle,
     /* shared_info page starts its life empty. */
     shared_info = map_pfn_writeable(pm_handle, shared_info_frame);
     memset(shared_info, 0, PAGE_SIZE);
-    shared_info->evtchn_upcall_mask = ~0UL; /* mask all upcalls */
+    /* Mask all upcalls... */
+    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
+        shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
     unmap_pfn(pm_handle, shared_info);
 
     /* Send the page update requests down to the hypervisor. */
diff --git a/xen/arch/i386/entry.S b/xen/arch/i386/entry.S
index eab11e5ad8..0141991704 100644
--- a/xen/arch/i386/entry.S
+++ b/xen/arch/i386/entry.S
@@ -112,8 +112,8 @@ FAILSAFE_SEL = 32
 FAILSAFE_ADDR = 36
 
 /* Offsets in shared_info_t */
-UPCALL_PENDING = 0
-UPCALL_MASK    = 4
+#define UPCALL_PENDING /* 0 */
+#define UPCALL_MASK    1
 
 /* Offsets in guest_trap_bounce */
 GTB_ERROR_CODE   =  0
@@ -368,12 +368,11 @@ test_all_events:
        jnz  process_hyp_events
/*test_guest_events:*/
        movl SHARED_INFO(%ebx),%eax
-       movl UPCALL_MASK(%eax),%ecx
-       notl %ecx
-       andl UPCALL_PENDING(%eax),%ecx  # ECX = pending & ~mask
-       andl $1,%ecx                    # Is bit 0 pending and not masked?
+       testb $0xFF,UPCALL_MASK(%eax)
+       jnz  restore_all_guest
+       testb $0xFF,UPCALL_PENDING(%eax)
        jz   restore_all_guest
-       lock btsl $0,UPCALL_MASK(%eax)  # Upcalls are masked during delivery
+       movb $1,UPCALL_MASK(%eax)       # Upcalls are masked during delivery
/*process_guest_events:*/
        movzwl PROCESSOR(%ebx),%edx
        shl  $4,%edx                    # sizeof(guest_trap_bounce) == 16
diff --git a/xen/arch/i386/traps.c b/xen/arch/i386/traps.c
index d10292f618..d50b101f3a 100644
--- a/xen/arch/i386/traps.c
+++ b/xen/arch/i386/traps.c
@@ -206,7 +206,7 @@ static inline void do_trap(int trapnr, char *str,
     gtb->cs         = ti->cs;
     gtb->eip        = ti->address;
     if ( TI_GET_IF(ti) )
-        set_bit(0, &p->shared_info->evtchn_upcall_mask);
+        p->shared_info->vcpu_data[0].evtchn_upcall_mask = 1;
     return;
 
 fault_in_hypervisor:
@@ -277,9 +277,7 @@ asmlinkage void do_int3(struct pt_regs *regs, long error_code)
     gtb->cs         = ti->cs;
     gtb->eip        = ti->address;
     if ( TI_GET_IF(ti) )
-        set_bit(0, &p->shared_info->evtchn_upcall_mask);
-    return;
-
+        p->shared_info->vcpu_data[0].evtchn_upcall_mask = 1;
 }
 
 asmlinkage void do_double_fault(void)
@@ -353,7 +351,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, long error_code)
     gtb->cs         = ti->cs;
     gtb->eip        = ti->address;
     if ( TI_GET_IF(ti) )
-        set_bit(0, &p->shared_info->evtchn_upcall_mask);
+        p->shared_info->vcpu_data[0].evtchn_upcall_mask = 1;
     return;
 
 fault_in_hypervisor:
@@ -452,7 +450,7 @@ asmlinkage void do_general_protection(struct pt_regs *regs, long error_code)
     gtb->cs         = ti->cs;
     gtb->eip        = ti->address;
     if ( TI_GET_IF(ti) )
-        set_bit(0, &p->shared_info->evtchn_upcall_mask);
+        p->shared_info->vcpu_data[0].evtchn_upcall_mask = 1;
     return;
 
 gp_in_kernel:
diff --git a/xen/common/domain.c b/xen/common/domain.c
index f83562a903..e86a5eba27 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -918,7 +918,9 @@ int construct_dom0(struct task_struct *p,
     /* Set up shared-info area. */
     update_dom_time(p->shared_info);
     p->shared_info->domain_time = 0;
-    p->shared_info->evtchn_upcall_mask = ~0UL; /* mask all upcalls */
+    /* Mask all upcalls... */
+    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
+        p->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
 
     /* Install the new page tables. */
     __cli();
diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c
index 734df5cffa..2f6a38417c 100644
--- a/xen/common/keyhandler.c
+++ b/xen/common/keyhandler.c
@@ -94,8 +94,9 @@ void do_task_queues(u_char key, void *dev_id, struct pt_regs *regs)
         sched_prn_state(p->state);
         printk(", hyp_events = %08x\n", p->hyp_events);
         s = p->shared_info;
-        printk("Guest: upcall_pend = %08lx, upcall_mask = %08lx\n",
-               s->evtchn_upcall_pending, s->evtchn_upcall_mask);
+        printk("Guest: upcall_pend = %02x, upcall_mask = %02x\n",
+               s->vcpu_data[0].evtchn_upcall_pending,
+               s->vcpu_data[0].evtchn_upcall_mask);
         printk("Notifying guest...\n");
         send_guest_virq(p, VIRQ_DEBUG);
     }
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 496b35b9a8..7e8d03dbc0 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -220,7 +220,7 @@ void wake_up(struct task_struct *p)
 static long do_block(void)
 {
     ASSERT(current->domain != IDLE_DOMAIN_ID);
-    clear_bit(0, &current->shared_info->evtchn_upcall_mask);
+    current->shared_info->vcpu_data[0].evtchn_upcall_mask = 0;
     current->state = TASK_INTERRUPTIBLE;
     TRACE_2D(TRC_SCHED_BLOCK, current->domain, current);
     __enter_scheduler();
diff --git a/xen/include/hypervisor-ifs/hypervisor-if.h b/xen/include/hypervisor-ifs/hypervisor-if.h
index 2335ed5ad2..3941b34944 100644
--- a/xen/include/hypervisor-ifs/hypervisor-if.h
+++ b/xen/include/hypervisor-ifs/hypervisor-if.h
@@ -150,6 +150,9 @@ typedef struct
 /* Event channel endpoints per domain. */
 #define NR_EVENT_CHANNELS 1024
 
+/* No support for multi-processor guests. */
+#define MAX_VIRT_CPUS 1
+
 /*
  * Xen/guestos shared data -- pointer provided in start_info.
  * NB. We expect that this struct is smaller than a page.
@@ -157,13 +160,39 @@ typedef struct
 typedef struct shared_info_st {
 
     /*
-     * If bit 0 in evtchn_upcall_pending is transitioned 0->1, and bit 0 in
-     * evtchn_upcall_mask is clear, then an asynchronous upcall is scheduled.
-     * The upcall mask can be used to prevent unbounded reentrancy and stack
-     * overflow (in this way, acts as a kind of interrupt-enable flag).
+     * Per-VCPU information goes here. This will be cleaned up more when Xen
+     * actually supports multi-VCPU guests.
      */
-    unsigned long evtchn_upcall_pending;
-    unsigned long evtchn_upcall_mask;
+    struct {
+        /*
+         * 'evtchn_upcall_pending' is written non-zero by Xen to indicate
+         * a pending notification for a particular VCPU. It is then cleared
+         * by the guest OS /before/ checking for pending work, thus avoiding
+         * a set-and-check race. Note that the mask is only accessed by Xen
+         * on the CPU that is currently hosting the VCPU. This means that the
+         * pending and mask flags can be updated by the guest without special
+         * synchronisation (i.e., no need for the x86 LOCK prefix).
+         * This may seem suboptimal because if the pending flag is set by
+         * a different CPU then an IPI may be scheduled even when the mask
+         * is set. However, note:
+         *  1. The task of 'interrupt holdoff' is covered by the per-event-
+         *     channel mask bits. A 'noisy' event that is continually being
+         *     triggered can be masked at source at this very precise
+         *     granularity.
+         *  2. The main purpose of the per-VCPU mask is therefore to restrict
+         *     reentrant execution: whether for concurrency control, or to
+         *     prevent unbounded stack usage. Whatever the purpose, we expect
+         *     that the mask will be asserted only for short periods at a time,
+         *     and so the likelihood of a 'spurious' IPI is suitably small.
+         * The mask is read before making an event upcall to the guest: a
+         * non-zero mask therefore guarantees that the VCPU will not receive
+         * an upcall activation. The mask is cleared when the VCPU requests
+         * to block: this avoids wakeup-waiting races.
+         */
+        u8 evtchn_upcall_pending;
+        u8 evtchn_upcall_mask;
+        u8 pad0, pad1;
+    } vcpu_data[MAX_VIRT_CPUS];
 
     /*
      * A domain can have up to 1024 "event channels" on which it can send
diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h
index 3dd4cf383e..542cd3c6ef 100644
--- a/xen/include/xen/event.h
+++ b/xen/include/xen/event.h
@@ -18,7 +18,7 @@
  */
 
 /* Schedule an asynchronous callback for the specified domain. */
-static inline void __guest_notify(struct task_struct *p)
+static inline void guest_schedule_to_run(struct task_struct *p)
 {
 #ifdef CONFIG_SMP
     unsigned long flags, cpu_mask;
@@ -41,23 +41,11 @@ static inline void __guest_notify(struct task_struct *p)
 #endif
 }
 
-static inline void guest_notify(struct task_struct *p)
-{
-    /*
-     * Upcall already pending or upcalls masked?
-     * NB. Suitably synchronised on x86:
-     *  We must set the pending bit before checking the mask, but this is
-     *  guaranteed to occur because test_and_set_bit() is an ordering barrier.
-     */
-    if ( !test_and_set_bit(0, &p->shared_info->evtchn_upcall_pending) &&
-         !test_bit(0, &p->shared_info->evtchn_upcall_mask) )
-        __guest_notify(p);
-}
-
-
 /*
  * EVENT-CHANNEL NOTIFICATIONS
- * NB. As in guest_notify, evtchn_set_* is suitably synchronised on x86.
+ * NB. On x86, the atomic bit operations also act as memory barriers. There
+ * is therefore sufficiently strict ordering for this architecture -- others
+ * may require explicit memory barriers.
  */
 
 static inline void evtchn_set_pending(struct task_struct *p, int port)
@@ -66,7 +54,11 @@ static inline void evtchn_set_pending(struct task_struct *p, int port)
     if ( !test_and_set_bit(port, &s->evtchn_pending[0]) &&
          !test_bit        (port, &s->evtchn_mask[0])    &&
          !test_and_set_bit(port>>5, &s->evtchn_pending_sel) )
-        guest_notify(p);
+    {
+        /* The VCPU pending flag must be set /after/ update to evtchn-pend. */
+        p->shared_info->vcpu_data[0].evtchn_upcall_pending = 1;
+        guest_schedule_to_run(p);
+    }
 }
 
 static inline void evtchn_set_exception(struct task_struct *p, int port)
@@ -103,7 +95,7 @@ static inline void send_guest_pirq(struct task_struct *p, int pirq)
 static inline void send_hyp_event(struct task_struct *p, int event)
 {
     if ( !test_and_set_bit(event, &p->hyp_events) )
-        __guest_notify(p);
+        guest_schedule_to_run(p);
 }
 
 /* Called on return from (architecture-dependent) entry.S. */
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 4f506df04b..033f860c01 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -297,10 +297,10 @@ static inline long schedule_timeout(long timeout)
     return 0;
 }
 
-#define signal_pending(_p) \
-    (((_p)->hyp_events != 0) ||                                     \
-     (test_bit(0, &(_p)->shared_info->evtchn_upcall_pending) &&     \
-      !test_bit(0, &(_p)->shared_info->evtchn_upcall_mask)))
+#define signal_pending(_p)                                          \
+    ( (_p)->hyp_events ||                                           \
+      ((_p)->shared_info->vcpu_data[0].evtchn_upcall_pending &&     \
+       !(_p)->shared_info->vcpu_data[0].evtchn_upcall_mask) )
 
 void domain_init(void);
 
diff --git a/xenolinux-2.4.25-sparse/arch/xen/kernel/entry.S b/xenolinux-2.4.25-sparse/arch/xen/kernel/entry.S
index c744f1bdcb..b78c74fd9c 100644
--- a/xenolinux-2.4.25-sparse/arch/xen/kernel/entry.S
+++ b/xenolinux-2.4.25-sparse/arch/xen/kernel/entry.S
@@ -80,7 +80,7 @@ processor = 52
 
 /* Offsets into shared_info_t. */
 #define evtchn_upcall_pending /* 0 */
-#define evtchn_upcall_mask    4
+#define evtchn_upcall_mask    1
 
 ENOSYS = 38
 
@@ -210,14 +210,14 @@ ENTRY(system_call)
        movl %eax,EAX(%esp)             # save the return value
ENTRY(ret_from_sys_call)
        movl SYMBOL_NAME(HYPERVISOR_shared_info),%esi
-       lock btsl $0,evtchn_upcall_mask(%esi)   # make tests atomic
+       movb $1,evtchn_upcall_mask(%esi)        # make tests atomic
ret_syscall_tests:
        cmpl $0,need_resched(%ebx)
        jne reschedule
        cmpl $0,sigpending(%ebx)
        je safesti                      # ensure need_resched updates are seen
signal_return:
-       lock btrl $0,evtchn_upcall_mask(%esi)   # reenable event callbacks
+       movb $0,evtchn_upcall_mask(%esi)        # reenable event callbacks
        movl %esp,%eax
        xorl %edx,%edx
        call SYMBOL_NAME(do_signal)
@@ -254,9 +254,9 @@ ret_from_exception:
 
        ALIGN
reschedule:
-       lock btrl $0,evtchn_upcall_mask(%esi)   # reenable event callbacks
-       call SYMBOL_NAME(schedule)      # test
-       jmp ret_from_sys_call
+       movb $0,evtchn_upcall_mask(%esi)        # reenable event callbacks
+       call SYMBOL_NAME(schedule)              # test
+       jmp ret_from_sys_call
 
ENTRY(divide_error)
        pushl $0                        # no error code
@@ -317,12 +317,12 @@ ENTRY(hypervisor_callback)
        movb CS(%esp),%cl
        test $2,%cl                     # slow return to ring 2 or 3
        jne  ret_syscall_tests
-safesti:lock btrl $0,evtchn_upcall_mask(%esi)  # reenable event callbacks
+safesti:movb $0,evtchn_upcall_mask(%esi)       # reenable event callbacks
scrit:  /**** START OF CRITICAL REGION ****/
-       testb $1,evtchn_upcall_pending(%esi)
+       testb $0xFF,evtchn_upcall_pending(%esi)
        jnz  14f                        # process more events if necessary...
        RESTORE_ALL
-14:    lock btsl $0,evtchn_upcall_mask(%esi)
+14:    movb $1,evtchn_upcall_mask(%esi)
        jmp  11b
ecrit:  /**** END OF CRITICAL REGION ****/
# [How we do the fixup]. We want to merge the current stack frame with the
@@ -351,7 +351,7 @@ critical_region_fixup:
        jmp  11b
 
critical_fixup_table:
-       .byte 0x00,0x00,0x00                  # testb $1,(%esi)
+       .byte 0x00,0x00,0x00                  # testb $0xFF,(%esi)
        .byte 0x00,0x00                       # jnz 14f
        .byte 0x00                            # pop %ebx
        .byte 0x04                            # pop %ecx
@@ -364,7 +364,7 @@ critical_fixup_table:
        .byte 0x20                            # pop %es
        .byte 0x24,0x24,0x24                  # add $4,%esp
        .byte 0x28                            # iret
-       .byte 0x00,0x00,0x00,0x00,0x00,0x00   # lock btsl $0,4(%esi)
+       .byte 0x00,0x00,0x00,0x00             # movb $1,4(%esi)
        .byte 0x00,0x00                       # jmp 11b
 
# Hypervisor uses this for application faults while it executes.
diff --git a/xenolinux-2.4.25-sparse/arch/xen/kernel/evtchn.c b/xenolinux-2.4.25-sparse/arch/xen/kernel/evtchn.c
index c65806a7d9..7425f92047 100644
--- a/xenolinux-2.4.25-sparse/arch/xen/kernel/evtchn.c
+++ b/xenolinux-2.4.25-sparse/arch/xen/kernel/evtchn.c
@@ -50,8 +50,10 @@ void evtchn_do_upcall(struct pt_regs *regs)
 
     local_irq_save(flags);
 
-    while ( synch_test_and_clear_bit(0, &s->evtchn_upcall_pending) )
+    while ( s->vcpu_data[0].evtchn_upcall_pending )
     {
+        s->vcpu_data[0].evtchn_upcall_pending = 0;
+        /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
         l1 = xchg(&s->evtchn_pending_sel, 0);
         while ( (l1i = ffs(l1)) != 0 )
         {
diff --git a/xenolinux-2.4.25-sparse/include/asm-xen/evtchn.h b/xenolinux-2.4.25-sparse/include/asm-xen/evtchn.h
index 8dbb460cda..ececad9447 100644
--- a/xenolinux-2.4.25-sparse/include/asm-xen/evtchn.h
+++ b/xenolinux-2.4.25-sparse/include/asm-xen/evtchn.h
@@ -42,10 +42,12 @@ static inline void unmask_evtchn(int port)
      * a real IO-APIC we 'lose the interrupt edge' if the channel is masked.
      */
     if (  synch_test_bit        (port,    &s->evtchn_pending[0]) &&
-          !synch_test_and_set_bit(port>>5, &s->evtchn_pending_sel) &&
-          !synch_test_and_set_bit(0,       &s->evtchn_upcall_pending) &&
-          !synch_test_bit        (0,       &s->evtchn_upcall_mask) )
-        evtchn_do_upcall(NULL);
+          !synch_test_and_set_bit(port>>5, &s->evtchn_pending_sel) )
+    {
+        s->vcpu_data[0].evtchn_upcall_pending = 1;
+        if ( !s->vcpu_data[0].evtchn_upcall_mask )
+            evtchn_do_upcall(NULL);
+    }
 }
 
 static inline void clear_evtchn(int port)
diff --git a/xenolinux-2.4.25-sparse/include/asm-xen/system.h b/xenolinux-2.4.25-sparse/include/asm-xen/system.h
index 77b325d61a..86d6c7b150 100644
--- a/xenolinux-2.4.25-sparse/include/asm-xen/system.h
+++ b/xenolinux-2.4.25-sparse/include/asm-xen/system.h
@@ -302,42 +302,55 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
 
 #define safe_halt()             ((void)0)
 
-/*
- * Note the use of synch_*_bit() operations in the following. These operations
- * ensure correct serialisation of checks and updates w.r.t. Xen executing on
- * a different CPU.
+/* 
+ * The use of 'barrier' in the following reflects their use as local-lock
+ * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
+ * critical operations are executed. All critical operatiosn must complete
+ * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
+ * includes these barriers, for example.
  */
 
#define __cli()                                                               \
do {                                                                          \
-    synch_set_bit(0, &HYPERVISOR_shared_info->evtchn_upcall_mask);           \
+    HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask = 1;             \
+    barrier();                                                                \
} while (0)
 
#define __sti()                                                               \
do {                                                                          \
    shared_info_t *_shared = HYPERVISOR_shared_info;                          \
-    synch_clear_bit(0, &_shared->evtchn_upcall_mask);                        \
-    if ( unlikely(synch_test_bit(0, &_shared->evtchn_upcall_pending)) )      \
+    barrier();                                                                \
+    _shared->vcpu_data[0].evtchn_upcall_mask = 0;                            \
+    if ( unlikely(_shared->vcpu_data[0].evtchn_upcall_pending) )             \
        evtchn_do_upcall(NULL);                                               \
} while (0)
 
#define __save_flags(x)                                                       \
do {                                                                          \
-    (x) = synch_test_bit(0, &HYPERVISOR_shared_info->evtchn_upcall_mask);    \
+    (x) = HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask;           \
} while (0)
 
-#define __restore_flags(x) do { if (x) __cli(); else __sti(); } while (0)
+#define __restore_flags(x)                                                    \
+do {                                                                          \
+    shared_info_t *_shared = HYPERVISOR_shared_info;                          \
+    barrier();                                                                \
+    if ( (_shared->vcpu_data[0].evtchn_upcall_mask = x) == 0 )                \
+        if ( unlikely(_shared->vcpu_data[0].evtchn_upcall_pending) )          \
+            evtchn_do_upcall(NULL);                                           \
+} while (0)
 
#define __save_and_cli(x)                                                     \
do {                                                                          \
-    (x) = synch_test_and_set_bit(                                            \
-        0, &HYPERVISOR_shared_info->evtchn_upcall_mask);                     \
+    (x) = HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask;           \
+    HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask = 1;             \
+    barrier();                                                                \
} while (0)
 
#define __save_and_sti(x)                                                     \
do {                                                                          \
-    (x) = synch_test_and_clear_bit(                                          \
-        0, &HYPERVISOR_shared_info->evtchn_upcall_mask);                     \
+    barrier();                                                                \
+    (x) = HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask;           \
+    HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask = 0;             \
} while (0)
 
#define local_irq_save(x)       __save_and_cli(x)
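
The guest-side locking idiom that the new per-VCPU u8 fields enable can be seen outside the patch context. The sketch below is a minimal, self-contained illustration and is not Xen code: only the field names evtchn_upcall_pending/evtchn_upcall_mask come from the interface above, while the demo_* names and the plain volatile struct standing in for the real shared-info page are assumptions made for the example.

    #include <stdio.h>

    /* Simplified stand-in for the per-VCPU flags introduced by this patch. */
    struct demo_vcpu_info {
        volatile unsigned char evtchn_upcall_pending;
        volatile unsigned char evtchn_upcall_mask;
    };

    static struct demo_vcpu_info vcpu0;

    /* Stand-in for evtchn_do_upcall(): drain pending work. */
    static void demo_do_upcall(void)
    {
        while ( vcpu0.evtchn_upcall_pending )
        {
            /* Clear /before/ processing, mirroring the set-and-check ordering. */
            vcpu0.evtchn_upcall_pending = 0;
            printf("processing events\n");
        }
    }

    /* __cli()-style: mask upcalls before entering a critical region. */
    static void demo_cli(void)
    {
        vcpu0.evtchn_upcall_mask = 1;
        /* A compiler barrier would follow here; plain C has no barrier(). */
    }

    /* __sti()-style: unmask, then deliver anything that arrived while masked. */
    static void demo_sti(void)
    {
        vcpu0.evtchn_upcall_mask = 0;
        if ( vcpu0.evtchn_upcall_pending )
            demo_do_upcall();
    }

    int main(void)
    {
        demo_cli();
        vcpu0.evtchn_upcall_pending = 1;   /* as if Xen set it while we were masked */
        demo_sti();                        /* delivery happens here, not earlier */
        return 0;
    }

Because only byte-sized stores and loads on the VCPU's own flags are involved, this pattern needs no LOCK-prefixed instructions, which is exactly the simplification the patch makes in the xenolinux entry.S and system.h hunks.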
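On the sending side, the ordering that event.h now documents -- per-port pending bit, then selector word, then the per-VCPU pending flag, and only then the cross-CPU kick -- can also be shown in miniature. Again this is an illustrative sketch rather than Xen code: the demo_* names, the reduced field layout, and the use of C11 atomics in place of Xen's bit operations are assumptions made for the example.

    #include <stdatomic.h>
    #include <stdio.h>

    /* Simplified stand-in for the notification state in shared_info_t. */
    struct demo_shared {
        atomic_ulong evtchn_pending[32];   /* 1024 ports, 32 per word */
        atomic_ulong evtchn_mask[32];
        atomic_ulong evtchn_pending_sel;   /* one bit per pending word */
        atomic_uchar vcpu_upcall_pending;  /* per-VCPU flag (VCPU 0 only) */
    };

    /* Stand-in for guest_schedule_to_run(): kick the CPU hosting the guest. */
    static void demo_kick_vcpu(void)
    {
        printf("kick VCPU\n");
    }

    /* Mirrors evtchn_set_pending(): only the first setter at each level kicks. */
    static void demo_set_pending(struct demo_shared *s, int port)
    {
        int word = port >> 5;
        unsigned long bit = 1UL << (port & 31);
        unsigned long sel = 1UL << word;

        if ( !(atomic_fetch_or(&s->evtchn_pending[word], bit) & bit) &&
             !(atomic_load(&s->evtchn_mask[word]) & bit) &&
             !(atomic_fetch_or(&s->evtchn_pending_sel, sel) & sel) )
        {
            /* The per-VCPU flag is set only after the channel state above. */
            atomic_store(&s->vcpu_upcall_pending, 1);
            demo_kick_vcpu();
        }
    }

    int main(void)
    {
        static struct demo_shared s;     /* zero-initialised */
        demo_set_pending(&s, 3);         /* first event on port 3 kicks the VCPU */
        demo_set_pending(&s, 3);         /* already pending: no second kick */
        return 0;
    }

Setting the per-VCPU flag last means a consumer that clears it before scanning the channel bitmaps (as evtchn_do_upcall does above) cannot observe the flag without the underlying per-port state already being visible.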