-rwxr-xr-x  .bk-to-hg                                                             2
-rwxr-xr-x  .hg-to-bk                                                             3
-rw-r--r--  .rootkeys                                                             3
-rw-r--r--  tools/domain_builder/dom0_ops.h                                      81
-rw-r--r--  xen/arch/i386/apic.c                                                  8
-rw-r--r--  xen/common/ac_timer.c                                               166
-rw-r--r--  xen/common/dom0_ops.c                                                28
-rw-r--r--  xen/common/domain.c                                                  18
-rw-r--r--  xen/common/keyhandler.c                                              15
-rw-r--r--  xen/common/schedule.c                                               556
-rw-r--r--  xen/include/xeno/ac_timer.h                                          10
-rw-r--r--  xen/include/xeno/dom0_ops.h                                          21
-rw-r--r--  xen/include/xeno/perfc_defn.h                                        13
-rw-r--r--  xen/include/xeno/sched.h                                             58
-rw-r--r--  xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/Makefile          2
-rw-r--r--  xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h       33
-rw-r--r--  xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/sched_ops.c     137
17 files changed, 814 insertions(+), 340 deletions(-)
diff --git a/.bk-to-hg b/.bk-to-hg
index a1e10bd62c..5bd381ab73 100755
--- a/.bk-to-hg
+++ b/.bk-to-hg
@@ -2,5 +2,7 @@
set -e
test -L old/xenolinux-2.4.16-sparse/include/asm-xeno/hypervisor-ifs
rm old/xenolinux-2.4.16-sparse/include/asm-xeno/hypervisor-ifs
+test -L tools/domain_builder/dom0_ops.h
+rm tools/domain_builder/dom0_ops.h
(find -depth -type d -print | xargs -r rmdir 2>/dev/null) || true
exit 0
diff --git a/.hg-to-bk b/.hg-to-bk
index 817e56037e..dc30e75316 100755
--- a/.hg-to-bk
+++ b/.hg-to-bk
@@ -5,5 +5,8 @@ mkdir -p old/xenolinux-2.4.16-sparse
mkdir -p old/xenolinux-2.4.16-sparse/include
mkdir -p old/xenolinux-2.4.16-sparse/include/asm-xeno
ln -s ../../../xen-2.4.16/include/hypervisor-ifs old/xenolinux-2.4.16-sparse/include/asm-xeno/hypervisor-ifs
+mkdir -p tools
+mkdir -p tools/domain_builder
+ln -s ../../xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h tools/domain_builder/dom0_ops.h
(find -depth -type d -print | xargs -r rmdir 2>/dev/null) || true
exit 0
diff --git a/.rootkeys b/.rootkeys
index dc1f07ed27..cb721ac57b 100644
--- a/.rootkeys
+++ b/.rootkeys
@@ -182,7 +182,7 @@
3e4d00468-FN2VDeEHo96zxrMHK_mA tools/domain_builder/Makefile
3e4d0046SPau_y0sw2WLJz8QkqNoRA tools/domain_builder/README
3e4d0046bbdH0GsI9J_1Eb4ZQHfIiQ tools/domain_builder/dom0_defs.h
-3e4d0046RgYCfGOw6qGz_7kYLMV2Vw tools/domain_builder/dom0_ops.h
+3e71f9b871pvOAxDrhxpC4N4mHkbww tools/domain_builder/dom0_ops.h
3e4d0046ouLij_CMN_j7-dUHZIBI_A tools/domain_builder/dom_builder.c
3e4d0046EKs06fY0CWDEgZQcn7DYUg tools/domain_builder/dom_kill.c
3e4d0046aPbGiRTtdWxqY5b3ytWurA tools/domain_builder/hypervisor_defs.h
@@ -487,6 +487,7 @@
3e5a4e65BXtftInNHUC2PjDfPhdZZA xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_core.c
3e5a4e65uXAx05p6B1-HU2tijuw8qA xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_memory.c
3e5a4e65EOOLlPwXnhSuX-iVdWLmnA xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h
+3e6dba59C8o0kBks7UZ4IW_FY853Aw xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/sched_ops.c
3e5a4e65gfn_ltB8ujHMVFApnTTNRQ xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/vfr.c
3e5a4e65gZBRBB6RsSVg1c9iahigAw xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/network/Makefile
3e5a4e65ZxKrbFetVB84JhrTyZ1YuQ xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/network/network.c
diff --git a/tools/domain_builder/dom0_ops.h b/tools/domain_builder/dom0_ops.h
deleted file mode 100644
index 6c60a93ff6..0000000000
--- a/tools/domain_builder/dom0_ops.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/******************************************************************************
- * dom0_ops.h
- *
- * Process command requests from domain-0 guest OS.
- *
- * Copyright (c) 2002, K A Fraser, B Dragovic
- */
-
-#define DOM0_NEWDOMAIN 0
-#define DOM0_KILLDOMAIN 1
-#define DOM0_GETMEMLIST 2
-#define DOM0_STARTDOM 4
-#define MAP_DOM_MEM 6 /* Not passed down to Xen */
-#define DO_PGUPDATES 7 /* Not passed down to Xen */
-#define MAX_CMD 8
-
-#define MAX_CMD_LEN 256
-
-typedef struct dom0_newdomain_st
-{
- unsigned int domain;
- unsigned int memory_kb;
- unsigned int num_vifs; // temporary
- unsigned long pg_head; // return parameter
-} dom0_newdomain_t;
-
-typedef struct dom0_killdomain_st
-{
- unsigned int domain;
- int force;
-} dom0_killdomain_t;
-
-typedef struct dom0_getmemlist_st
-{
- unsigned long start_pfn;
- unsigned long num_pfns;
- void *buffer;
-} dom0_getmemlist_t;
-
-/* This is entirely processed by XenoLinux */
-typedef struct dom_mem
-{
- unsigned int domain;
- unsigned long vaddr;
- unsigned long start_pfn;
- int tot_pages;
-} dom_mem_t;
-
-/* This is entirely processed by XenoLinux */
-typedef struct dom_pgupdate
-{
- unsigned long pgt_update_arr;
- unsigned long num_pgt_updates;
-} dom_pgupdate_t;
-
-typedef struct domain_launch
-{
- unsigned int domain;
- unsigned long l2_pgt_addr;
- unsigned long virt_load_addr;
- unsigned long virt_shinfo_addr;
- unsigned long virt_startinfo_addr;
- unsigned int num_vifs;
- char cmd_line[MAX_CMD_LEN];
-} dom_meminfo_t;
-
-typedef struct dom0_op_st
-{
- unsigned long cmd;
- union
- {
- dom0_newdomain_t newdomain;
- dom0_killdomain_t killdomain;
- dom0_getmemlist_t getmemlist;
- dom_mem_t dommem;
- dom_pgupdate_t pgupdate;
- dom_meminfo_t meminfo;
- }
- u;
-} dom0_op_t;
-
diff --git a/xen/arch/i386/apic.c b/xen/arch/i386/apic.c
index 865a279d8c..0acf7067c3 100644
--- a/xen/arch/i386/apic.c
+++ b/xen/arch/i386/apic.c
@@ -659,6 +659,13 @@ int reprogram_ac_timer(s_time_t timeout)
s_time_t expire;
u64 apic_tmict;
+ if (timeout == 0) {
+ /* XXX RN: not sure if this disables it or causes the interrupt to
+ * go off immediately */
+ apic_tmict = 0;
+ goto reprogram;
+ }
+
now = NOW();
expire = timeout - now; /* value from now */
@@ -680,6 +687,7 @@ int reprogram_ac_timer(s_time_t timeout)
return 0;
}
+ reprogram:
/* programm timer */
apic_write(APIC_TMICT, (unsigned long)apic_tmict);
diff --git a/xen/common/ac_timer.c b/xen/common/ac_timer.c
index dc70de4e0c..73ac893e08 100644
--- a/xen/common/ac_timer.c
+++ b/xen/common/ac_timer.c
@@ -23,7 +23,6 @@
#include <xeno/errno.h>
#include <xeno/sched.h>
#include <xeno/lib.h>
-#include <xeno/config.h>
#include <xeno/smp.h>
#include <xeno/perfc.h>
@@ -41,10 +40,10 @@
#define TRC(_x)
#endif
-/*
+/*****************************************************************************
* We pull handlers off the timer list this far in future,
* rather than reprogramming the time hardware.
- */
+ *****************************************************************************/
#define TIMER_SLOP (50*1000) /* ns */
/* A timer list per CPU */
@@ -58,30 +57,29 @@ static ac_timers_t ac_timers[NR_CPUS];
/* local prototypes */
static int detach_ac_timer(struct ac_timer *timer);
-/*static void ac_timer_debug(unsigned long);*/
-/*
+
+/*****************************************************************************
* add a timer.
* return value:
* 0: success
* 1: failure, timer in the past or timeout value to small
* -1: failure, timer uninitialised
* fail
- */
+ *****************************************************************************/
int add_ac_timer(struct ac_timer *timer)
{
- int cpu = smp_processor_id();
- unsigned long flags;
- s_time_t now;
+ int cpu = smp_processor_id();
+ unsigned long flags;
+ s_time_t now;
/* make sure timeout value is in the future */
+
now = NOW();
- TRC(printk("ACT [%02d] add(): now=%lld timo=%lld\n",
- cpu, now, timer->expires));
- if (timer->expires <= now) {
- printk("ACT[%02d] add_ac_timer: now=0x%08X%08X > expire=0x%08X%08X\n",
- cpu, (u32)(now>>32), (u32)now,
- (u32)(timer->expires>>32), (u32)timer->expires);
+ if (timer->expires <= now) {
+ TRC(printk("ACT[%02d] add_ac_timer:now=0x%08X%08X>expire=0x%08X%08X\n",
+ cpu, (u32)(now>>32), (u32)now,
+ (u32)(timer->expires>>32), (u32)timer->expires));
return 1;
}
spin_lock_irqsave(&ac_timers[cpu].lock, flags);
@@ -90,71 +88,57 @@ int add_ac_timer(struct ac_timer *timer)
* reprogramm the timer
*/
if (list_empty(&ac_timers[cpu].timers)) {
- /* Reprogramm and add to head of list */
if (!reprogram_ac_timer(timer->expires)) {
- /* failed */
- printk("ACT [%02d] add(): add at head failed\n", cpu);
+ printk("ACT[%02d] add at head failed\n", cpu);
spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
- return 1;
+ return 1; /* failed */
}
list_add(&timer->timer_list, &ac_timers[cpu].timers);
- TRC(printk("ACT [%02d] add(0x%08X%08X): added at head\n", cpu,
- (u32)(timer->expires>>32), (u32)timer->expires));
} else {
struct list_head *pos;
- struct ac_timer *t;
- for (pos = ac_timers[cpu].timers.next;
- pos != &ac_timers[cpu].timers;
- pos = pos->next) {
+ struct ac_timer *t;
+
+ list_for_each(pos, &ac_timers[cpu].timers) {
t = list_entry(pos, struct ac_timer, timer_list);
if (t->expires > timer->expires)
break;
}
+ list_add (&(timer->timer_list), pos->prev);
- if (pos->prev == &ac_timers[cpu].timers) {
- /* added to head, reprogramm timer */
+ if (timer->timer_list.prev == &ac_timers[cpu].timers) {
+ /* added at head */
if (!reprogram_ac_timer(timer->expires)) {
- /* failed */
- TRC(printk("ACT [%02d] add(): add at head failed\n", cpu));
+ printk("ACT[%02d] add at head failed\n", cpu);
+ detach_ac_timer(timer);
spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
- return 1;
+ return 1; /* failed */
}
- list_add (&(timer->timer_list), pos->prev);
- TRC(printk("ACT [%02d] add(0x%08X%08X): added at head\n", cpu,
- (u32)(timer->expires>>32), (u32)timer->expires));
- } else {
- list_add (&(timer->timer_list), pos->prev);
- TRC(printk("ACT [%02d] add(0x%08X%08X): add < exp=0x%08X%08X\n",
- cpu,
- (u32)(timer->expires>>32), (u32)timer->expires,
- (u32)(t->expires>>32), (u32)t->expires));
}
}
spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
return 0;
}
-/*
- * remove a timer
+/*****************************************************************************
+ * detach a timer (no locking)
* return values:
* 0: success
* -1: bogus timer
- */
+ *****************************************************************************/
static int detach_ac_timer(struct ac_timer *timer)
{
- TRC(int cpu = smp_processor_id());
TRC(printk("ACT [%02d] detach(): \n", cpu));
list_del(&timer->timer_list);
timer->timer_list.next = NULL;
return 0;
}
-/*
+/*****************************************************************************
* remove a timer
* return values:
* 0: success
* -1: bogus timer
- */
+ *****************************************************************************/
int rem_ac_timer(struct ac_timer *timer)
{
int cpu = smp_processor_id();
@@ -163,19 +147,30 @@ int rem_ac_timer(struct ac_timer *timer)
TRC(printk("ACT [%02d] remove(): timo=%lld \n", cpu, timer->expires));
spin_lock_irqsave(&ac_timers[cpu].lock, flags);
- if (timer->timer_list.next)
- res = detach_ac_timer(timer);
+ if (timer->timer_list.next) {
+ res = detach_ac_timer(timer);
+
+ if (timer->timer_list.prev == &ac_timers[cpu].timers) {
+ /* just removed the head */
+ if (list_empty(&ac_timers[cpu].timers)) {
+ reprogram_ac_timer((s_time_t) 0);
+ }
+ /* XXX should actually reprogram APIC to new head */
+ }
+ } else
+ res = -1;
+
spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
return res;
}
-/*
+/*****************************************************************************
* modify a timer, i.e., set a new timeout value
* return value:
* 0: sucess
* -1: error
- */
+ *****************************************************************************/
int mod_ac_timer(struct ac_timer *timer, s_time_t new_time)
{
if (rem_ac_timer(timer) != 0)
@@ -186,10 +181,10 @@ int mod_ac_timer(struct ac_timer *timer, s_time_t new_time)
return 0;
}
-/*
+/*****************************************************************************
* do_ac_timer
* deal with timeouts and run the handlers
- */
+ *****************************************************************************/
void do_ac_timer(void)
{
int cpu = smp_processor_id();
@@ -206,15 +201,21 @@ void do_ac_timer(void)
/* Sanity: is the timer list empty? */
if ( list_empty(&ac_timers[cpu].timers) ) {
- printk("ACT[%02d] do_ac_timer(): timer irq without timer\n", cpu);
+ /*
+ * XXX RN: This shouldn't happen, but does! Two possibilities:
+ * - Race condition between removing and resetting APIC
+ * - setting an APIC timeout value of 0 causes an immediate
+ * timer interrupt to fire.
+ * None of these should be critical!
+ */
+ spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
return;
}
/* Handle all timeouts in the near future. */
while ( !list_empty(&ac_timers[cpu].timers) )
{
- t = list_entry(ac_timers[cpu].timers.next,
- struct ac_timer, timer_list);
+ t = list_entry(ac_timers[cpu].timers.next,struct ac_timer, timer_list);
if ( t->expires > (NOW() + TIMER_SLOP) ) break;
/* do some stats */
@@ -232,8 +233,7 @@ void do_ac_timer(void)
/* If list not empty then reprogram timer to new head of list */
if ( !list_empty(&ac_timers[cpu].timers) )
{
- t = list_entry(ac_timers[cpu].timers.next,
- struct ac_timer, timer_list);
+ t = list_entry(ac_timers[cpu].timers.next,struct ac_timer, timer_list);
if ( t->expires > 0 )
{
TRC(printk("ACT [%02d] do(): reprog timo=%lld\n",cpu,t->expires));
@@ -243,16 +243,18 @@ void do_ac_timer(void)
goto do_timer_again;
}
}
+ } else {
+ reprogram_ac_timer((s_time_t) 0);
}
spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
TRC(printk("ACT [%02d] do(): end\n", cpu));
}
-/*
+/*****************************************************************************
* debug dump_queue
* arguments: queue head, name of queue
- */
+ *****************************************************************************/
static void dump_tqueue(struct list_head *queue, char *name)
{
struct list_head *list;
@@ -272,7 +274,6 @@ static void dump_tqueue(struct list_head *queue, char *name)
return;
}
-
void dump_timerq(u_char key, void *dev_id, struct pt_regs *regs)
{
u_long flags;
@@ -304,3 +305,50 @@ void __init ac_timer_init(void)
spin_lock_init(&ac_timers[i].lock);
}
}
+
+/*****************************************************************************
+ * GRAVEYARD
+ *****************************************************************************/
+
+#if 0
+
+#ifdef AC_TIMER_STATS
+#define BUCKETS 1000
+#define MAX_STATS
+typedef struct act_stats_st
+{
+ u32 count;
+ u32 times[2*(BUCKETS)];
+} __cacheline_aligned act_stats_t;
+static act_stats_t act_stats[NR_CPUS];
+
+#endif
+
+#ifdef AC_TIMER_STATS
+ {
+ XXX this is at the wrong place
+ s32 diff;
+ u32 i;
+ diff = ((s32)(NOW() - t->expires)) / 1000; /* delta in us */
+ if (diff < -BUCKETS)
+ diff = -BUCKETS;
+ else if (diff > BUCKETS)
+ diff = BUCKETS;
+ act_stats[cpu].times[diff+BUCKETS]++;
+ act_stats[cpu].count++;
+
+ if (act_stats[cpu].count >= 5000) {
+ printk("ACT Stats\n");
+ for (i=0; i < 2*BUCKETS; i++) {
+ if (act_stats[cpu].times[i] != 0)
+ printk("ACT [%02d]: %3dus: %5d\n",
+ cpu,i-BUCKETS, act_stats[cpu].times[i]);
+ act_stats[cpu].times[i]=0;
+ }
+ act_stats[cpu].count = 0;
+ printk("\n");
+ }
+ }
+#endif
+
+#endif /* 0 */
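
The ac_timer interface above (init_ac_timer, add_ac_timer, rem_ac_timer, mod_ac_timer) is what the new scheduler relies on for its per-CPU s_timer. A minimal, hypothetical caller is sketched below; it is not part of the patch, and the header names and the NOW()/MILLISECS() helpers are simply taken from their use elsewhere in this changeset.

/* Sketch only: arm a one-shot timer 10ms from now, much as schedule()
 * arms schedule_data[cpu].s_timer further down in this changeset. */
#include <xeno/ac_timer.h>
#include <xeno/time.h>

static void example_handler(unsigned long data)
{
    /* called from do_ac_timer() once the expiry time has passed */
}

static struct ac_timer example_timer;

static void arm_example(void)
{
    init_ac_timer(&example_timer);            /* timer_list.next = NULL */
    example_timer.function = &example_handler;
    example_timer.data     = 0;
    example_timer.expires  = NOW() + MILLISECS(10);

    /* add_ac_timer() returns 1 if the expiry is already in the past */
    while ( add_ac_timer(&example_timer) == 1 )
        example_timer.expires = NOW() + MILLISECS(10);
}
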
diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c
index e451a8f3e7..e6d54e9695 100644
--- a/xen/common/dom0_ops.c
+++ b/xen/common/dom0_ops.c
@@ -126,6 +126,34 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
}
break;
+ case DOM0_BVTCTL:
+ {
+ unsigned long ctx_allow = op.u.bvtctl.ctx_allow;
+ ret = sched_bvtctl(ctx_allow);
+
+ }
+ break;
+
+ case DOM0_ADJUSTDOM:
+ {
+ unsigned int dom = op.u.adjustdom.domain;
+ unsigned long mcu_adv = op.u.adjustdom.mcu_adv;
+ unsigned long warp = op.u.adjustdom.warp;
+ unsigned long warpl = op.u.adjustdom.warpl;
+ unsigned long warpu = op.u.adjustdom.warpu;
+
+
+ if ( dom == IDLE_DOMAIN_ID )
+ {
+ ret = -EPERM;
+ }
+ else
+ {
+ ret = sched_adjdom(dom, mcu_adv, warp, warpl, warpu);
+ }
+ }
+ break;
+
case DOM0_GETMEMLIST:
{
int i;
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 32bf8b7172..5fc4304c01 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -58,10 +58,11 @@ struct task_struct *do_newdomain(unsigned int dom_id, unsigned int cpu)
SET_GDT_ADDRESS(p, DEFAULT_GDT_ADDRESS);
p->addr_limit = USER_DS;
- p->state = TASK_UNINTERRUPTIBLE;
p->active_mm = &p->mm;
p->num_net_vifs = 0;
+ sched_add_domain(p);
+
p->net_ring_base = (net_ring_t *)(p->shared_info + 1);
INIT_LIST_HEAD(&p->pg_head);
p->max_pages = p->tot_pages = 0;
@@ -115,7 +116,8 @@ void kill_domain(void)
}
printk("Killing domain %d\n", current->domain);
- current->state = TASK_DYING;
+
+ sched_rem_domain(current);
schedule();
BUG(); /* never get here */
}
@@ -293,7 +295,7 @@ int final_setup_guestos(struct task_struct * p, dom_meminfo_t * meminfo)
/* set up the shared info structure */
update_dom_time(p->shared_info);
- p->shared_info->cpu_freq = cpu_freq;
+ p->shared_info->cpu_freq = cpu_freq;
p->shared_info->domain_time = 0;
/* we pass start info struct to guest os as function parameter on stack */
@@ -516,8 +518,8 @@ int setup_guestos(struct task_struct *p, dom0_newdomain_t *params)
unmap_domain_mem(l1start);
/* Set up shared info area. */
- update_dom_time(p->shared_info);
- p->shared_info->cpu_freq = cpu_freq;
+ update_dom_time(p->shared_info);
+ p->shared_info->cpu_freq = cpu_freq;
p->shared_info->domain_time = 0;
@@ -555,7 +557,7 @@ int setup_guestos(struct task_struct *p, dom0_newdomain_t *params)
#define SHIP2GUEST(_x) (virt_shinfo_address | (((unsigned long)(_x)) & 0xFFF))
virt_startinfo_address->net_rings =
- (net_ring_t *)SHIP2GUEST(p->net_ring_base);
+ (net_ring_t *)SHIP2GUEST(p->net_ring_base);
virt_startinfo_address->num_net_rings = p->num_net_vifs;
/* Add block io interface */
@@ -597,7 +599,5 @@ int setup_guestos(struct task_struct *p, dom0_newdomain_t *params)
void __init domain_init(void)
{
- printk("Initialising domains\n");
+ printk("Initialising domains\n");
}
-
-
diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c
index 19943fff3e..dde9e0ff10 100644
--- a/xen/common/keyhandler.c
+++ b/xen/common/keyhandler.c
@@ -1,9 +1,6 @@
#include <xeno/keyhandler.h>
#include <xeno/reboot.h>
-extern void perfc_printall (u_char key, void *dev_id, struct pt_regs *regs);
-extern void perfc_reset (u_char key, void *dev_id, struct pt_regs *regs);
-
#define KEY_MAX 256
#define STR_MAX 64
@@ -117,6 +114,12 @@ void do_task_queues(u_char key, void *dev_id, struct pt_regs *regs)
}
+extern void perfc_printall (u_char key, void *dev_id, struct pt_regs *regs);
+extern void perfc_reset (u_char key, void *dev_id, struct pt_regs *regs);
+extern void dump_timerq(u_char key, void *dev_id, struct pt_regs *regs);
+extern void dump_runq(u_char key, void *dev_id, struct pt_regs *regs);
+
+
void initialize_keytable()
{
int i;
@@ -126,13 +129,15 @@ void initialize_keytable()
key_table[i].handler = (key_handler *)NULL;
/* setup own handlers */
+ add_key_handler('a', dump_timerq, "dump ac_timer queues");
add_key_handler('d', dump_registers, "dump registers");
add_key_handler('h', show_handlers, "show this message");
add_key_handler('p', perfc_printall, "print performance counters");
add_key_handler('P', perfc_reset, "reset performance counters");
add_key_handler('q', do_task_queues, "dump task queues + guest state");
- add_key_handler('B', kill_dom0, "reboot machine gracefully");
- add_key_handler('R', halt_machine, "reboot machine ungracefully");
+ add_key_handler('r', dump_runq, "dump run queues");
+ add_key_handler('B', kill_dom0, "reboot machine gracefully");
+ add_key_handler('R', halt_machine, "reboot machine ungracefully");
return;
}
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 787b43d900..ce46069167 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -11,7 +11,8 @@
*
* Environment: Xen Hypervisor
* Description: CPU scheduling
- * partially moved from domain.c
+ * implements A Borrowed Virtual Time scheduler.
+ * (see Duda & Cheriton SOSP'99)
*
****************************************************************************
* $Id: c-insert.c,v 1.7 2002/11/08 16:04:34 rn Exp $
@@ -28,6 +29,9 @@
#include <xeno/ac_timer.h>
#include <xeno/interrupt.h>
+#include <xeno/perfc.h>
+
+
#undef SCHEDULER_TRACE
#ifdef SCHEDULER_TRACE
#define TRC(_x) _x
@@ -35,80 +39,106 @@
#define TRC(_x)
#endif
-/*
+
+#define MCU (s32)MICROSECS(100) /* Minimum unit */
+static s32 ctx_allow=(s32)MILLISECS(10); /* context switch allowance */
+
+/*****************************************************************************
* per CPU data for the scheduler.
- */
+ *****************************************************************************/
typedef struct schedule_data_st
{
- spinlock_t lock;
- struct list_head runqueue;
- struct task_struct *prev, *curr;
+ spinlock_t lock; /* lock for protecting this */
+ struct list_head runqueue; /* runqueue */
+ struct task_struct *prev, *curr; /* previous and current task */
+ struct task_struct *idle; /* idle task for this cpu */
+ u32 svt; /* system virtual time. per CPU??? */
+ struct ac_timer s_timer; /* scheduling timer */
+
} __cacheline_aligned schedule_data_t;
schedule_data_t schedule_data[NR_CPUS];
-static __cacheline_aligned struct ac_timer s_timer[NR_CPUS];
+struct ac_timer v_timer; /* scheduling timer */
+static void virt_timer(unsigned long foo);
-/*
- * Some convenience functions
- */
-static inline void __add_to_runqueue(struct task_struct * p)
+/*****************************************************************************
+ * Some convenience functions
+ *****************************************************************************/
+/* add a task to the head of the runqueue */
+static inline void __add_to_runqueue_head(struct task_struct * p)
{
+
list_add(&p->run_list, &schedule_data[p->processor].runqueue);
}
-
-static inline void __move_last_runqueue(struct task_struct * p)
+/* add a task to the tail of the runqueue */
+static inline void __add_to_runqueue_tail(struct task_struct * p)
{
- list_del(&p->run_list);
list_add_tail(&p->run_list, &schedule_data[p->processor].runqueue);
}
-static inline void __move_first_runqueue(struct task_struct * p)
-{
- list_del(&p->run_list);
- list_add(&p->run_list, &schedule_data[p->processor].runqueue);
-}
-
+/* remove a task from runqueue */
static inline void __del_from_runqueue(struct task_struct * p)
{
list_del(&p->run_list);
p->run_list.next = NULL;
}
-
+/* is task on run queue? */
static inline int __task_on_runqueue(struct task_struct *p)
{
return (p->run_list.next != NULL);
}
+#define next_domain(p) \
+ list_entry((p)->run_list.next, struct task_struct, run_list)
-/*
- * Add a new domain to the scheduler
- */
+/******************************************************************************
+* Add and remove a domain
+******************************************************************************/
void sched_add_domain(struct task_struct *p)
{
- p->state = TASK_UNINTERRUPTIBLE;
+ p->state = TASK_UNINTERRUPTIBLE;
+ p->mcu_advance = 10;
+
+ if (p->domain == IDLE_DOMAIN_ID) {
+ p->avt = 0xffffffff;
+ p->evt = 0xffffffff;
+ schedule_data[p->processor].idle = p;
+ } else {
+ /* set avt end evt to system virtual time */
+ p->avt = schedule_data[p->processor].svt;
+ p->evt = schedule_data[p->processor].svt;
+ /* RN: XXX BVT fill in other bits */
+ }
}
-/*
- * Remove domain to the scheduler
- */
void sched_rem_domain(struct task_struct *p)
{
p->state = TASK_DYING;
}
-/*
+/****************************************************************************
* wake up a domain which had been sleeping
- */
+ ****************************************************************************/
int wake_up(struct task_struct *p)
{
unsigned long flags;
int ret = 0;
+
spin_lock_irqsave(&schedule_data[p->processor].lock, flags);
+
if ( __task_on_runqueue(p) ) goto out;
+
p->state = TASK_RUNNING;
- __add_to_runqueue(p);
+ __add_to_runqueue_head(p);
+
+ /* set the BVT parameters */
+ if (p->avt < schedule_data[p->processor].svt)
+ p->avt = schedule_data[p->processor].svt;
+
+ p->evt = p->avt; /* RN: XXX BVT deal with warping here */
+
ret = 1;
out:
@@ -116,75 +146,57 @@ int wake_up(struct task_struct *p)
return ret;
}
-static void process_timeout(unsigned long __data)
+/****************************************************************************
+ * Domain requested scheduling operations
+ ****************************************************************************/
+long do_sched_op(void)
{
- struct task_struct * p = (struct task_struct *) __data;
- wake_up(p);
+ /* XXX implement proper */
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ return 0;
}
-long schedule_timeout(long timeout)
+/****************************************************************************
+ * Control the scheduler
+ ****************************************************************************/
+long sched_bvtctl(unsigned long c_allow)
{
- struct timer_list timer;
- unsigned long expire;
-
- switch (timeout)
- {
- case MAX_SCHEDULE_TIMEOUT:
- /*
- * These two special cases are useful to be comfortable in the caller.
- * Nothing more. We could take MAX_SCHEDULE_TIMEOUT from one of the
- * negative value but I' d like to return a valid offset (>=0) to allow
- * the caller to do everything it want with the retval.
- */
- schedule();
- goto out;
- default:
- /*
- * Another bit of PARANOID. Note that the retval will be 0 since no
- * piece of kernel is supposed to do a check for a negative retval of
- * schedule_timeout() (since it should never happens anyway). You just
- * have the printk() that will tell you if something is gone wrong and
- * where.
- */
- if (timeout < 0)
- {
- printk(KERN_ERR "schedule_timeout: wrong timeout "
- "value %lx from %p\n", timeout,
- __builtin_return_address(0));
- current->state = TASK_RUNNING;
- goto out;
- }
- }
-
- expire = timeout + jiffies;
-
- init_timer(&timer);
- timer.expires = expire;
- timer.data = (unsigned long) current;
- timer.function = process_timeout;
-
- add_timer(&timer);
- schedule();
- del_timer_sync(&timer);
-
- timeout = expire - jiffies;
-
- out:
- return timeout < 0 ? 0 : timeout;
+ printk("sched: bvtctl %lu\n", c_allow);
+ ctx_allow = c_allow;
+ return 0;
}
-/* RN: XXX turn this into do_halt() */
-/*
- * yield the current process
- */
-long do_sched_op(void)
+/****************************************************************************
+ * Adjust scheduling parameter for a given domain
+ ****************************************************************************/
+long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp,
+ unsigned long warpl, unsigned long warpu)
{
- current->state = TASK_INTERRUPTIBLE;
- schedule();
+ struct task_struct *p;
+
+ printk("sched: adjdom %02d %lu %lu %lu %lu\n",
+ dom, mcu_adv, warp, warpl, warpu);
+
+ p = find_domain_by_id(dom);
+ if ( p == NULL ) return -ESRCH;
+
+ spin_lock_irq(&schedule_data[p->processor].lock);
+
+ p->mcu_advance = mcu_adv;
+
+ spin_unlock_irq(&schedule_data[p->processor].lock);
+
return 0;
}
-
+/****************************************************************************
+ * cause a run through the scheduler when appropriate
+ * Appropriate is:
+ * - current task is idle task
+ * - the new process's evt is lower than the current one's
+ * - the current task already ran for its context switch allowance
+ ****************************************************************************/
void reschedule(struct task_struct *p)
{
int cpu = p->processor;
@@ -192,16 +204,20 @@ void reschedule(struct task_struct *p)
unsigned long flags;
if (p->has_cpu)
- return;
+ return;
spin_lock_irqsave(&schedule_data[cpu].lock, flags);
curr = schedule_data[cpu].curr;
- if (is_idle_task(curr)) {
+
+ if ( is_idle_task(curr) ||
+ (p->evt < curr->evt) ||
+ (curr->lastschd + ctx_allow >= NOW()) ) {
+ /* reschedule */
set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
spin_unlock_irqrestore(&schedule_data[cpu].lock, flags);
#ifdef CONFIG_SMP
if (cpu != smp_processor_id())
- smp_send_event_check_cpu(cpu);
+ smp_send_event_check_cpu(cpu);
#endif
} else {
spin_unlock_irqrestore(&schedule_data[cpu].lock, flags);
@@ -209,47 +225,154 @@ void reschedule(struct task_struct *p)
}
-/*
- * Pick the next domain to run
- */
-
+/****************************************************************************
+ * The main function
+ * - deschedule the current domain.
+ * - pick a new domain.
+ * i.e., the domain with lowest EVT.
+ * The runqueue should be ordered by EVT so that this is easy.
+ ****************************************************************************/
asmlinkage void schedule(void)
{
- struct task_struct *prev, *next, *p;
- struct list_head *tmp;
- int this_cpu;
-
+ struct task_struct *prev, *next, *next_prime, *p;
+ struct list_head *tmp;
+ int this_cpu;
+ s_time_t now;
+ s32 r_time; /* time for new dom to run */
+ s32 ranfor; /* assume we never run longer than 2.1s! */
+ s32 mcus;
+ u32 next_evt, next_prime_evt, min_avt;
+
+ perfc_incrc(sched_run1);
need_resched_back:
+ perfc_incrc(sched_run2);
+
+ now = NOW();
+ next = NULL;
prev = current;
this_cpu = prev->processor;
+ /* remove timer */
+ rem_ac_timer(&schedule_data[this_cpu].s_timer);
+
+ /*
+ * deschedule the current domain
+ */
+
spin_lock_irq(&schedule_data[this_cpu].lock);
ASSERT(!in_interrupt());
ASSERT(__task_on_runqueue(prev));
- __move_last_runqueue(prev);
+ if (is_idle_task(prev))
+ goto deschedule_done;
- switch ( prev->state )
- {
+ /* do some accounting */
+ ranfor = (s32)(now - prev->lastschd);
+ ASSERT((ranfor>0));
+ prev->cpu_time += ranfor;
+
+ /* calculate mcu and update avt */
+ mcus = ranfor/MCU;
+ if (ranfor % MCU) mcus ++; /* always round up */
+ prev->avt += mcus * prev->mcu_advance;
+ prev->evt = prev->avt; /* RN: XXX BVT deal with warping here */
+
+ /* dequeue */
+ __del_from_runqueue(prev);
+ switch (prev->state) {
case TASK_INTERRUPTIBLE:
- if ( signal_pending(prev) )
- {
- prev->state = TASK_RUNNING;
+ if (signal_pending(prev)) {
+ prev->state = TASK_RUNNING; /* but has events pending */
break;
}
+ case TASK_UNINTERRUPTIBLE:
+ case TASK_WAIT:
+ case TASK_DYING:
default:
- __del_from_runqueue(prev);
+ /* done if not running. Else, continue */
+ goto deschedule_done;
case TASK_RUNNING:;
}
+
+ /* requeue */
+ __add_to_runqueue_tail(prev);
+
+
+ deschedule_done:
clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events);
- next = NULL;
- list_for_each(tmp, &schedule_data[smp_processor_id()].runqueue) {
+ /*
+ * Pick a new domain
+ */
+
+ /* we should at least have the idle task */
+ ASSERT(!list_empty(&schedule_data[this_cpu].runqueue));
+
+ /*
+ * scan through the run queue and pick the task with the lowest evt
+ * *and* the task the second lowest evt.
+ * this code is O(n) but we expect n to be small.
+ */
+ next = schedule_data[this_cpu].idle;
+ next_prime = NULL;
+
+ next_evt = 0xffffffff;
+ next_prime_evt = 0xffffffff;
+ min_avt = 0xffffffff; /* to calculate svt */
+
+
+ list_for_each(tmp, &schedule_data[this_cpu].runqueue) {
p = list_entry(tmp, struct task_struct, run_list);
- next = p;
- if ( !is_idle_task(next) ) break;
+ if (p->evt < next_evt) {
+ next_prime = next;
+ next_prime_evt = next_evt;
+ next = p;
+ next_evt = p->evt;
+ } else if (next_prime_evt == 0xffffffff) {
+ next_prime_evt = p->evt;
+ next_prime = p;
+ } else if (p->evt < next_prime_evt) {
+ next_prime_evt = p->evt;
+ next_prime = p;
+ }
+ /* determine system virtual time */
+ if (p->avt < min_avt)
+ min_avt = p->avt;
}
+ ASSERT(next != NULL); /* we should have at least the idle task */
+
+ /* update system virtual time */
+ if (min_avt != 0xffffffff) schedule_data[this_cpu].svt = min_avt;
+
+ if (is_idle_task(next)) {
+ r_time = ctx_allow;
+ goto sched_done;
+ }
+
+ if (next_prime == NULL || is_idle_task(next_prime)) {
+ /* we have only one runnable task besides the idle task */
+ r_time = 10 * ctx_allow; /* RN: random constant */
+ goto sched_done;
+ }
+
+ /*
+ * if we are here we have two runable tasks.
+ * work out how long 'next' can run till its evt is greater than
+ * 'next_prime's evt. Taking context switch allowance into account.
+ */
+ ASSERT(next_prime->evt > next->evt);
+ r_time = ((next_prime->evt - next->evt)/next->mcu_advance) + ctx_allow;
+
+ sched_done:
+ ASSERT(r_time != 0);
+ ASSERT(r_time > ctx_allow);
+
+ if ( (r_time==0) || (r_time < ctx_allow)) {
+ printk("[%02d]: %lx\n", this_cpu, r_time);
+ dump_rqueue(&schedule_data[this_cpu].runqueue, "foo");
+ }
+
prev->has_cpu = 0;
next->has_cpu = 1;
@@ -257,6 +380,17 @@ asmlinkage void schedule(void)
schedule_data[this_cpu].prev = prev;
schedule_data[this_cpu].curr = next;
+ next->lastschd = now;
+
+ /* reprogramm the timer */
+ timer_redo:
+ schedule_data[this_cpu].s_timer.expires = now + r_time;
+ if (add_ac_timer(&schedule_data[this_cpu].s_timer) == 1) {
+ printk("SCHED[%02d]: Shit this shouldn't happen\n", this_cpu);
+ now = NOW();
+ goto timer_redo;
+ }
+
spin_unlock_irq(&schedule_data[this_cpu].lock);
if ( unlikely(prev == next) )
@@ -266,6 +400,8 @@ asmlinkage void schedule(void)
goto same_process;
}
+ perfc_incrc(sched_ctx);
+
prepare_to_switch();
switch_to(prev, next);
prev = schedule_data[this_cpu].prev;
@@ -274,67 +410,56 @@ asmlinkage void schedule(void)
if ( prev->state == TASK_DYING ) release_task(prev);
same_process:
+ /* update the domains notion of time */
update_dom_time(current->shared_info);
- if ( test_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events) )
+ if ( test_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events) ) {
goto need_resched_back;
+ }
return;
}
/*
- * The scheduling timer.
+ * The scheduler timer.
*/
-static __cacheline_aligned int count[NR_CPUS];
static void sched_timer(unsigned long foo)
{
- int cpu = smp_processor_id();
+ int cpu = smp_processor_id();
struct task_struct *curr = schedule_data[cpu].curr;
- s_time_t now;
- int res;
-
- /* reschedule after each 5 ticks */
- if (count[cpu] >= 5) {
- set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
- count[cpu] = 0;
- }
- count[cpu]++;
+ /* cause a reschedule */
+ set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
+ perfc_incrc(sched_irq);
+}
- /*
- * deliver virtual timer interrups to domains if we are CPU 0 XXX RN: We
- * don't have a per CPU list of domains yet. Otherwise would use that.
- * Plus, this should be removed anyway once Domains "know" about virtual
- * time and timeouts. But, it's better here then where it was before.
- */
- if (cpu == 0) {
- struct task_struct *p;
- unsigned long cpu_mask = 0;
-
- /* send virtual timer interrupt */
- read_lock(&tasklist_lock);
- p = &idle0_task;
- do {
- if ( is_idle_task(p) ) continue;
- cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
- }
- while ( (p = p->next_task) != &idle0_task );
- read_unlock(&tasklist_lock);
- guest_event_notify(cpu_mask);
+/*
+ * The Domain virtual time timer
+ */
+static void virt_timer(unsigned long foo)
+{
+ unsigned long cpu_mask = 0;
+ struct task_struct *p;
+ s_time_t now;
+ int res;
+
+ /* send virtual timer interrupt */
+ read_lock(&tasklist_lock);
+ p = &idle0_task;
+ do {
+ if ( is_idle_task(p) ) continue;
+ cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
}
+ while ( (p = p->next_task) != &idle0_task );
+ read_unlock(&tasklist_lock);
+ guest_event_notify(cpu_mask);
- again:
+ again:
now = NOW();
- s_timer[cpu].expires = now + MILLISECS(10);
- res=add_ac_timer(&s_timer[cpu]);
-
- TRC(printk("SCHED[%02d] timer(): now=0x%08X%08X timo=0x%08X%08X\n",
- cpu, (u32)(now>>32), (u32)now,
- (u32)(s_timer[cpu].expires>>32), (u32)s_timer[cpu].expires));
+ v_timer.expires = now + MILLISECS(10);
+ res=add_ac_timer(&v_timer);
if (res==1)
goto again;
-
}
-
/*
* Initialise the data structures
*/
@@ -350,11 +475,15 @@ void __init scheduler_init(void)
spin_lock_init(&schedule_data[i].lock);
schedule_data[i].prev = &idle0_task;
schedule_data[i].curr = &idle0_task;
-
+
/* a timer for each CPU */
- init_ac_timer(&s_timer[i]);
- s_timer[i].function = &sched_timer;
+ init_ac_timer(&schedule_data[i].s_timer);
+ schedule_data[i].s_timer.function = &sched_timer;
+
}
+ schedule_data[0].idle = &idle0_task; /* idle on CPU 0 is special */
+ init_ac_timer(&v_timer);
+ v_timer.function = &virt_timer;
}
/*
@@ -362,10 +491,121 @@ void __init scheduler_init(void)
* This has to be done *after* the timers, e.g., APICs, have been initialised
*/
void schedulers_start(void)
-{
+{
printk("Start schedulers\n");
__cli();
sched_timer(0);
+ virt_timer(0);
smp_call_function((void *)sched_timer, NULL, 1, 1);
__sti();
}
+
+
+/****************************************************************************
+ * Functions for legacy support.
+ * Schedule timeout is used at a number of places and is a bit meaningless
+ * in the context of Xen, as Domains are not able to call these and all
+ * their entry points into Xen should be asynchronous. If a domain wishes
+ * to block for a while it should use Xen's sched_op entry point.
+ ****************************************************************************/
+
+static void process_timeout(unsigned long __data)
+{
+ struct task_struct * p = (struct task_struct *) __data;
+ wake_up(p);
+}
+
+long schedule_timeout(long timeout)
+{
+ struct timer_list timer;
+ unsigned long expire;
+
+ switch (timeout)
+ {
+ case MAX_SCHEDULE_TIMEOUT:
+ /*
+ * These two special cases are useful to be comfortable in the caller.
+ * Nothing more. We could take MAX_SCHEDULE_TIMEOUT from one of the
+ * negative value but I' d like to return a valid offset (>=0) to allow
+ * the caller to do everything it want with the retval.
+ */
+ schedule();
+ goto out;
+ default:
+ /*
+ * Another bit of PARANOID. Note that the retval will be 0 since no
+ * piece of kernel is supposed to do a check for a negative retval of
+ * schedule_timeout() (since it should never happens anyway). You just
+ * have the printk() that will tell you if something is gone wrong and
+ * where.
+ */
+ if (timeout < 0)
+ {
+ printk(KERN_ERR "schedule_timeout: wrong timeout "
+ "value %lx from %p\n", timeout,
+ __builtin_return_address(0));
+ current->state = TASK_RUNNING;
+ goto out;
+ }
+ }
+
+ expire = timeout + jiffies;
+
+ init_timer(&timer);
+ timer.expires = expire;
+ timer.data = (unsigned long) current;
+ timer.function = process_timeout;
+
+ add_timer(&timer);
+ schedule();
+ del_timer_sync(&timer);
+
+ timeout = expire - jiffies;
+
+ out:
+ return timeout < 0 ? 0 : timeout;
+}
+
+/****************************************************************************
+ * debug function
+ ****************************************************************************/
+
+static void dump_rqueue(struct list_head *queue, char *name)
+{
+ struct list_head *list;
+ int loop = 0;
+ struct task_struct *p;
+
+ printk ("QUEUE %s %lx n: %lx, p: %lx\n", name, (unsigned long)queue,
+ (unsigned long) queue->next, (unsigned long) queue->prev);
+ list_for_each (list, queue) {
+ p = list_entry(list, struct task_struct, run_list);
+ printk("%3d: %3d has=%c mcua=0x%04X ev=0x%08X av=0x%08X c=0x%X%08X\n",
+ loop++, p->domain,
+ p->has_cpu ? 'T':'F',
+ p->mcu_advance, p->evt, p->avt,
+ (u32)(p->cpu_time>>32), (u32)p->cpu_time);
+ printk(" l: %lx n: %lx p: %lx\n",
+ (unsigned long)list, (unsigned long)list->next,
+ (unsigned long)list->prev);
+ }
+ return;
+}
+
+void dump_runq(u_char key, void *dev_id, struct pt_regs *regs)
+{
+ u_long flags;
+ s_time_t now = NOW();
+ int i;
+
+ printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns NOW=0x%08X%08X\n",
+ (u32)MCU, (u32)ctx_allow, (u32)(now>>32), (u32)now);
+ for (i = 0; i < smp_num_cpus; i++) {
+ spin_lock_irqsave(&schedule_data[i].lock, flags);
+ printk("CPU[%02d] svt=0x%08X ", i, (s32)schedule_data[i].svt);
+ dump_rqueue(&schedule_data[i].runqueue, "rq");
+ spin_unlock_irqrestore(&schedule_data[i].lock, flags);
+ }
+ return;
+}
+
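
The accounting in schedule() above is the core of the BVT policy: real run time is rounded up to whole MCUs, the domain's actual virtual time avt advances by mcus * mcu_advance (so a larger mcu_advance means a smaller CPU share), and the winner's timeslice r_time is the virtual-time gap to the runner-up converted back to real time, plus the context switch allowance. A standalone sketch of that arithmetic, with made-up numbers, purely for illustration:

/* Standalone sketch of the BVT accounting used in schedule() above.
 * Constants and field names mirror the patch; the numbers are made up. */
#include <stdio.h>
#include <stdint.h>

typedef int32_t s32;
typedef int64_t s_time_t;

#define MICROSECS(us) ((s_time_t)(us) * 1000LL)     /* ns */
#define MILLISECS(ms) ((s_time_t)(ms) * 1000000LL)  /* ns */
#define MCU ((s32)MICROSECS(100))                   /* minimum charging unit */

int main(void)
{
    s32 ctx_allow = (s32)MILLISECS(10);   /* context switch allowance */
    s32 ranfor    = (s32)MILLISECS(7);    /* how long 'prev' just ran */
    unsigned long mcu_advance = 10;       /* inverse of the domain's weight */

    /* charge prev: round elapsed real time up to whole MCUs */
    s32 mcus = ranfor / MCU;
    if (ranfor % MCU) mcus++;

    uint32_t avt = 1000;                  /* prev's actual virtual time */
    avt += mcus * mcu_advance;
    uint32_t evt = avt;                   /* no warping in this changeset */

    /* timeslice for the winner: run until it would overtake the runner-up */
    uint32_t next_evt = 1500, next_prime_evt = 1800;
    s32 r_time = (s32)((next_prime_evt - next_evt) / mcu_advance) + ctx_allow;

    printf("mcus=%d avt=%u evt=%u r_time=%dns\n",
           (int)mcus, (unsigned)avt, (unsigned)evt, (int)r_time);
    return 0;
}
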
diff --git a/xen/include/xeno/ac_timer.h b/xen/include/xeno/ac_timer.h
index 7cf568d2fc..280f377d17 100644
--- a/xen/include/xeno/ac_timer.h
+++ b/xen/include/xeno/ac_timer.h
@@ -43,10 +43,10 @@
*/
struct ac_timer {
- struct list_head timer_list;
- s_time_t expires; /* system time time out value */
- unsigned long data;
- void (*function)(unsigned long);
+ struct list_head timer_list;
+ s_time_t expires; /* system time time out value */
+ unsigned long data;
+ void (*function)(unsigned long);
};
/* interface for "clients" */
@@ -55,7 +55,7 @@ extern int rem_ac_timer(struct ac_timer *timer);
extern int mod_ac_timer(struct ac_timer *timer, s_time_t new_time);
static inline void init_ac_timer(struct ac_timer *timer)
{
- //timer->next = NULL;
+ timer->timer_list.next = NULL;
}
/* interface used by programmable timer, implemented hardware dependent */
diff --git a/xen/include/xeno/dom0_ops.h b/xen/include/xeno/dom0_ops.h
index 5e498de1bc..c0159d12cc 100644
--- a/xen/include/xeno/dom0_ops.h
+++ b/xen/include/xeno/dom0_ops.h
@@ -4,8 +4,11 @@
* Process command requests from domain-0 guest OS.
*
* Copyright (c) 2002, K A Fraser, B Dragovic
+ *
+ * MUST BE KEPT IN SYNC WITH xenolinux<*>/arch/xeno/drivers/dom0/dom0_ops.h
*/
+
#ifndef __DOM0_OPS_H__
#define __DOM0_OPS_H__
@@ -13,6 +16,8 @@
#define DOM0_KILLDOMAIN 1
#define DOM0_GETMEMLIST 2
#define DOM0_STARTDOM 4
+#define DOM0_BVTCTL 6
+#define DOM0_ADJUSTDOM 7
#define MAX_CMD_LEN 256
@@ -48,6 +53,20 @@ typedef struct domain_launch
char cmd_line[MAX_CMD_LEN];
} dom_meminfo_t;
+typedef struct dom0_bvtctl_st
+{
+ unsigned long ctx_allow; /* context switch allowance */
+} dom0_bvtctl_t;
+
+typedef struct dom0_adjustdom_st
+{
+ unsigned int domain; /* domain id */
+ unsigned long mcu_adv; /* mcu advance: inverse of weight */
+ unsigned long warp; /* time warp */
+ unsigned long warpl; /* warp limit */
+ unsigned long warpu; /* unwarp time requirement */
+} dom0_adjustdom_t;
+
typedef struct dom0_op_st
{
unsigned long cmd;
@@ -56,6 +75,8 @@ typedef struct dom0_op_st
dom0_newdomain_t newdomain;
dom0_killdomain_t killdomain;
dom0_getmemlist_t getmemlist;
+ dom0_bvtctl_t bvtctl;
+ dom0_adjustdom_t adjustdom;
dom_meminfo_t meminfo;
}
u;
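
On the Xen side these requests are dispatched by do_dom0_op() (see xen/common/dom0_ops.c above); on the XenoLinux side they are packed into a dom0_op_t and passed down with HYPERVISOR_dom0_op(), as sched_ops.c does further below. A hypothetical direct caller, sketched here only to show how the two new union members are filled in (the HYPERVISOR_dom0_op prototype is assumed):

/* Sketch, not part of the patch: fill in the new scheduler ops directly. */
#include "dom0_ops.h"

extern int HYPERVISOR_dom0_op(dom0_op_t *op);   /* hypercall wrapper (assumed) */

static int set_ctx_allow(unsigned long allow)
{
    dom0_op_t op;
    op.cmd = DOM0_BVTCTL;
    op.u.bvtctl.ctx_allow = allow;      /* context switch allowance */
    return HYPERVISOR_dom0_op(&op);
}

static int set_dom_weight(unsigned int dom, unsigned long mcu_adv)
{
    dom0_op_t op;
    op.cmd = DOM0_ADJUSTDOM;
    op.u.adjustdom.domain  = dom;
    op.u.adjustdom.mcu_adv = mcu_adv;   /* inverse of the domain's weight */
    op.u.adjustdom.warp    = 0;         /* warping is not used by this patch */
    op.u.adjustdom.warpl   = 0;
    op.u.adjustdom.warpu   = 0;
    return HYPERVISOR_dom0_op(&op);
}
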
diff --git a/xen/include/xeno/perfc_defn.h b/xen/include/xeno/perfc_defn.h
index 41dd48f73b..f006079085 100644
--- a/xen/include/xeno/perfc_defn.h
+++ b/xen/include/xeno/perfc_defn.h
@@ -1,10 +1,13 @@
-
PERFCOUNTER_CPU( irqs, "#interrupts" )
PERFCOUNTER_CPU( irq_time, "cycles spent in irq handler" )
-PERFCOUNTER( blockio_tx, "block io: messages received from tx queue" )
-PERFCOUNTER( blockio_rx, "block io: messages sent on rx queue" )
+PERFCOUNTER( blockio_tx, "block io: messages received from tx queue" )
+PERFCOUNTER( blockio_rx, "block io: messages sent on rx queue" )
-PERFCOUNTER_CPU( apic_timer, "apic timer interrupts" )
-PERFCOUNTER_CPU( ac_timer_max, "ac_timer max error" )
+PERFCOUNTER_CPU( apic_timer, "apic timer interrupts" )
+PERFCOUNTER_CPU( ac_timer_max, "ac_timer max error (ns)" )
+PERFCOUNTER_CPU( sched_irq, "sched: timer" )
+PERFCOUNTER_CPU( sched_run1, "sched: calls to schedule" )
+PERFCOUNTER_CPU( sched_run2, "sched: runs through scheduler" )
+PERFCOUNTER_CPU( sched_ctx, "sched: context switches" )
diff --git a/xen/include/xeno/sched.h b/xen/include/xeno/sched.h
index 6d1842a2ea..dbbf6a927e 100644
--- a/xen/include/xeno/sched.h
+++ b/xen/include/xeno/sched.h
@@ -12,6 +12,10 @@
#include <hypervisor-ifs/hypervisor-if.h>
#include <xeno/dom0_ops.h>
+#include <xeno/list.h>
+#include <xeno/time.h>
+#include <xeno/ac_timer.h>
+
extern unsigned long volatile jiffies;
extern rwlock_t tasklist_lock;
@@ -59,18 +63,48 @@ extern struct mm_struct init_mm;
struct task_struct {
- int processor;
- int state;
- int hyp_events;
- unsigned int domain;
+ /*
+ * DO NOT CHANGE THE ORDER OF THE FOLLOWING.
+ * Their offsets are hardcoded in entry.S
+ */
+
+ int processor; /* 00: current processor */
+ int state; /* 04: current run state */
+ int hyp_events; /* 08: pending events */
+ unsigned int domain; /* 12: domain id */
/* An unsafe pointer into a shared data area. */
- shared_info_t *shared_info;
+ shared_info_t *shared_info; /* 16: shared data area */
+
+ /*
+ * From here on things can be added and shuffled without special attention
+ */
struct list_head pg_head;
unsigned int tot_pages; /* number of pages currently possesed */
unsigned int max_pages; /* max number of pages that can be possesed */
+ /* scheduling */
+ struct list_head run_list; /* the run list */
+ int has_cpu;
+ int policy;
+ int counter;
+
+ struct ac_timer blt; /* blocked timeout */
+
+ s_time_t lastschd; /* time this domain was last scheduled */
+ s_time_t cpu_time; /* total CPU time received till now */
+
+ unsigned long mcu_advance; /* inverse of weight */
+ s32 avt; /* actual virtual time */
+ s32 evt; /* effective virtual time */
+ long warp; /* virtual time warp */
+ long warpl; /* warp limit */
+ long warpu; /* unwarp time requirement */
+ long warped; /* time it ran warped last time */
+ long uwarped; /* time it ran unwarped last time */
+
+
/* Network I/O */
net_ring_t *net_ring_base;
net_vif_t *net_vif_list[MAX_GUEST_VIFS];
@@ -85,10 +119,7 @@ struct task_struct {
segment_t *segment_list[XEN_MAX_SEGMENTS]; /* vhd */
int segment_count;
- int has_cpu, policy, counter;
-
- struct list_head run_list;
-
+ /* VM */
struct mm_struct mm;
/* We need this lock to check page types and frob reference counts. */
spinlock_t page_lock;
@@ -127,7 +158,7 @@ struct task_struct {
#define TASK_RUNNING 0
#define TASK_INTERRUPTIBLE 1
#define TASK_UNINTERRUPTIBLE 2
-#define TASK_WAIT 4
+#define TASK_WAIT 4
#define TASK_DYING 16
/* #define TASK_STOPPED 8 not really used */
@@ -141,6 +172,8 @@ struct task_struct {
domain: IDLE_DOMAIN_ID, \
state: TASK_RUNNING, \
has_cpu: 0, \
+ evt: 0xffffffff, \
+ avt: 0xffffffff, \
mm: IDLE0_MM, \
addr_limit: KERNEL_DS, \
active_mm: &idle0_task.mm, \
@@ -153,7 +186,7 @@ struct task_struct {
#define is_idle_task(_p) ((_p)->domain == IDLE_DOMAIN_ID)
#ifndef IDLE0_TASK_SIZE
-#define IDLE0_TASK_SIZE 2048*sizeof(long)
+#define IDLE0_TASK_SIZE 2048*sizeof(long)
#endif
union task_union {
@@ -202,6 +235,9 @@ void scheduler_init(void);
void schedulers_start(void);
void sched_add_domain(struct task_struct *p);
void sched_rem_domain(struct task_struct *p);
+long sched_bvtctl(unsigned long ctx_allow);
+long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp,
+ unsigned long warpl, unsigned long warpu);
int wake_up(struct task_struct *p);
long schedule_timeout(long timeout);
long do_yield(void);
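
The warp, warpl, warpu, warped and uwarped fields are placeholders in this changeset: sched_add_domain() and wake_up() both carry "RN: XXX BVT deal with warping here" and evt is simply set equal to avt. For reference, in Duda & Cheriton's BVT a warping (latency-sensitive) domain is scheduled on an effective virtual time that lags its actual virtual time by its warp value, roughly as in this hypothetical helper:

/* Sketch of the BVT warping rule (not implemented by this patch):
 * while a domain is warping, its effective virtual time is its actual
 * virtual time minus its warp, subject to the warpl/warpu time limits. */
static inline s32 calc_evt(struct task_struct *p, int warping)
{
    return warping ? (s32)(p->avt - p->warp) : (s32)p->avt;
}
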
diff --git a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/Makefile b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/Makefile
index 4738fc0ba4..eeb3413842 100644
--- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/Makefile
+++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/Makefile
@@ -1,3 +1,3 @@
O_TARGET := dom0.o
-obj-y := dom0_memory.o dom0_core.o vfr.o
+obj-y := dom0_memory.o dom0_core.o vfr.o sched_ops.o
include $(TOPDIR)/Rules.make
diff --git a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h
index 6c60a93ff6..22ebd7aba0 100644
--- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h
+++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h
@@ -4,15 +4,20 @@
* Process command requests from domain-0 guest OS.
*
* Copyright (c) 2002, K A Fraser, B Dragovic
+ *
+ * MUST BE KEPT IN SYNC WITH xen/include/xeno/dom0_ops.h
+ * MUST BE KEPT IN SYNC WITH tools/domain_builder/dom0_ops.h
*/
#define DOM0_NEWDOMAIN 0
#define DOM0_KILLDOMAIN 1
#define DOM0_GETMEMLIST 2
#define DOM0_STARTDOM 4
-#define MAP_DOM_MEM 6 /* Not passed down to Xen */
-#define DO_PGUPDATES 7 /* Not passed down to Xen */
-#define MAX_CMD 8
+#define DOM0_BVTCTL 6
+#define DOM0_ADJUSTDOM 7
+#define MAP_DOM_MEM 8 /* Not passed down to Xen */
+#define DO_PGUPDATES 9 /* Not passed down to Xen */
+#define MAX_CMD 10
#define MAX_CMD_LEN 256
@@ -20,8 +25,8 @@ typedef struct dom0_newdomain_st
{
unsigned int domain;
unsigned int memory_kb;
- unsigned int num_vifs; // temporary
- unsigned long pg_head; // return parameter
+ unsigned int num_vifs; /* temporary */
+ unsigned long pg_head; /* return parameter */
} dom0_newdomain_t;
typedef struct dom0_killdomain_st
@@ -37,6 +42,20 @@ typedef struct dom0_getmemlist_st
void *buffer;
} dom0_getmemlist_t;
+typedef struct dom0_bvtctl_st
+{
+ unsigned long ctx_allow; /* context switch allowance */
+} dom0_bvtctl_t;
+
+typedef struct dom0_adjustdom_st
+{
+ unsigned int domain; /* domain id */
+ unsigned long mcu_adv; /* mcu advance: inverse of weight */
+ unsigned long warp; /* time warp */
+ unsigned long warpl; /* warp limit */
+ unsigned long warpu; /* unwarp time requirement */
+} dom0_adjustdom_t;
+
/* This is entirely processed by XenoLinux */
typedef struct dom_mem
{
@@ -64,6 +83,8 @@ typedef struct domain_launch
char cmd_line[MAX_CMD_LEN];
} dom_meminfo_t;
+
+
typedef struct dom0_op_st
{
unsigned long cmd;
@@ -72,6 +93,8 @@ typedef struct dom0_op_st
dom0_newdomain_t newdomain;
dom0_killdomain_t killdomain;
dom0_getmemlist_t getmemlist;
+ dom0_bvtctl_t bvtctl;
+ dom0_adjustdom_t adjustdom;
dom_mem_t dommem;
dom_pgupdate_t pgupdate;
dom_meminfo_t meminfo;
diff --git a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/sched_ops.c b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/sched_ops.c
new file mode 100644
index 0000000000..9c5fce7857
--- /dev/null
+++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/sched_ops.c
@@ -0,0 +1,137 @@
+/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
+ ****************************************************************************
+ * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
+ ****************************************************************************
+ *
+ * File: sched_ops.c
+ * Author: Rolf Neugebauer (neugebar@dcs.gla.ac.uk)
+ * Changes:
+ *
+ * Date: Mar 2003
+ *
+ * Environment: XenoLinux
+ * Description: Dom0 Control interface to scheduler in Xen
+ *
+ * code based on Andy's vfr parsing code
+ *
+ * Commands understood by the interface:
+ *
+ * C <context switch allowance>
+ * S <did> <mcu advance> <warp> <warp limit> <unwarp limit>
+ *
+ ****************************************************************************
+ * $Id: c-insert.c,v 1.7 2002/11/08 16:04:34 rn Exp $
+ ****************************************************************************
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+
+#include "dom0_ops.h"
+
+#define SCHED_ENTRY "sched"
+extern struct proc_dir_entry *xeno_base;
+static struct proc_dir_entry *sched_pde;
+
+static unsigned char readbuf[1024];
+
+static int sched_read_proc(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ strcpy(page, readbuf);
+ *readbuf = '\0';
+ *eof = 1;
+ *start = page;
+ return strlen(page);
+}
+
+
+static int sched_write_proc(struct file *file, const char *buffer,
+ u_long count, void *data)
+{
+ dom0_op_t op;
+
+ int ret, len;
+ int ts, te, tl; /* token start, end, and length */
+
+ /* Only admin can adjust scheduling parameters */
+ if ( !capable(CAP_SYS_ADMIN) )
+ return -EPERM;
+
+ /* parse the commands */
+ len = count;
+ ts = te = 0;
+
+ while ( count && isspace(buffer[ts]) ) { ts++; count--; } /*skip spaces*/
+ te = ts;
+ while ( count && !isspace(buffer[te]) ) { te++; count--; } /*command end*/
+ if ( te <= ts ) goto bad;
+ tl = te - ts;
+
+ if ( strncmp(&buffer[ts], "C", tl) == 0 ) {
+ op.cmd = DOM0_BVTCTL;
+ } else if ( strncmp(&buffer[ts], "S", tl) == 0 ) {
+ op.cmd = DOM0_ADJUSTDOM;
+ } else
+ goto bad;
+
+ /* skip whitespace and get first parameter */
+ ts = te; while ( count && isspace(buffer[ts]) ) { ts++; count--; }
+ te = ts; while ( count && !isspace(buffer[te]) ) { te++; count--; }
+ if ( te <= ts ) goto bad;
+ tl = te - ts;
+ if ( !isdigit(buffer[ts]) ) goto bad;
+
+ if (op.cmd == DOM0_BVTCTL) {
+ /* get context switch allowance */
+ sscanf(&buffer[ts], "%lu", &op.u.bvtctl.ctx_allow);
+ } else if (op.cmd == DOM0_ADJUSTDOM) {
+ sscanf(&buffer[ts], "%u %lu %lu %lu %lu",
+ &op.u.adjustdom.domain,
+ &op.u.adjustdom.mcu_adv,
+ &op.u.adjustdom.warp,
+ &op.u.adjustdom.warpl,
+ &op.u.adjustdom.warpu);
+ }
+ ret = HYPERVISOR_dom0_op(&op);
+ return sizeof(op);
+
+ bad:
+ return -EINVAL;
+
+}
+
+
+/*
+ * main scheduler interface driver initialization function.
+ */
+static int __init init_module(void)
+{
+ printk(KERN_ALERT "Starting Domain Scheduler Control Interface\n");
+
+ sched_pde = create_proc_entry(SCHED_ENTRY, 0600, xeno_base);
+ if ( sched_pde == NULL )
+ {
+ printk(KERN_ALERT "Unable to create dom scheduler proc entry!");
+ return -1;
+ }
+
+ sched_pde->read_proc = sched_read_proc;
+ sched_pde->write_proc = sched_write_proc;
+
+ return 0;
+}
+
+static void __exit cleanup_module(void)
+{
+}
+
+module_init(init_module);
+module_exit(cleanup_module);
+
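
With the module loaded, scheduling parameters can be changed from dom0 user space by writing the "C" and "S" commands described in the header comment to the new proc entry. A hypothetical user-space snippet (the proc path /proc/xeno/sched is assumed from xeno_base plus SCHED_ENTRY):

/* Sketch only: set the context switch allowance and adjust domain 1's
 * mcu_advance via the proc interface added above. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/proc/xeno/sched", O_WRONLY);   /* path assumed */
    if (fd < 0) { perror("open"); return 1; }

    const char *bvtctl = "C 10000000";   /* value passed straight to sched_bvtctl() */
    const char *adjdom = "S 1 10 0 0 0"; /* dom 1: mcu_adv=10, no warping */

    if (write(fd, bvtctl, strlen(bvtctl)) < 0) perror("write bvtctl");
    if (write(fd, adjdom, strlen(adjdom)) < 0) perror("write adjdom");

    close(fd);
    return 0;
}
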