Diffstat (limited to 'xen/common/schedule.c')
-rw-r--r--  xen/common/schedule.c  556
1 file changed, 398 insertions(+), 158 deletions(-)
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 787b43d900..ce46069167 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -11,7 +11,8 @@
  *
  *        Environment: Xen Hypervisor
  *        Description: CPU scheduling
- *                     partially moved from domain.c
+ *                     implements a Borrowed Virtual Time (BVT) scheduler
+ *                     (see Duda & Cheriton, SOSP'99)
  *
  ****************************************************************************
  * $Id: c-insert.c,v 1.7 2002/11/08 16:04:34 rn Exp $
@@ -28,6 +29,9 @@
 #include <xeno/ac_timer.h>
 #include <xeno/interrupt.h>

+#include <xeno/perfc.h>
+
+
 #undef SCHEDULER_TRACE
 #ifdef SCHEDULER_TRACE
 #define TRC(_x) _x
@@ -35,80 +39,106 @@
 #define TRC(_x)
 #endif

-/*
+
+#define MCU (s32)MICROSECS(100)              /* minimum unit of CPU time */
+static s32 ctx_allow = (s32)MILLISECS(10);   /* context switch allowance */
+
+/*****************************************************************************
  * per CPU data for the scheduler.
- */
+ *****************************************************************************/
 typedef struct schedule_data_st
 {
-    spinlock_t lock;
-    struct list_head runqueue;
-    struct task_struct *prev, *curr;
+    spinlock_t          lock;          /* lock protecting this structure */
+    struct list_head    runqueue;      /* runqueue, ordered by EVT */
+    struct task_struct *prev, *curr;   /* previous and current task */
+    struct task_struct *idle;          /* idle task for this cpu */
+    u32                 svt;           /* system virtual time. per CPU??? */
+    struct ac_timer     s_timer;       /* scheduling timer */
+
 } __cacheline_aligned schedule_data_t;
 schedule_data_t schedule_data[NR_CPUS];

-static __cacheline_aligned struct ac_timer s_timer[NR_CPUS];
+struct ac_timer v_timer;               /* virtual time timer */
+static void virt_timer(unsigned long foo);

-/*
- * Some convenience functions
- */
-static inline void __add_to_runqueue(struct task_struct * p)
+/*****************************************************************************
+ * Some convenience functions
+ *****************************************************************************/
+/* add a task to the head of the runqueue */
+static inline void __add_to_runqueue_head(struct task_struct * p)
 {
+    list_add(&p->run_list, &schedule_data[p->processor].runqueue);
 }
-
-static inline void __move_last_runqueue(struct task_struct * p)
+/* add a task to the tail of the runqueue */
+static inline void __add_to_runqueue_tail(struct task_struct * p)
 {
-    list_del(&p->run_list);
     list_add_tail(&p->run_list, &schedule_data[p->processor].runqueue);
 }

-static inline void __move_first_runqueue(struct task_struct * p)
-{
-    list_del(&p->run_list);
-    list_add(&p->run_list, &schedule_data[p->processor].runqueue);
-}
-
+/* remove a task from the runqueue */
 static inline void __del_from_runqueue(struct task_struct * p)
 {
     list_del(&p->run_list);
     p->run_list.next = NULL;
 }
-
+/* is task on run queue? */
 static inline int __task_on_runqueue(struct task_struct *p)
 {
     return (p->run_list.next != NULL);
 }

+#define next_domain(p) \
+        list_entry((p)->run_list.next, struct task_struct, run_list)
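
The runqueue is an intrusive doubly-linked list: each task_struct embeds its own run_list node, and the new next_domain() macro maps a node back to its enclosing task via list_entry() (the container_of pattern). A minimal standalone sketch of the same pattern, with hypothetical struct names and no Xen headers assumed:

    #include <stddef.h>

    struct list_head { struct list_head *next, *prev; };

    struct task {
        int domain;
        struct list_head run_list;            /* embedded queue node */
    };

    /* recover the enclosing struct from a pointer to its member */
    #define list_entry(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    /* equivalent of next_domain(p): the task queued after p */
    #define next_domain(p) \
        list_entry((p)->run_list.next, struct task, run_list)

Because the node is embedded, enqueue and dequeue never allocate memory, which is why __del_from_runqueue() can simply poison run_list.next to mark "not queued".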

-/*
- * Add a new domain to the scheduler
- */
+/******************************************************************************
+ * Add and remove a domain
+ ******************************************************************************/
 void sched_add_domain(struct task_struct *p)
 {
-    p->state = TASK_UNINTERRUPTIBLE;
+    p->state       = TASK_UNINTERRUPTIBLE;
+    p->mcu_advance = 10;
+
+    if (p->domain == IDLE_DOMAIN_ID) {
+        p->avt = 0xffffffff;
+        p->evt = 0xffffffff;
+        schedule_data[p->processor].idle = p;
+    } else {
+        /* set avt and evt to system virtual time */
+        p->avt = schedule_data[p->processor].svt;
+        p->evt = schedule_data[p->processor].svt;
+        /* RN: XXX BVT fill in other bits */
+    }
 }

-/*
- * Remove domain to the scheduler
- */
 void sched_rem_domain(struct task_struct *p)
 {
     p->state = TASK_DYING;
 }

-/*
+/****************************************************************************
  * wake up a domain which had been sleeping
- */
+ ****************************************************************************/
 int wake_up(struct task_struct *p)
 {
     unsigned long flags;
     int ret = 0;
+
     spin_lock_irqsave(&schedule_data[p->processor].lock, flags);
+
     if ( __task_on_runqueue(p) ) goto out;
+
     p->state = TASK_RUNNING;
-    __add_to_runqueue(p);
+    __add_to_runqueue_head(p);
+
+    /* set the BVT parameters */
+    if (p->avt < schedule_data[p->processor].svt)
+        p->avt = schedule_data[p->processor].svt;
+
+    p->evt = p->avt;       /* RN: XXX BVT deal with warping here */
+
     ret = 1;
 out:
@@ -116,75 +146,57 @@ int wake_up(struct task_struct *p)
     return ret;
 }

-static void process_timeout(unsigned long __data)
+/****************************************************************************
+ * Domain-requested scheduling operations
+ ****************************************************************************/
+long do_sched_op(void)
 {
-    struct task_struct * p = (struct task_struct *) __data;
-    wake_up(p);
+    /* XXX implement properly */
+    current->state = TASK_INTERRUPTIBLE;
+    schedule();
+    return 0;
 }
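
wake_up() is where the first BVT invariant appears: a woken domain's actual virtual time (avt) is clamped up to the per-CPU system virtual time (svt). Without the clamp, a domain that slept for a long time would wake with a very low avt and monopolise the CPU until it caught up. Warping is not implemented yet (hence the RN: XXX notes), so the effective virtual time (evt) simply tracks avt. The rule in isolation, as a sketch with simplified types rather than the patch's own code:

    typedef unsigned int u32;

    struct bvt_state { u32 avt, evt; };

    /* on wake-up: forget virtual time "owed" while sleeping */
    static void bvt_wake(struct bvt_state *d, u32 svt)
    {
        if (d->avt < svt)
            d->avt = svt;     /* clamp to system virtual time */
        d->evt = d->avt;      /* evt = avt - warp, once warping exists */
    }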

-long schedule_timeout(long timeout)
+/****************************************************************************
+ * Control the scheduler
+ ****************************************************************************/
+long sched_bvtctl(unsigned long c_allow)
 {
-    struct timer_list timer;
-    unsigned long expire;
-
-    switch (timeout)
-    {
-    case MAX_SCHEDULE_TIMEOUT:
-        /*
-         * These two special cases are useful to be comfortable in the caller.
-         * Nothing more. We could take MAX_SCHEDULE_TIMEOUT from one of the
-         * negative value but I' d like to return a valid offset (>=0) to allow
-         * the caller to do everything it want with the retval.
-         */
-        schedule();
-        goto out;
-    default:
-        /*
-         * Another bit of PARANOID. Note that the retval will be 0 since no
-         * piece of kernel is supposed to do a check for a negative retval of
-         * schedule_timeout() (since it should never happens anyway). You just
-         * have the printk() that will tell you if something is gone wrong and
-         * where.
-         */
-        if (timeout < 0)
-        {
-            printk(KERN_ERR "schedule_timeout: wrong timeout "
-                   "value %lx from %p\n", timeout,
-                   __builtin_return_address(0));
-            current->state = TASK_RUNNING;
-            goto out;
-        }
-    }
-
-    expire = timeout + jiffies;
-
-    init_timer(&timer);
-    timer.expires = expire;
-    timer.data = (unsigned long) current;
-    timer.function = process_timeout;
-
-    add_timer(&timer);
-    schedule();
-    del_timer_sync(&timer);
-
-    timeout = expire - jiffies;
-
- out:
-    return timeout < 0 ? 0 : timeout;
+    printk("sched: bvtctl %lu\n", c_allow);
+    ctx_allow = c_allow;
+    return 0;
 }

-/* RN: XXX turn this into do_halt() */
-/*
- * yield the current process
- */
-long do_sched_op(void)
+/****************************************************************************
+ * Adjust scheduling parameters for a given domain
+ ****************************************************************************/
+long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp,
+                  unsigned long warpl, unsigned long warpu)
 {
-    current->state = TASK_INTERRUPTIBLE;
-    schedule();
+    struct task_struct *p;
+
+    printk("sched: adjdom %02d %lu %lu %lu %lu\n",
+           dom, mcu_adv, warp, warpl, warpu);
+
+    p = find_domain_by_id(dom);
+    if ( p == NULL ) return -ESRCH;
+
+    spin_lock_irq(&schedule_data[p->processor].lock);
+
+    p->mcu_advance = mcu_adv;
+
+    spin_unlock_irq(&schedule_data[p->processor].lock);
+
     return 0;
 }

-
+/****************************************************************************
+ * cause a run through the scheduler when appropriate
+ * Appropriate is:
+ * - the current task is the idle task
+ * - the new process's evt is lower than the current one's
+ * - the current task has already run for its context switch allowance
+ ****************************************************************************/
 void reschedule(struct task_struct *p)
 {
     int cpu = p->processor;
@@ -192,16 +204,20 @@ void reschedule(struct task_struct *p)
     unsigned long flags;

     if (p->has_cpu)
-        return;
+        return;

     spin_lock_irqsave(&schedule_data[cpu].lock, flags);
     curr = schedule_data[cpu].curr;
-    if (is_idle_task(curr)) {
+
+    if ( is_idle_task(curr) ||
+         (p->evt < curr->evt) ||
+         (curr->lastschd + ctx_allow >= NOW()) ) {
+        /* reschedule */
         set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
         spin_unlock_irqrestore(&schedule_data[cpu].lock, flags);
 #ifdef CONFIG_SMP
         if (cpu != smp_processor_id())
-            smp_send_event_check_cpu(cpu);
+            smp_send_event_check_cpu(cpu);
 #endif
     } else {
         spin_unlock_irqrestore(&schedule_data[cpu].lock, flags);
@@ -209,47 +225,154 @@ void reschedule(struct task_struct *p)
 }
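
reschedule()'s banner lists three trigger conditions, which map onto the disjunction in the if. A standalone predicate makes the mapping explicit; note that the third test as committed uses >=, which fires while the current task is still inside its allowance, whereas a literal reading of the comment ("already ran for its allowance") would be lastschd + ctx_allow <= now:

    typedef long long s_time_t;           /* signed ns, like Xen's s_time_t */

    static int needs_resched(int curr_is_idle,
                             unsigned int new_evt, unsigned int curr_evt,
                             s_time_t lastschd, s_time_t ctx_allow,
                             s_time_t now)
    {
        return curr_is_idle                    /* idle task is running */
            || (new_evt < curr_evt)            /* newcomer has lower evt */
            || (lastschd + ctx_allow >= now);  /* allowance test, as committed */
    }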


-/*
- * Pick the next domain to run
- */
-
+/****************************************************************************
+ * The main function
+ * - deschedule the current domain.
+ * - pick a new domain.
+ *   i.e., the domain with the lowest EVT.
+ *   The runqueue should be ordered by EVT so that is easy.
+ ****************************************************************************/
 asmlinkage void schedule(void)
 {
-    struct task_struct *prev, *next, *p;
-    struct list_head *tmp;
-    int this_cpu;
-
+    struct task_struct *prev, *next, *next_prime, *p;
+    struct list_head   *tmp;
+    int                 this_cpu;
+    s_time_t            now;
+    s32                 r_time;    /* time for new dom to run */
+    s32                 ranfor;    /* assume we never run longer than 2.1s! */
+    s32                 mcus;
+    u32                 next_evt, next_prime_evt, min_avt;
+
+    perfc_incrc(sched_run1);
+
  need_resched_back:
+    perfc_incrc(sched_run2);
+
+    now = NOW();
+    next = NULL;
     prev = current;
     this_cpu = prev->processor;

+    /* remove timer */
+    rem_ac_timer(&schedule_data[this_cpu].s_timer);
+
+    /*
+     * deschedule the current domain
+     */
+
     spin_lock_irq(&schedule_data[this_cpu].lock);

     ASSERT(!in_interrupt());
     ASSERT(__task_on_runqueue(prev));

-    __move_last_runqueue(prev);
+    if (is_idle_task(prev))
+        goto deschedule_done;

-    switch ( prev->state )
-    {
+    /* do some accounting */
+    ranfor = (s32)(now - prev->lastschd);
+    ASSERT((ranfor > 0));
+    prev->cpu_time += ranfor;
+
+    /* calculate mcus used and update avt */
+    mcus = ranfor / MCU;
+    if (ranfor % MCU) mcus++;           /* always round up */
+    prev->avt += mcus * prev->mcu_advance;
+    prev->evt = prev->avt;              /* RN: XXX BVT deal with warping here */
+
+    /* dequeue */
+    __del_from_runqueue(prev);
+
+    switch (prev->state) {
     case TASK_INTERRUPTIBLE:
-        if ( signal_pending(prev) )
-        {
-            prev->state = TASK_RUNNING;
+        if (signal_pending(prev)) {
+            prev->state = TASK_RUNNING; /* but has events pending */
             break;
         }
     case TASK_UNINTERRUPTIBLE:
+    case TASK_WAIT:
+    case TASK_DYING:
     default:
-        __del_from_runqueue(prev);
+        /* done if not running; otherwise fall through and requeue */
+        goto deschedule_done;
     case TASK_RUNNING:;
     }
+
+    /* requeue */
+    __add_to_runqueue_tail(prev);
+
+
+ deschedule_done:
     clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events);

-    next = NULL;
-    list_for_each(tmp, &schedule_data[smp_processor_id()].runqueue) {
+    /*
+     * Pick a new domain
+     */
+
+    /* we should at least have the idle task */
+    ASSERT(!list_empty(&schedule_data[this_cpu].runqueue));
+
+    /*
+     * scan through the run queue and pick the task with the lowest evt
+     * *and* the task with the second lowest evt.
+     * this code is O(n) but we expect n to be small.
+     */
+    next       = schedule_data[this_cpu].idle;
+    next_prime = NULL;
+
+    next_evt       = 0xffffffff;
+    next_prime_evt = 0xffffffff;
+    min_avt        = 0xffffffff;        /* to calculate svt */
+
+
+    list_for_each(tmp, &schedule_data[this_cpu].runqueue) {
         p = list_entry(tmp, struct task_struct, run_list);
-        next = p;
-        if ( !is_idle_task(next) ) break;
+        if (p->evt < next_evt) {
+            next_prime     = next;
+            next_prime_evt = next_evt;
+            next           = p;
+            next_evt       = p->evt;
+        } else if (next_prime_evt == 0xffffffff) {
+            next_prime_evt = p->evt;
+            next_prime     = p;
+        } else if (p->evt < next_prime_evt) {
+            next_prime_evt = p->evt;
+            next_prime     = p;
+        }
+        /* determine system virtual time */
+        if (p->avt < min_avt)
+            min_avt = p->avt;
     }
+    ASSERT(next != NULL);   /* we should have at least the idle task */
+
+    /* update system virtual time */
+    if (min_avt != 0xffffffff) schedule_data[this_cpu].svt = min_avt;
+
+    if (is_idle_task(next)) {
+        r_time = ctx_allow;
+        goto sched_done;
+    }
+
+    if (next_prime == NULL || is_idle_task(next_prime)) {
+        /* we have only one runnable task besides the idle task */
+        r_time = 10 * ctx_allow;        /* RN: random constant */
+        goto sched_done;
+    }
+
+    /*
+     * if we are here we have two runnable tasks.
+     * work out how long 'next' can run till its evt is greater than
+     * 'next_prime's evt, taking the context switch allowance into account.
+     */
+    ASSERT(next_prime->evt > next->evt);
+    r_time = ((next_prime->evt - next->evt)/next->mcu_advance) + ctx_allow;
+
+ sched_done:
+    ASSERT(r_time != 0);
+    ASSERT(r_time >= ctx_allow);
+
+    if ( (r_time == 0) || (r_time < ctx_allow) ) {
+        printk("[%02d]: %x\n", this_cpu, r_time);
+        dump_rqueue(&schedule_data[this_cpu].runqueue, "foo");
+    }
+
     prev->has_cpu = 0;
     next->has_cpu = 1;
@@ -257,6 +380,17 @@ asmlinkage void schedule(void)
     schedule_data[this_cpu].prev = prev;
     schedule_data[this_cpu].curr = next;

+    next->lastschd = now;
+
+    /* reprogram the timer */
+ timer_redo:
+    schedule_data[this_cpu].s_timer.expires = now + r_time;
+    if (add_ac_timer(&schedule_data[this_cpu].s_timer) == 1) {
+        printk("SCHED[%02d]: Shit this shouldn't happen\n", this_cpu);
+        now = NOW();
+        goto timer_redo;
+    }
+
     spin_unlock_irq(&schedule_data[this_cpu].lock);

     if ( unlikely(prev == next) )
@@ -266,6 +400,8 @@ asmlinkage void schedule(void)
         goto same_process;
     }

+    perfc_incrc(sched_ctx);
+
     prepare_to_switch();
     switch_to(prev, next);
     prev = schedule_data[this_cpu].prev;
@@ -274,67 +410,56 @@ asmlinkage void schedule(void)

     if ( prev->state == TASK_DYING ) release_task(prev);

  same_process:
+    /* update the domain's notion of time */
     update_dom_time(current->shared_info);

-    if ( test_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events) )
+    if ( test_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events) ) {
         goto need_resched_back;
+    }
     return;
 }
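
Concrete numbers make the accounting above easier to follow. With MCU = 100us and the default mcu_advance = 10, a domain that ran for 1.25ms is charged ceil(1250us/100us) = 13 mcus and its avt advances by 13 * 10 = 130. The slice granted to 'next' is the time it can run before its evt passes the runner-up's, plus the context switch allowance; note that the committed expression appears to add an MCU-denominated count ((evt gap)/mcu_advance) directly to the nanosecond-denominated ctx_allow, so this sketch inserts the MCU conversion explicitly:

    #include <stdio.h>

    typedef unsigned int u32;

    #define MCU       100000LL        /* 100us in ns */
    #define CTX_ALLOW 10000000LL      /* 10ms in ns */

    int main(void)
    {
        /* charging the descheduled domain */
        long long ranfor = 1250000;   /* ran for 1.25ms */
        u32 mcu_advance  = 10;        /* default from sched_add_domain() */
        u32 avt          = 5000;

        long long mcus = ranfor / MCU;
        if (ranfor % MCU) mcus++;                 /* always round up */
        avt += (u32)(mcus * mcu_advance);         /* 5000 + 13*10 = 5130 */

        /* slice for the incoming domain */
        u32 next_evt = 5130, next_prime_evt = 5200;
        long long r_time =
            ((next_prime_evt - next_evt) / mcu_advance) * MCU + CTX_ALLOW;

        printf("avt=%u r_time=%lldns\n", avt, r_time);  /* 5130, 10700000 */
        return 0;
    }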

 /*
- * The scheduling timer.
+ * The scheduler timer.
  */
-static __cacheline_aligned int count[NR_CPUS];
 static void sched_timer(unsigned long foo)
 {
-    int cpu = smp_processor_id();
+    int                 cpu  = smp_processor_id();
     struct task_struct *curr = schedule_data[cpu].curr;
-    s_time_t now;
-    int res;
-
-    /* reschedule after each 5 ticks */
-    if (count[cpu] >= 5) {
-        set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
-        count[cpu] = 0;
-    }
-    count[cpu]++;
+    /* cause a reschedule */
+    set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
+    perfc_incrc(sched_irq);
+}

-    /*
-     * deliver virtual timer interrups to domains if we are CPU 0 XXX RN: We
-     * don't have a per CPU list of domains yet. Otherwise would use that.
-     * Plus, this should be removed anyway once Domains "know" about virtual
-     * time and timeouts. But, it's better here then where it was before.
-     */
-    if (cpu == 0) {
-        struct task_struct *p;
-        unsigned long cpu_mask = 0;
-
-        /* send virtual timer interrupt */
-        read_lock(&tasklist_lock);
-        p = &idle0_task;
-        do {
-            if ( is_idle_task(p) ) continue;
-            cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
-        }
-        while ( (p = p->next_task) != &idle0_task );
-        read_unlock(&tasklist_lock);
-        guest_event_notify(cpu_mask);
+/*
+ * The Domain virtual time timer
+ */
+static void virt_timer(unsigned long foo)
+{
+    unsigned long cpu_mask = 0;
+    struct task_struct *p;
+    s_time_t now;
+    int res;
+
+    /* send virtual timer interrupt */
+    read_lock(&tasklist_lock);
+    p = &idle0_task;
+    do {
+        if ( is_idle_task(p) ) continue;
+        cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
     }
+    while ( (p = p->next_task) != &idle0_task );
+    read_unlock(&tasklist_lock);
+    guest_event_notify(cpu_mask);

-    again:
+ again:
     now = NOW();
-    s_timer[cpu].expires  = now + MILLISECS(10);
-    res=add_ac_timer(&s_timer[cpu]);
-
-    TRC(printk("SCHED[%02d] timer(): now=0x%08X%08X timo=0x%08X%08X\n",
-               cpu, (u32)(now>>32), (u32)now,
-               (u32)(s_timer[cpu].expires>>32), (u32)s_timer[cpu].expires));
+    v_timer.expires = now + MILLISECS(10);
+    res = add_ac_timer(&v_timer);
     if (res==1) goto again;
-
 }

 /*
  * Initialise the data structures
  */
@@ -350,11 +475,15 @@ void __init scheduler_init(void)
         spin_lock_init(&schedule_data[i].lock);
         schedule_data[i].prev = &idle0_task;
         schedule_data[i].curr = &idle0_task;
-
+
         /* a timer for each CPU */
-        init_ac_timer(&s_timer[i]);
-        s_timer[i].function = &sched_timer;
+        init_ac_timer(&schedule_data[i].s_timer);
+        schedule_data[i].s_timer.function = &sched_timer;
+
     }
+    schedule_data[0].idle = &idle0_task;    /* idle on CPU 0 is special */
+
+    init_ac_timer(&v_timer);
+    v_timer.function = &virt_timer;
 }

 /*
@@ -362,10 +491,121 @@ void __init scheduler_init(void)
  * This has to be done *after* the timers, e.g., APICs, have been initialised
  */
 void schedulers_start(void)
-{
+{
     printk("Start schedulers\n");
     __cli();
     sched_timer(0);
+    virt_timer(0);
     smp_call_function((void *)sched_timer, NULL, 1, 1);
     __sti();
 }
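
Both timers re-arm relative to a fresh NOW() rather than to the previous expiry (the timer_redo loop in schedule() and the again loop in virt_timer()), so a late callback drifts instead of piling up, and an expiry that add_ac_timer() rejects as already past (return value 1 here) is simply recomputed. The same pattern in generic form; now_ns() and arm_timer() are hypothetical stand-ins, not the Xen API:

    typedef long long s_time_t;

    extern s_time_t now_ns(void);            /* stand-in for NOW() */
    extern int arm_timer(s_time_t expiry);   /* nonzero if expiry already past */

    #define PERIOD_NS 10000000LL             /* 10ms, as in the patch */

    static void periodic_cb(void)
    {
        /* ... do the periodic work ... */
        for (;;) {
            s_time_t expiry = now_ns() + PERIOD_NS;
            if (arm_timer(expiry) == 0)
                break;        /* armed in the future: done */
            /* lost the race with the clock; retry from a fresh now */
        }
    }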
+
+
+/****************************************************************************
+ * Functions for legacy support.
+ * schedule_timeout() is used in a number of places and is a bit meaningless
+ * in the context of Xen, as domains are not able to call it and all
+ * their entry points into Xen should be asynchronous. If a domain wishes
+ * to block for a while it should use Xen's sched_op entry point.
+ ****************************************************************************/
+
+static void process_timeout(unsigned long __data)
+{
+    struct task_struct * p = (struct task_struct *) __data;
+    wake_up(p);
+}
+
+long schedule_timeout(long timeout)
+{
+    struct timer_list timer;
+    unsigned long expire;
+
+    switch (timeout)
+    {
+    case MAX_SCHEDULE_TIMEOUT:
+        /*
+         * These two special cases are useful to be comfortable in the caller.
+         * Nothing more. We could take MAX_SCHEDULE_TIMEOUT from one of the
+         * negative values, but I'd like to return a valid offset (>=0) to
+         * allow the caller to do everything it wants with the retval.
+         */
+        schedule();
+        goto out;
+    default:
+        /*
+         * Another bit of PARANOID. Note that the retval will be 0 since no
+         * piece of kernel is supposed to do a check for a negative retval of
+         * schedule_timeout() (since it should never happen anyway). You just
+         * have the printk() that will tell you if something has gone wrong
+         * and where.
+         */
+        if (timeout < 0)
+        {
+            printk(KERN_ERR "schedule_timeout: wrong timeout "
+                   "value %lx from %p\n", timeout,
+                   __builtin_return_address(0));
+            current->state = TASK_RUNNING;
+            goto out;
+        }
+    }
+
+    expire = timeout + jiffies;
+
+    init_timer(&timer);
+    timer.expires = expire;
+    timer.data = (unsigned long) current;
+    timer.function = process_timeout;
+
+    add_timer(&timer);
+    schedule();
+    del_timer_sync(&timer);
+
+    timeout = expire - jiffies;
+
+ out:
+    return timeout < 0 ? 0 : timeout;
+}
+
+/****************************************************************************
+ * debug function
+ ****************************************************************************/
+
+static void dump_rqueue(struct list_head *queue, char *name)
+{
+    struct list_head *list;
+    int loop = 0;
+    struct task_struct *p;
+
+    printk("QUEUE %s %lx   n: %lx, p: %lx\n", name, (unsigned long)queue,
+           (unsigned long) queue->next, (unsigned long) queue->prev);
+    list_for_each (list, queue) {
+        p = list_entry(list, struct task_struct, run_list);
+        printk("%3d: %3d has=%c mcua=0x%04X ev=0x%08X av=0x%08X c=0x%X%08X\n",
+               loop++, p->domain,
+               p->has_cpu ? 'T':'F',
+               p->mcu_advance, p->evt, p->avt,
+               (u32)(p->cpu_time>>32), (u32)p->cpu_time);
+        printk("         l: %lx n: %lx  p: %lx\n",
+               (unsigned long)list, (unsigned long)list->next,
+               (unsigned long)list->prev);
+    }
+    return;
+}
+
+void dump_runq(u_char key, void *dev_id, struct pt_regs *regs)
+{
+    u_long   flags;
+    s_time_t now = NOW();
+    int i;
+
+    printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns NOW=0x%08X%08X\n",
+           (u32)MCU, (u32)ctx_allow, (u32)(now>>32), (u32)now);
+    for (i = 0; i < smp_num_cpus; i++) {
+        spin_lock_irqsave(&schedule_data[i].lock, flags);
+        printk("CPU[%02d] svt=0x%08X ", i, (s32)schedule_data[i].svt);
+        dump_rqueue(&schedule_data[i].runqueue, "rq");
+        spin_unlock_irqrestore(&schedule_data[i].lock, flags);
+    }
+    return;
+}
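
For reference, the formats fixed by the printk() strings above produce output along these lines (all values hypothetical): one summary line, then per CPU a svt line followed by two lines per queued task:

    BVT: mcu=0x000186A0ns ctx_allow=0x00989680ns NOW=0x0000000BF3A1C200
    CPU[00] svt=0x00001412 QUEUE rq fc52b000   n: fc52c044, p: fc52d044
      0:   1 has=T mcua=0x000A ev=0x00001412 av=0x00001412 c=0x000BC614E
             l: fc52c044 n: fc52d044  p: fc52b000
      1:   2 has=F mcua=0x000A ev=0x00001430 av=0x00001430 c=0x0001E8480
             l: fc52d044 n: fc52b000  p: fc52c044

mcu and ctx_allow decode to 100,000 ns and 10,000,000 ns respectively; has=T marks the task currently holding the CPU.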