Diffstat (limited to 'xen/common/schedule.c')
-rw-r--r--  xen/common/schedule.c  556
1 file changed, 398 insertions(+), 158 deletions(-)
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 787b43d900..ce46069167 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -11,7 +11,8 @@
  *
  *        Environment: Xen Hypervisor
  *        Description: CPU scheduling
- *                     partially moved from domain.c
+ *                     implements a Borrowed Virtual Time (BVT) scheduler
+ *                     (see Duda & Cheriton, SOSP'99)
  *
  ****************************************************************************
  * $Id: c-insert.c,v 1.7 2002/11/08 16:04:34 rn Exp $
@@ -28,6 +29,9 @@
 #include <xeno/ac_timer.h>
 #include <xeno/interrupt.h>

+#include <xeno/perfc.h>
+
+
 #undef SCHEDULER_TRACE
 #ifdef SCHEDULER_TRACE
 #define TRC(_x) _x
@@ -35,80 +39,106 @@
 #define TRC(_x)
 #endif

-/*
+
+#define MCU (s32)MICROSECS(100)              /* minimum unit of CPU time */
+static s32 ctx_allow = (s32)MILLISECS(10);   /* context switch allowance */
+
+/*****************************************************************************
  * per CPU data for the scheduler.
- */
+ *****************************************************************************/
 typedef struct schedule_data_st
 {
-    spinlock_t lock;
-    struct list_head runqueue;
-    struct task_struct *prev, *curr;
+    spinlock_t          lock;          /* lock protecting this structure */
+    struct list_head    runqueue;      /* runqueue, ordered by EVT */
+    struct task_struct *prev, *curr;   /* previous and current task */
+    struct task_struct *idle;          /* idle task for this cpu */
+    u32                 svt;           /* system virtual time. per CPU??? */
+    struct ac_timer     s_timer;       /* scheduling timer */
+
 } __cacheline_aligned schedule_data_t;
 schedule_data_t schedule_data[NR_CPUS];

-static __cacheline_aligned struct ac_timer s_timer[NR_CPUS];
+struct ac_timer v_timer;               /* virtual time timer */
+static void virt_timer(unsigned long foo);

-/*
- * Some convenience functions
- */
-static inline void __add_to_runqueue(struct task_struct * p)
+/*****************************************************************************
+ * Some convenience functions
+ *****************************************************************************/
+/* add a task to the head of the runqueue */
+static inline void __add_to_runqueue_head(struct task_struct * p)
 {
+    list_add(&p->run_list, &schedule_data[p->processor].runqueue);
 }
-
-static inline void __move_last_runqueue(struct task_struct * p)
+/* add a task to the tail of the runqueue */
+static inline void __add_to_runqueue_tail(struct task_struct * p)
 {
-    list_del(&p->run_list);
     list_add_tail(&p->run_list, &schedule_data[p->processor].runqueue);
 }

-static inline void __move_first_runqueue(struct task_struct * p)
-{
-    list_del(&p->run_list);
-    list_add(&p->run_list, &schedule_data[p->processor].runqueue);
-}
-
+/* remove a task from the runqueue */
 static inline void __del_from_runqueue(struct task_struct * p)
 {
     list_del(&p->run_list);
     p->run_list.next = NULL;
 }
-
+/* is task on run queue? */
 static inline int __task_on_runqueue(struct task_struct *p)
 {
     return (p->run_list.next != NULL);
 }

+#define next_domain(p) \
+        list_entry((p)->run_list.next, struct task_struct, run_list)
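
The runqueue is an intrusive doubly-linked list: each task_struct embeds its own run_list node, and the new next_domain() macro maps a node back to its enclosing task via list_entry() (the container_of pattern). A minimal standalone sketch of the same pattern, with hypothetical struct names and no Xen headers assumed:

    #include <stddef.h>

    struct list_head { struct list_head *next, *prev; };

    struct task {
        int domain;
        struct list_head run_list;            /* embedded queue node */
    };

    /* recover the enclosing struct from a pointer to its member */
    #define list_entry(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    /* equivalent of next_domain(p): the task queued after p */
    #define next_domain(p) \
        list_entry((p)->run_list.next, struct task, run_list)

Because the node is embedded, enqueue and dequeue never allocate memory, which is why __del_from_runqueue() can simply poison run_list.next to mark "not queued".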

-/*
- * Add a new domain to the scheduler
- */
+/******************************************************************************
+ * Add and remove a domain
+ ******************************************************************************/
 void sched_add_domain(struct task_struct *p)
 {
-    p->state = TASK_UNINTERRUPTIBLE;
+    p->state       = TASK_UNINTERRUPTIBLE;
+    p->mcu_advance = 10;
+
+    if (p->domain == IDLE_DOMAIN_ID) {
+        p->avt = 0xffffffff;
+        p->evt = 0xffffffff;
+        schedule_data[p->processor].idle = p;
+    } else {
+        /* set avt and evt to system virtual time */
+        p->avt = schedule_data[p->processor].svt;
+        p->evt = schedule_data[p->processor].svt;
+        /* RN: XXX BVT fill in other bits */
+    }
 }

-/*
- * Remove domain to the scheduler
- */
 void sched_rem_domain(struct task_struct *p)
 {
     p->state = TASK_DYING;
 }

-/*
+/****************************************************************************
  * wake up a domain which had been sleeping
- */
+ ****************************************************************************/
 int wake_up(struct task_struct *p)
 {
     unsigned long flags;
     int ret = 0;
+
     spin_lock_irqsave(&schedule_data[p->processor].lock, flags);
+
     if ( __task_on_runqueue(p) ) goto out;
+
     p->state = TASK_RUNNING;
-    __add_to_runqueue(p);
+    __add_to_runqueue_head(p);
+
+    /* set the BVT parameters */
+    if (p->avt < schedule_data[p->processor].svt)
+        p->avt = schedule_data[p->processor].svt;
+
+    p->evt = p->avt;       /* RN: XXX BVT deal with warping here */
+
     ret = 1;
 out:
@@ -116,75 +146,57 @@ int wake_up(struct task_struct *p)
     return ret;
 }

-static void process_timeout(unsigned long __data)
+/****************************************************************************
+ * Domain-requested scheduling operations
+ ****************************************************************************/
+long do_sched_op(void)
 {
-    struct task_struct * p = (struct task_struct *) __data;
-    wake_up(p);
+    /* XXX implement properly */
+    current->state = TASK_INTERRUPTIBLE;
+    schedule();
+    return 0;
 }
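
wake_up() is where the first BVT invariant appears: a woken domain's actual virtual time (avt) is clamped up to the per-CPU system virtual time (svt). Without the clamp, a domain that slept for a long time would wake with a very low avt and monopolise the CPU until it caught up. Warping is not implemented yet (hence the RN: XXX notes), so the effective virtual time (evt) simply tracks avt. The rule in isolation, as a sketch with simplified types rather than the patch's own code:

    typedef unsigned int u32;

    struct bvt_state { u32 avt, evt; };

    /* on wake-up: forget virtual time "owed" while sleeping */
    static void bvt_wake(struct bvt_state *d, u32 svt)
    {
        if (d->avt < svt)
            d->avt = svt;     /* clamp to system virtual time */
        d->evt = d->avt;      /* evt = avt - warp, once warping exists */
    }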

-long schedule_timeout(long timeout)
+/****************************************************************************
+ * Control the scheduler
+ ****************************************************************************/
+long sched_bvtctl(unsigned long c_allow)
 {
-    struct timer_list timer;
-    unsigned long expire;
-
-    switch (timeout)
-    {
-    case MAX_SCHEDULE_TIMEOUT:
-        /*
-         * These two special cases are useful to be comfortable in the caller.
-         * Nothing more. We could take MAX_SCHEDULE_TIMEOUT from one of the
-         * negative value but I' d like to return a valid offset (>=0) to allow
-         * the caller to do everything it want with the retval.
-         */
-        schedule();
-        goto out;
-    default:
-        /*
-         * Another bit of PARANOID. Note that the retval will be 0 since no
-         * piece of kernel is supposed to do a check for a negative retval of
-         * schedule_timeout() (since it should never happens anyway). You just
-         * have the printk() that will tell you if something is gone wrong and
-         * where.
-         */
-        if (timeout < 0)
-        {
-            printk(KERN_ERR "schedule_timeout: wrong timeout "
-                   "value %lx from %p\n", timeout,
-                   __builtin_return_address(0));
-            current->state = TASK_RUNNING;
-            goto out;
-        }
-    }
-
-    expire = timeout + jiffies;
-
-    init_timer(&timer);
-    timer.expires = expire;
-    timer.data = (unsigned long) current;
-    timer.function = process_timeout;
-
-    add_timer(&timer);
-    schedule();
-    del_timer_sync(&timer);
-
-    timeout = expire - jiffies;
-
- out:
-    return timeout < 0 ? 0 : timeout;
+    printk("sched: bvtctl %lu\n", c_allow);
+    ctx_allow = c_allow;
+    return 0;
 }

-/* RN: XXX turn this into do_halt() */
-/*
- * yield the current process
- */
-long do_sched_op(void)
+/****************************************************************************
+ * Adjust scheduling parameters for a given domain
+ ****************************************************************************/
+long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp,
+                  unsigned long warpl, unsigned long warpu)
 {
-    current->state = TASK_INTERRUPTIBLE;
-    schedule();
+    struct task_struct *p;
+
+    printk("sched: adjdom %02d %lu %lu %lu %lu\n",
+           dom, mcu_adv, warp, warpl, warpu);
+
+    p = find_domain_by_id(dom);
+    if ( p == NULL ) return -ESRCH;
+
+    spin_lock_irq(&schedule_data[p->processor].lock);
+
+    p->mcu_advance = mcu_adv;
+
+    spin_unlock_irq(&schedule_data[p->processor].lock);
+
     return 0;
 }

-
+/****************************************************************************
+ * cause a run through the scheduler when appropriate
+ * Appropriate is:
+ * - the current task is the idle task
+ * - the new process's evt is lower than the current one's
+ * - the current task has already run for its context switch allowance
+ ****************************************************************************/
 void reschedule(struct task_struct *p)
 {
     int cpu = p->processor;
@@ -192,16 +204,20 @@ void reschedule(struct task_struct *p)
     unsigned long flags;

     if (p->has_cpu)
-        return;
+        return;

     spin_lock_irqsave(&schedule_data[cpu].lock, flags);
     curr = schedule_data[cpu].curr;
-    if (is_idle_task(curr)) {
+
+    if ( is_idle_task(curr) ||
+         (p->evt < curr->evt) ||
+         (curr->lastschd + ctx_allow >= NOW()) ) {
+        /* reschedule */
         set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
         spin_unlock_irqrestore(&schedule_data[cpu].lock, flags);
 #ifdef CONFIG_SMP
         if (cpu != smp_processor_id())
-            smp_send_event_check_cpu(cpu);
+            smp_send_event_check_cpu(cpu);
 #endif
     } else {
         spin_unlock_irqrestore(&schedule_data[cpu].lock, flags);
@@ -209,47 +225,154 @@ void reschedule(struct task_struct *p)
 }
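
reschedule()'s banner lists three trigger conditions, which map onto the disjunction in the if. A standalone predicate makes the mapping explicit; note that the third test as committed uses >=, which fires while the current task is still inside its allowance, whereas a literal reading of the comment ("already ran for its allowance") would be lastschd + ctx_allow <= now:

    typedef long long s_time_t;           /* signed ns, like Xen's s_time_t */

    static int needs_resched(int curr_is_idle,
                             unsigned int new_evt, unsigned int curr_evt,
                             s_time_t lastschd, s_time_t ctx_allow,
                             s_time_t now)
    {
        return curr_is_idle                    /* idle task is running */
            || (new_evt < curr_evt)            /* newcomer has lower evt */
            || (lastschd + ctx_allow >= now);  /* allowance test, as committed */
    }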


-/*
- * Pick the next domain to run
- */
-
+/****************************************************************************
+ * The main function
+ * - deschedule the current domain.
+ * - pick a new domain.
+ *   i.e., the domain with the lowest EVT.
+ *   The runqueue should be ordered by EVT so that is easy.
+ ****************************************************************************/
 asmlinkage void schedule(void)
 {
-    struct task_struct *prev, *next, *p;
-    struct list_head *tmp;
-    int this_cpu;
-
+    struct task_struct *prev, *next, *next_prime, *p;
+    struct list_head   *tmp;
+    int                 this_cpu;
+    s_time_t            now;
+    s32                 r_time;    /* time for new dom to run */
+    s32                 ranfor;    /* assume we never run longer than 2.1s! */
+    s32                 mcus;
+    u32                 next_evt, next_prime_evt, min_avt;
+
+    perfc_incrc(sched_run1);
+
  need_resched_back:
+    perfc_incrc(sched_run2);
+
+    now = NOW();
+    next = NULL;
     prev = current;
     this_cpu = prev->processor;

+    /* remove timer */
+    rem_ac_timer(&schedule_data[this_cpu].s_timer);
+
+    /*
+     * deschedule the current domain
+     */
+
     spin_lock_irq(&schedule_data[this_cpu].lock);

     ASSERT(!in_interrupt());
     ASSERT(__task_on_runqueue(prev));

-    __move_last_runqueue(prev);
+    if (is_idle_task(prev))
+        goto deschedule_done;

-    switch ( prev->state )
-    {
+    /* do some accounting */
+    ranfor = (s32)(now - prev->lastschd);
+    ASSERT((ranfor > 0));
+    prev->cpu_time += ranfor;
+
+    /* calculate mcus used and update avt */
+    mcus = ranfor / MCU;
+    if (ranfor % MCU) mcus++;           /* always round up */
+    prev->avt += mcus * prev->mcu_advance;
+    prev->evt = prev->avt;              /* RN: XXX BVT deal with warping here */
+
+    /* dequeue */
+    __del_from_runqueue(prev);
+
+    switch (prev->state) {
     case TASK_INTERRUPTIBLE:
-        if ( signal_pending(prev) )
-        {
-            prev->state = TASK_RUNNING;
+        if (signal_pending(prev)) {
+            prev->state = TASK_RUNNING; /* but has events pending */
             break;
         }
     case TASK_UNINTERRUPTIBLE:
+    case TASK_WAIT:
+    case TASK_DYING:
     default:
-        __del_from_runqueue(prev);
+        /* done if not running; otherwise fall through and requeue */
+        goto deschedule_done;
     case TASK_RUNNING:;
     }
+
+    /* requeue */
+    __add_to_runqueue_tail(prev);
+
+
+ deschedule_done:
     clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events);

-    next = NULL;
-    list_for_each(tmp, &schedule_data[smp_processor_id()].runqueue) {
+    /*
+     * Pick a new domain
+     */
+
+    /* we should at least have the idle task */
+    ASSERT(!list_empty(&schedule_data[this_cpu].runqueue));
+
+    /*
+     * scan through the run queue and pick the task with the lowest evt
+     * *and* the task with the second lowest evt.
+     * this code is O(n) but we expect n to be small.
+     */
+    next       = schedule_data[this_cpu].idle;
+    next_prime = NULL;
+
+    next_evt       = 0xffffffff;
+    next_prime_evt = 0xffffffff;
+    min_avt        = 0xffffffff;        /* to calculate svt */
+
+
+    list_for_each(tmp, &schedule_data[this_cpu].runqueue) {
         p = list_entry(tmp, struct task_struct, run_list);
-        next = p;
-        if ( !is_idle_task(next) ) break;
+        if (p->evt < next_evt) {
+            next_prime     = next;
+            next_prime_evt = next_evt;
+            next           = p;
+            next_evt       = p->evt;
+        } else if (next_prime_evt == 0xffffffff) {
+            next_prime_evt = p->evt;
+            next_prime     = p;
+        } else if (p->evt < next_prime_evt) {
+            next_prime_evt = p->evt;
+            next_prime     = p;
+        }
+        /* determine system virtual time */
+        if (p->avt < min_avt)
+            min_avt = p->avt;
     }
+    ASSERT(next != NULL);   /* we should have at least the idle task */
+
+    /* update system virtual time */
+    if (min_avt != 0xffffffff) schedule_data[this_cpu].svt = min_avt;
+
+    if (is_idle_task(next)) {
+        r_time = ctx_allow;
+        goto sched_done;
+    }
+
+    if (next_prime == NULL || is_idle_task(next_prime)) {
+        /* we have only one runnable task besides the idle task */
+        r_time = 10 * ctx_allow;        /* RN: random constant */
+        goto sched_done;
+    }
+
+    /*
+     * if we are here we have two runnable tasks.
+     * work out how long 'next' can run till its evt is greater than
+     * 'next_prime's evt, taking the context switch allowance into account.
+     */
+    ASSERT(next_prime->evt > next->evt);
+    r_time = ((next_prime->evt - next->evt)/next->mcu_advance) + ctx_allow;
+
+ sched_done:
+    ASSERT(r_time != 0);
+    ASSERT(r_time >= ctx_allow);
+
+    if ( (r_time == 0) || (r_time < ctx_allow) ) {
+        printk("[%02d]: %x\n", this_cpu, r_time);
+        dump_rqueue(&schedule_data[this_cpu].runqueue, "foo");
+    }
+
     prev->has_cpu = 0;
     next->has_cpu = 1;
@@ -257,6 +380,17 @@ asmlinkage void schedule(void)
     schedule_data[this_cpu].prev = prev;
     schedule_data[this_cpu].curr = next;

+    next->lastschd = now;
+
+    /* reprogram the timer */
+ timer_redo:
+    schedule_data[this_cpu].s_timer.expires = now + r_time;
+    if (add_ac_timer(&schedule_data[this_cpu].s_timer) == 1) {
+        printk("SCHED[%02d]: Shit this shouldn't happen\n", this_cpu);
+        now = NOW();
+        goto timer_redo;
+    }
+
     spin_unlock_irq(&schedule_data[this_cpu].lock);

     if ( unlikely(prev == next) )
@@ -266,6 +400,8 @@ asmlinkage void schedule(void)
         goto same_process;
     }

+    perfc_incrc(sched_ctx);
+
     prepare_to_switch();
     switch_to(prev, next);
     prev = schedule_data[this_cpu].prev;
@@ -274,67 +410,56 @@ asmlinkage void schedule(void)

     if ( prev->state == TASK_DYING ) release_task(prev);

  same_process:
+    /* update the domain's notion of time */
     update_dom_time(current->shared_info);

-    if ( test_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events) )
+    if ( test_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events) ) {
         goto need_resched_back;
+    }
     return;
 }
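
Concrete numbers make the accounting above easier to follow. With MCU = 100us and the default mcu_advance = 10, a domain that ran for 1.25ms is charged ceil(1250us/100us) = 13 mcus and its avt advances by 13 * 10 = 130. The slice granted to 'next' is the time it can run before its evt passes the runner-up's, plus the context switch allowance; note that the committed expression appears to add an MCU-denominated count ((evt gap)/mcu_advance) directly to the nanosecond-denominated ctx_allow, so this sketch inserts the MCU conversion explicitly:

    #include <stdio.h>

    typedef unsigned int u32;

    #define MCU       100000LL        /* 100us in ns */
    #define CTX_ALLOW 10000000LL      /* 10ms in ns */

    int main(void)
    {
        /* charging the descheduled domain */
        long long ranfor = 1250000;   /* ran for 1.25ms */
        u32 mcu_advance  = 10;        /* default from sched_add_domain() */
        u32 avt          = 5000;

        long long mcus = ranfor / MCU;
        if (ranfor % MCU) mcus++;                 /* always round up */
        avt += (u32)(mcus * mcu_advance);         /* 5000 + 13*10 = 5130 */

        /* slice for the incoming domain */
        u32 next_evt = 5130, next_prime_evt = 5200;
        long long r_time =
            ((next_prime_evt - next_evt) / mcu_advance) * MCU + CTX_ALLOW;

        printf("avt=%u r_time=%lldns\n", avt, r_time);  /* 5130, 10700000 */
        return 0;
    }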

 /*
- * The scheduling timer.
+ * The scheduler timer.
  */
-static __cacheline_aligned int count[NR_CPUS];
 static void sched_timer(unsigned long foo)
 {
-    int cpu = smp_processor_id();
+    int                 cpu  = smp_processor_id();
     struct task_struct *curr = schedule_data[cpu].curr;
-    s_time_t now;
-    int res;
-
-    /* reschedule after each 5 ticks */
-    if (count[cpu] >= 5) {
-        set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
-        count[cpu] = 0;
-    }
-    count[cpu]++;
+    /* cause a reschedule */
+    set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
+    perfc_incrc(sched_irq);
+}

-    /*
-     * deliver virtual timer interrups to domains if we are CPU 0 XXX RN: We
-     * don't have a per CPU list of domains yet. Otherwise would use that.
-     * Plus, this should be removed anyway once Domains "know" about virtual
-     * time and timeouts. But, it's better here then where it was before.
-     */
-    if (cpu == 0) {
-        struct task_struct *p;
-        unsigned long cpu_mask = 0;
-
-        /* send virtual timer interrupt */
-        read_lock(&tasklist_lock);
-        p = &idle0_task;
-        do {
-            if ( is_idle_task(p) ) continue;
-            cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
-        }
-        while ( (p = p->next_task) != &idle0_task );
-        read_unlock(&tasklist_lock);
-        guest_event_notify(cpu_mask);
+/*
+ * The Domain virtual time timer
+ */
+static void virt_timer(unsigned long foo)
+{
+    unsigned long cpu_mask = 0;
+    struct task_struct *p;
+    s_time_t now;
+    int res;
+
+    /* send virtual timer interrupt */
+    read_lock(&tasklist_lock);
+    p = &idle0_task;
+    do {
+        if ( is_idle_task(p) ) continue;
+        cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
     }
+    while ( (p = p->next_task) != &idle0_task );
+    read_unlock(&tasklist_lock);
+    guest_event_notify(cpu_mask);

-    again:
+ again:
     now = NOW();
-    s_timer[cpu].expires  = now + MILLISECS(10);
-    res=add_ac_timer(&s_timer[cpu]);
-
-    TRC(printk("SCHED[%02d] timer(): now=0x%08X%08X timo=0x%08X%08X\n",
-               cpu, (u32)(now>>32), (u32)now,
-               (u32)(s_timer[cpu].expires>>32), (u32)s_timer[cpu].expires));
+    v_timer.expires = now + MILLISECS(10);
+    res = add_ac_timer(&v_timer);
     if (res==1) goto again;
-
 }

 /*
  * Initialise the data structures
  */
@@ -350,11 +475,15 @@ void __init scheduler_init(void)
         spin_lock_init(&schedule_data[i].lock);
         schedule_data[i].prev = &idle0_task;
         schedule_data[i].curr = &idle0_task;
-
+
         /* a timer for each CPU */
-        init_ac_timer(&s_timer[i]);
-        s_timer[i].function = &sched_timer;
+        init_ac_timer(&schedule_data[i].s_timer);
+        schedule_data[i].s_timer.function = &sched_timer;
+
     }
+    schedule_data[0].idle = &idle0_task;    /* idle on CPU 0 is special */
+
+    init_ac_timer(&v_timer);
+    v_timer.function = &virt_timer;
 }

 /*
@@ -362,10 +491,121 @@ void __init scheduler_init(void)
  * This has to be done *after* the timers, e.g., APICs, have been initialised
  */
 void schedulers_start(void)
-{
+{
     printk("Start schedulers\n");
     __cli();
     sched_timer(0);
+    virt_timer(0);
     smp_call_function((void *)sched_timer, NULL, 1, 1);
     __sti();
 }
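
Both timers re-arm relative to a fresh NOW() rather than to the previous expiry (the timer_redo loop in schedule() and the again loop in virt_timer()), so a late callback drifts instead of piling up, and an expiry that add_ac_timer() rejects as already past (return value 1 here) is simply recomputed. The same pattern in generic form; now_ns() and arm_timer() are hypothetical stand-ins, not the Xen API:

    typedef long long s_time_t;

    extern s_time_t now_ns(void);            /* stand-in for NOW() */
    extern int arm_timer(s_time_t expiry);   /* nonzero if expiry already past */

    #define PERIOD_NS 10000000LL             /* 10ms, as in the patch */

    static void periodic_cb(void)
    {
        /* ... do the periodic work ... */
        for (;;) {
            s_time_t expiry = now_ns() + PERIOD_NS;
            if (arm_timer(expiry) == 0)
                break;        /* armed in the future: done */
            /* lost the race with the clock; retry from a fresh now */
        }
    }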
+
+
+/****************************************************************************
+ * Functions for legacy support.
+ * schedule_timeout() is used in a number of places and is a bit meaningless
+ * in the context of Xen, as domains are not able to call it and all
+ * their entry points into Xen should be asynchronous. If a domain wishes
+ * to block for a while it should use Xen's sched_op entry point.
+ ****************************************************************************/
+
+static void process_timeout(unsigned long __data)
+{
+    struct task_struct * p = (struct task_struct *) __data;
+    wake_up(p);
+}
+
+long schedule_timeout(long timeout)
+{
+    struct timer_list timer;
+    unsigned long expire;
+
+    switch (timeout)
+    {
+    case MAX_SCHEDULE_TIMEOUT:
+        /*
+         * These two special cases are useful to be comfortable in the caller.
+         * Nothing more. We could take MAX_SCHEDULE_TIMEOUT from one of the
+         * negative values, but I'd like to return a valid offset (>=0) to
+         * allow the caller to do everything it wants with the retval.
+         */
+        schedule();
+        goto out;
+    default:
+        /*
+         * Another bit of PARANOID. Note that the retval will be 0 since no
+         * piece of kernel is supposed to do a check for a negative retval of
+         * schedule_timeout() (since it should never happen anyway). You just
+         * have the printk() that will tell you if something has gone wrong
+         * and where.
+         */
+        if (timeout < 0)
+        {
+            printk(KERN_ERR "schedule_timeout: wrong timeout "
+                   "value %lx from %p\n", timeout,
+                   __builtin_return_address(0));
+            current->state = TASK_RUNNING;
+            goto out;
+        }
+    }
+
+    expire = timeout + jiffies;
+
+    init_timer(&timer);
+    timer.expires = expire;
+    timer.data = (unsigned long) current;
+    timer.function = process_timeout;
+
+    add_timer(&timer);
+    schedule();
+    del_timer_sync(&timer);
+
+    timeout = expire - jiffies;
+
+ out:
+    return timeout < 0 ? 0 : timeout;
+}
+
+/****************************************************************************
+ * debug function
+ ****************************************************************************/
+
+static void dump_rqueue(struct list_head *queue, char *name)
+{
+    struct list_head *list;
+    int loop = 0;
+    struct task_struct *p;
+
+    printk("QUEUE %s %lx   n: %lx, p: %lx\n", name, (unsigned long)queue,
+           (unsigned long) queue->next, (unsigned long) queue->prev);
+    list_for_each (list, queue) {
+        p = list_entry(list, struct task_struct, run_list);
+        printk("%3d: %3d has=%c mcua=0x%04X ev=0x%08X av=0x%08X c=0x%X%08X\n",
+               loop++, p->domain,
+               p->has_cpu ? 'T':'F',
+               p->mcu_advance, p->evt, p->avt,
+               (u32)(p->cpu_time>>32), (u32)p->cpu_time);
+        printk("         l: %lx n: %lx  p: %lx\n",
+               (unsigned long)list, (unsigned long)list->next,
+               (unsigned long)list->prev);
+    }
+    return;
+}
+
+void dump_runq(u_char key, void *dev_id, struct pt_regs *regs)
+{
+    u_long   flags;
+    s_time_t now = NOW();
+    int i;
+
+    printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns NOW=0x%08X%08X\n",
+           (u32)MCU, (u32)ctx_allow, (u32)(now>>32), (u32)now);
+    for (i = 0; i < smp_num_cpus; i++) {
+        spin_lock_irqsave(&schedule_data[i].lock, flags);
+        printk("CPU[%02d] svt=0x%08X ", i, (s32)schedule_data[i].svt);
+        dump_rqueue(&schedule_data[i].runqueue, "rq");
+        spin_unlock_irqrestore(&schedule_data[i].lock, flags);
+    }
+    return;
+}
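
For reference, the formats fixed by the printk() strings above produce output along these lines (all values hypothetical): one summary line, then per CPU a svt line followed by two lines per queued task:

    BVT: mcu=0x000186A0ns ctx_allow=0x00989680ns NOW=0x0000000BF3A1C200
    CPU[00] svt=0x00001412 QUEUE rq fc52b000   n: fc52c044, p: fc52d044
      0:   1 has=T mcua=0x000A ev=0x00001412 av=0x00001412 c=0x000BC614E
             l: fc52c044 n: fc52d044  p: fc52b000
      1:   2 has=F mcua=0x000A ev=0x00001430 av=0x00001430 c=0x0001E8480
             l: fc52d044 n: fc52b000  p: fc52c044

mcu and ctx_allow decode to 100,000 ns and 10,000,000 ns respectively; has=T marks the task currently holding the CPU.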