From a16eb6b1a16f60f8616b37996d6da32fcdcfecec Mon Sep 17 00:00:00 2001
From: "rn@wyvis.research.intel-research.net"
Date: Fri, 14 Mar 2003 15:43:32 +0000
Subject: bitkeeper revision 1.124 (3e71f8a4QvveKwitZNAJi1H3BJpPEQ)

ac_timer.c:
  Rewrite of do_ac_timer/add_ac_timer, plus perf counters.
apic.c:
  Added perf counters; try to disable the APIC timer when the timeout
  value is zero.
irq.c:
  Count interrupts and the cycles spent in them.
sched.h:
  Added fields for BVT.
schedule.c:
  BVT without warping.
keyhandler.c:
  Added handler for dumping run queues; moved the ac_timer dump handler
  here.
.del-dom0_ops.h~f77c7a14cfa618f8:
  Delete: tools/domain_builder/dom0_ops.h
---
 .rootkeys                       |   1 -
 tools/domain_builder/dom0_ops.h |  81 --------
 xen/arch/i386/apic.c            |   8 +
 xen/arch/i386/irq.c             |   9 +
 xen/common/ac_timer.c           | 245 ++++++++++++----------
 xen/common/keyhandler.c         |  52 ++---
 xen/common/schedule.c           | 438 +++++++++++++++++++++++-----------------
 xen/include/xeno/sched.h        |  53 ++---
 8 files changed, 469 insertions(+), 418 deletions(-)
 delete mode 100644 tools/domain_builder/dom0_ops.h

diff --git a/.rootkeys b/.rootkeys
index da527ca68f..9bb57b3a37 100644
--- a/.rootkeys
+++ b/.rootkeys
@@ -182,7 +182,6 @@
 3e4d00468-FN2VDeEHo96zxrMHK_mA tools/domain_builder/Makefile
 3e4d0046SPau_y0sw2WLJz8QkqNoRA tools/domain_builder/README
 3e4d0046bbdH0GsI9J_1Eb4ZQHfIiQ tools/domain_builder/dom0_defs.h
-3e4d0046RgYCfGOw6qGz_7kYLMV2Vw tools/domain_builder/dom0_ops.h
 3e4d0046ouLij_CMN_j7-dUHZIBI_A tools/domain_builder/dom_builder.c
 3e4d0046EKs06fY0CWDEgZQcn7DYUg tools/domain_builder/dom_kill.c
 3e4d0046aPbGiRTtdWxqY5b3ytWurA tools/domain_builder/hypervisor_defs.h
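
The header deleted below defined dom0's command interface as a tagged union: a cmd code selects which member of the union u is live. For readers new to the pattern, here is a minimal, hypothetical dispatcher over that layout; the handle_* functions are invented for illustration, and the types are the ones declared in the deleted header.

    /* Sketch only: dispatch on dom0_op_t.cmd (types from the deleted header,
     * handlers hypothetical). */
    extern long handle_newdomain(dom0_newdomain_t *op);
    extern long handle_killdomain(dom0_killdomain_t *op);
    extern long handle_getmemlist(dom0_getmemlist_t *op);

    long dispatch_dom0_op(dom0_op_t *op)
    {
        switch (op->cmd) {
        case DOM0_NEWDOMAIN:  return handle_newdomain(&op->u.newdomain);
        case DOM0_KILLDOMAIN: return handle_killdomain(&op->u.killdomain);
        case DOM0_GETMEMLIST: return handle_getmemlist(&op->u.getmemlist);
        default:              return -1; /* unknown, or not passed down to Xen */
        }
    }
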
diff --git a/tools/domain_builder/dom0_ops.h b/tools/domain_builder/dom0_ops.h
deleted file mode 100644
index 6c60a93ff6..0000000000
--- a/tools/domain_builder/dom0_ops.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/******************************************************************************
- * dom0_ops.h
- *
- * Process command requests from domain-0 guest OS.
- *
- * Copyright (c) 2002, K A Fraser, B Dragovic
- */
-
-#define DOM0_NEWDOMAIN   0
-#define DOM0_KILLDOMAIN  1
-#define DOM0_GETMEMLIST  2
-#define DOM0_STARTDOM    4
-#define MAP_DOM_MEM      6 /* Not passed down to Xen */
-#define DO_PGUPDATES     7 /* Not passed down to Xen */
-#define MAX_CMD          8
-
-#define MAX_CMD_LEN    256
-
-typedef struct dom0_newdomain_st
-{
-    unsigned int domain;
-    unsigned int memory_kb;
-    unsigned int num_vifs;  // temporary
-    unsigned long pg_head;  // return parameter
-} dom0_newdomain_t;
-
-typedef struct dom0_killdomain_st
-{
-    unsigned int domain;
-    int          force;
-} dom0_killdomain_t;
-
-typedef struct dom0_getmemlist_st
-{
-    unsigned long start_pfn;
-    unsigned long num_pfns;
-    void *buffer;
-} dom0_getmemlist_t;
-
-/* This is entirely processed by XenoLinux */
-typedef struct dom_mem
-{
-    unsigned int domain;
-    unsigned long vaddr;
-    unsigned long start_pfn;
-    int tot_pages;
-} dom_mem_t;
-
-/* This is entirely processed by XenoLinux */
-typedef struct dom_pgupdate
-{
-    unsigned long pgt_update_arr;
-    unsigned long num_pgt_updates;
-} dom_pgupdate_t;
-
-typedef struct domain_launch
-{
-    unsigned int domain;
-    unsigned long l2_pgt_addr;
-    unsigned long virt_load_addr;
-    unsigned long virt_shinfo_addr;
-    unsigned long virt_startinfo_addr;
-    unsigned int num_vifs;
-    char cmd_line[MAX_CMD_LEN];
-} dom_meminfo_t;
-
-typedef struct dom0_op_st
-{
-    unsigned long cmd;
-    union
-    {
-        dom0_newdomain_t newdomain;
-        dom0_killdomain_t killdomain;
-        dom0_getmemlist_t getmemlist;
-        dom_mem_t dommem;
-        dom_pgupdate_t pgupdate;
-        dom_meminfo_t meminfo;
-    }
-    u;
-} dom0_op_t;
-
diff --git a/xen/arch/i386/apic.c b/xen/arch/i386/apic.c
index 865a279d8c..0acf7067c3 100644
--- a/xen/arch/i386/apic.c
+++ b/xen/arch/i386/apic.c
@@ -659,6 +659,13 @@ int reprogram_ac_timer(s_time_t timeout)
     s_time_t    expire;
     u64         apic_tmict;
 
+    if (timeout == 0) {
+        /* XXX RN: not sure if this disables it or causes an interrupt
+         * to go off immediately */
+        apic_tmict = 0;
+        goto reprogram;
+    }
+
     now = NOW();
     expire = timeout - now; /* value from now */
 
@@ -680,6 +687,7 @@ int reprogram_ac_timer(s_time_t timeout)
         return 0;
     }
 
+ reprogram:
     /* programm timer */
     apic_write(APIC_TMICT, (unsigned long)apic_tmict);
 
diff --git a/xen/arch/i386/irq.c b/xen/arch/i386/irq.c
index e58fb8f2ad..312cfe7970 100644
--- a/xen/arch/i386/irq.c
+++ b/xen/arch/i386/irq.c
@@ -36,6 +36,7 @@
 
 #include
 #include
+#include
 
 /*
  * Linux has a controller-independent x86 interrupt architecture.
@@ -469,6 +470,11 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
     struct irqaction * action;
     unsigned int status;
 
+    u32 cc_start, cc_end;
+
+    perfc_incra(irqs, cpu);
+    rdtscl(cc_start);
+
     spin_lock(&desc->lock);
     desc->handler->ack(irq);
     /*
@@ -530,6 +536,9 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
     if (softirq_pending(cpu))
         do_softirq();
 
+    rdtscl(cc_end);
+    perfc_adda(irq_time, cpu, cc_end - cc_start);
+
     return 1;
 }
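
The do_IRQ change above brackets each interrupt with two TSC reads and accumulates the cycle delta into a per-CPU perf counter. A freestanding sketch of the same pattern, assuming an x86 rdtsc and an invented counter array (the real code uses rdtscl() and perfc_adda()):

    #define EXAMPLE_NR_CPUS 4                      /* illustrative only */

    static unsigned long long irq_cycles[EXAMPLE_NR_CPUS];

    static inline unsigned int tsc_lo(void)
    {
        unsigned int lo, hi;
        __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
        return lo;
    }

    void account_irq(int cpu, void (*handler)(void))
    {
        unsigned int start = tsc_lo();
        handler();
        /* unsigned subtraction stays correct across one 32-bit wrap */
        irq_cycles[cpu] += (unsigned int)(tsc_lo() - start);
    }
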
diff --git a/xen/common/ac_timer.c b/xen/common/ac_timer.c
index 9bb5d7e301..73ac893e08 100644
--- a/xen/common/ac_timer.c
+++ b/xen/common/ac_timer.c
@@ -23,9 +23,9 @@
 #include
 #include
 #include
-#include
 #include
-#include
+
+#include
 #include
 #include
@@ -34,20 +34,16 @@
 #include
 #include
 
-
-#undef AC_TIMER_TRACE
-#undef AC_TIMER_STATS
-
 #ifdef AC_TIMER_TRACE
 #define TRC(_x) _x
 #else
 #define TRC(_x)
 #endif
 
-/*
+/*****************************************************************************
  * We pull handlers off the timer list this far in future,
  * rather than reprogramming the time hardware.
- */
+ *****************************************************************************/
 #define TIMER_SLOP (50*1000) /* ns */
 
 /* A timer list per CPU */
 typedef struct ac_timers_st
 {
     spinlock_t        lock;
     struct list_head  timers;
-    struct ac_timer  *prev, *curr;
+    s_time_t          max_diff;
 } __cacheline_aligned ac_timers_t;
 static ac_timers_t ac_timers[NR_CPUS];
 
-#ifdef AC_TIMER_STATS
-#define BUCKETS 1000
-#define MAX_STATS
-typedef struct act_stats_st
-{
-    u32 count;
-    u32 times[2*(BUCKETS)];
-} __cacheline_aligned act_stats_t;
-static act_stats_t act_stats[NR_CPUS];
-
-#endif
-
 /* local prototypes */
 static int  detach_ac_timer(struct ac_timer *timer);
-/*static void ac_timer_debug(unsigned long);*/
 
-/*
+
+/*****************************************************************************
  * add a timer.
  * return value:
  *  0: success
  *  1: failure, timer in the past or timeout value to small
  *  -1: failure, timer uninitialised
  * fail
- */
+ *****************************************************************************/
 int add_ac_timer(struct ac_timer *timer)
 {
-    int cpu = smp_processor_id();
-    unsigned long flags;
-    s_time_t now;
+    int           cpu = smp_processor_id();
+    unsigned long flags;
+    s_time_t      now;
 
     /* make sure timeout value is in the future */
-    
+
     now = NOW();
-    if (timer->expires <= now) {
+    if (timer->expires <= now) {
         TRC(printk("ACT[%02d] add_ac_timer:now=0x%08X%08X>expire=0x%08X%08X\n",
-                   cpu, (u32)(now>>32), (u32)now,
-                   (u32)(timer->expires>>32), (u32)timer->expires));
+                   cpu, (u32)(now>>32), (u32)now,
+                   (u32)(timer->expires>>32), (u32)timer->expires));
         return 1;
     }
     spin_lock_irqsave(&ac_timers[cpu].lock, flags);
@@ -104,79 +88,89 @@ int add_ac_timer(struct ac_timer *timer)
      * reprogramm the timer
      */
     if (list_empty(&ac_timers[cpu].timers)) {
-        /* Reprogramm and add to head of list */
         if (!reprogram_ac_timer(timer->expires)) {
+            printk("ACT[%02d] add at head failed\n", cpu);
             spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
             return 1; /* failed */
         }
         list_add(&timer->timer_list, &ac_timers[cpu].timers);
     } else {
         struct list_head *pos;
-        struct ac_timer  *t;
+        struct ac_timer  *t;
 
-        list_for_each(pos, &ac_timers[cpu].timers) {
-            t = list_entry(pos, struct ac_timer, timer_list);
-            if (t->expires > timer->expires)
+        list_for_each(pos, &ac_timers[cpu].timers) {
+            t = list_entry(pos, struct ac_timer, timer_list);
+            if (t->expires > timer->expires)
                 break;
-        }
-        list_add (&(timer->timer_list), pos->prev);
+        }
+        list_add (&(timer->timer_list), pos->prev);
 
-        if (timer->timer_list.prev == &ac_timers[cpu].timers) {
-            /* added at head */
+        if (timer->timer_list.prev == &ac_timers[cpu].timers) {
+            /* added at head */
             if (!reprogram_ac_timer(timer->expires)) {
-                detach_ac_timer(timer);
+                printk("ACT[%02d] add at head failed\n", cpu);
+                detach_ac_timer(timer);
                 spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
                 return 1; /* failed */
             }
-        }
+        }
     }
     spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
     return 0;
 }
 
-/*
- * remove a timer
+/*****************************************************************************
+ * detach a timer (no locking)
  * return values:
  *  0: success
  *  -1: bogus timer
- */
+ *****************************************************************************/
 static int detach_ac_timer(struct ac_timer *timer)
 {
-    TRC(int cpu = smp_processor_id());
     TRC(printk("ACT [%02d] detach(): \n", cpu));
     list_del(&timer->timer_list);
     timer->timer_list.next = NULL;
     return 0;
 }
 
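
add_ac_timer() keeps each per-CPU list sorted by expiry and only touches the timer hardware when the insertion lands at the head. A hypothetical caller, using the fields this file relies on (expires, function, data) and the init_ac_timer()/NOW()/MILLISECS() helpers seen elsewhere in this patch:

    static void my_timeout(unsigned long data)
    {
        /* runs from do_ac_timer() with the timer already detached */
    }

    static struct ac_timer my_timer;

    static void arm_example_timer(void)
    {
        init_ac_timer(&my_timer);
        my_timer.function = &my_timeout;
        my_timer.data     = 0;
        my_timer.expires  = NOW() + MILLISECS(5);
        if (add_ac_timer(&my_timer) != 0) {
            /* expiry was already in the past (or reprogramming failed);
             * pick a later deadline and retry, as schedule() does */
        }
    }
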
-/*
+/*****************************************************************************
  * remove a timer
  * return values:
  *  0: success
  *  -1: bogus timer
- */
+ *****************************************************************************/
 int rem_ac_timer(struct ac_timer *timer)
 {
-    int cpu = smp_processor_id();
+    int           cpu = smp_processor_id();
     int           res = 0;
     unsigned long flags;
 
     TRC(printk("ACT [%02d] remove(): timo=%lld \n", cpu, timer->expires));
-
     spin_lock_irqsave(&ac_timers[cpu].lock, flags);
-    if (!timer->timer_list.next == NULL)
-        res = detach_ac_timer(timer);
+    if (timer->timer_list.next) {
+        res = detach_ac_timer(timer);
+
+        if (timer->timer_list.prev == &ac_timers[cpu].timers) {
+            /* just removed the head */
+            if (list_empty(&ac_timers[cpu].timers)) {
+                reprogram_ac_timer((s_time_t) 0);
+            }
+            /* XXX should actually reprogram APIC to new head */
+        }
+    } else
+        res = -1;
+
     spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
     return res;
 }
 
-/*
+/*****************************************************************************
  * modify a timer, i.e., set a new timeout value
  * return value:
  *  0: sucess
  *  -1: error
- */
+ *****************************************************************************/
 int mod_ac_timer(struct ac_timer *timer, s_time_t new_time)
 {
     if (rem_ac_timer(timer) != 0)
@@ -187,69 +181,59 @@ int mod_ac_timer(struct ac_timer *timer, s_time_t new_time)
     return 0;
 }
 
-/*
+/*****************************************************************************
  * do_ac_timer
  * deal with timeouts and run the handlers
- */
+ *****************************************************************************/
 void do_ac_timer(void)
 {
-    int cpu = smp_processor_id();
-    unsigned long flags;
-    struct ac_timer *t;
+    int              cpu = smp_processor_id();
+    unsigned long    flags;
+    struct ac_timer *t;
+    s_time_t         diff, now = NOW();
+    long             max;
 
     spin_lock_irqsave(&ac_timers[cpu].lock, flags);
 
  do_timer_again:
     TRC(printk("ACT [%02d] do(): now=%lld\n", cpu, NOW()));
-
-    /* Sanity: is the timer list empty? */
-    if ( list_empty(&ac_timers[cpu].timers) )
-        printk("ACT[%02d] do_ac_timer(): timer irq without timer\n", cpu);
-
-#ifdef AC_TIMER_STATS
-    {
-        s32 diff;
-        u32 i;
-        diff = ((s32)(NOW() - t->expires)) / 1000; /* delta in us */
-        if (diff < -BUCKETS)
-            diff = -BUCKETS;
-        else if (diff > BUCKETS)
-            diff = BUCKETS;
-        act_stats[cpu].times[diff+BUCKETS]++;
-        act_stats[cpu].count++;
-
-        if (act_stats[cpu].count >= 5000) {
-            printk("ACT Stats\n");
-            for (i=0; i < 2*BUCKETS; i++) {
-                if (act_stats[cpu].times[i] != 0)
-                    printk("ACT [%02d]: %3dus: %5d\n",
-                           cpu,i-BUCKETS, act_stats[cpu].times[i]);
-                act_stats[cpu].times[i]=0;
-            }
-            act_stats[cpu].count = 0;
-            printk("\n");
-        }
+
+    /* Sanity: is the timer list empty? */
+    if ( list_empty(&ac_timers[cpu].timers) ) {
+        /*
+         * XXX RN: This shouldn't happen, but does! Two possibilities:
+         * - Race condition between removing and resetting the APIC
+         * - setting an APIC timeout value of 0 causes an immediate
+         *   timer interrupt to fire.
+         * Neither of these should be critical!
+         */
+        spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
+        return;
     }
-#endif
 
     /* Handle all timeouts in the near future. */
     while ( !list_empty(&ac_timers[cpu].timers) )
     {
-        t = list_entry(ac_timers[cpu].timers.next,
-                       struct ac_timer, timer_list);
+        t = list_entry(ac_timers[cpu].timers.next,struct ac_timer, timer_list);
         if ( t->expires > (NOW() + TIMER_SLOP) ) break;
+
+        /* do some stats */
+        diff = (now - t->expires);
+        if (diff > 0x7fffffff) diff = 0x7fffffff;  /* THIS IS BAD! */
+        max = perfc_valuea(ac_timer_max, cpu);
+        if (diff > max) perfc_seta(ac_timer_max, cpu, diff);
+
         detach_ac_timer(t);
         spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
         if ( t->function != NULL )
             t->function(t->data);
         spin_lock_irqsave(&ac_timers[cpu].lock, flags);
     }
-    
+
     /* If list not empty then reprogram timer to new head of list */
     if ( !list_empty(&ac_timers[cpu].timers) )
     {
-        t = list_entry(ac_timers[cpu].timers.next,
-                       struct ac_timer, timer_list);
+        t = list_entry(ac_timers[cpu].timers.next,struct ac_timer, timer_list);
         if ( t->expires > 0 )
         {
             TRC(printk("ACT [%02d] do(): reprog timo=%lld\n",cpu,t->expires));
@@ -259,21 +243,23 @@ void do_ac_timer(void)
                 goto do_timer_again;
             }
         }
+    } else {
+        reprogram_ac_timer((s_time_t) 0);
     }
     spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
     TRC(printk("ACT [%02d] do(): end\n", cpu));
 }
 
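
The "THIS IS BAD!" clamp above squeezes a 64-bit nanosecond delta into a 32-bit perf counter. Pulled out as its own helper, the saturating conversion looks like this (a sketch, not code from the tree):

    #include <stdint.h>

    /* Saturate a 64-bit delta into [0, INT32_MAX], the same clamp
     * do_ac_timer() applies before perfc_seta(). */
    static inline int32_t sat_to_s32(int64_t v)
    {
        if (v < 0)          return 0;
        if (v > INT32_MAX)  return INT32_MAX;   /* 0x7fffffff, as above */
        return (int32_t)v;
    }
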
-/*
+/*****************************************************************************
  * debug dump_queue
  * arguments: queue head, name of queue
- */
+ *****************************************************************************/
 static void dump_tqueue(struct list_head *queue, char *name)
 {
     struct list_head *list;
     int loop = 0;
-    struct ac_timer *t;
+    struct ac_timer  *t;
 
     printk ("QUEUE %s %lx   n: %lx, p: %lx\n", name, (unsigned long)queue,
             (unsigned long) queue->next, (unsigned long) queue->prev);
@@ -288,19 +274,21 @@ static void dump_tqueue(struct list_head *queue, char *name)
     return;
 }
 
-
-static void dump_timerq(u_char key, void *dev_id, struct pt_regs *regs)
+void dump_timerq(u_char key, void *dev_id, struct pt_regs *regs)
 {
     u_long   flags;
     s_time_t now = NOW();
+    int      i;
 
-    printk("Dumping ac_timer queues for cpu 0: NOW=0x%08X%08X\n",
+    printk("Dumping ac_timer queues: NOW=0x%08X%08X\n",
            (u32)(now>>32), (u32)now);
-
-    spin_lock_irqsave(&ac_timers[0].lock, flags);
-    dump_tqueue(&ac_timers[0].timers, "ac_time");
-    spin_unlock_irqrestore(&ac_timers[0].lock, flags);
-    printk("\n");
+    for (i = 0; i < smp_num_cpus; i++) {
+        printk("CPU[%02d] ", i);
+        spin_lock_irqsave(&ac_timers[i].lock, flags);
+        dump_tqueue(&ac_timers[i].timers, "ac_time");
+        spin_unlock_irqrestore(&ac_timers[i].lock, flags);
+        printk("\n");
+    }
     return;
 }
 
@@ -316,6 +304,51 @@ void __init ac_timer_init(void)
         INIT_LIST_HEAD(&ac_timers[i].timers);
         spin_lock_init(&ac_timers[i].lock);
     }
-
-    add_key_handler('a', dump_timerq, "dump ac_timer queues");
 }
+
+/*****************************************************************************
+ * GRAVEYARD
+ *****************************************************************************/
+
+#if 0
+
+#ifdef AC_TIMER_STATS
+#define BUCKETS 1000
+#define MAX_STATS
+typedef struct act_stats_st
+{
+    u32 count;
+    u32 times[2*(BUCKETS)];
+} __cacheline_aligned act_stats_t;
+static act_stats_t act_stats[NR_CPUS];
+
+#endif
+
+#ifdef AC_TIMER_STATS
+    {
+        XXX this is at the wrong place
+        s32 diff;
+        u32 i;
+        diff = ((s32)(NOW() - t->expires)) / 1000; /* delta in us */
+        if (diff < -BUCKETS)
+            diff = -BUCKETS;
+        else if (diff > BUCKETS)
+            diff = BUCKETS;
+        act_stats[cpu].times[diff+BUCKETS]++;
+        act_stats[cpu].count++;
+
+        if (act_stats[cpu].count >= 5000) {
+            printk("ACT Stats\n");
+            for (i=0; i < 2*BUCKETS; i++) {
+                if (act_stats[cpu].times[i] != 0)
+                    printk("ACT [%02d]: %3dus: %5d\n",
+                           cpu,i-BUCKETS, act_stats[cpu].times[i]);
+                act_stats[cpu].times[i]=0;
+            }
+            act_stats[cpu].count = 0;
+            printk("\n");
+        }
+    }
+#endif
+
+#endif /* 0 */
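
The graveyard code preserved above bucketed timer latency into a histogram of microsecond deltas clamped to +/-BUCKETS. A standalone sketch of the same idea; note the array here has 2*BUCKETS+1 slots so that a delta of exactly +BUCKETS stays in range, which the original's times[2*(BUCKETS)] risks overrunning:

    #define BUCKETS 1000

    static unsigned int lat_hist[2 * BUCKETS + 1];

    void record_latency_us(long delta_us)
    {
        if (delta_us < -BUCKETS) delta_us = -BUCKETS;
        if (delta_us >  BUCKETS) delta_us =  BUCKETS;
        lat_hist[delta_us + BUCKETS]++;   /* index 0 == -BUCKETS us */
    }
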
diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c
index 8bb4fecab0..12fd4e7105 100644
--- a/xen/common/keyhandler.c
+++ b/xen/common/keyhandler.c
@@ -19,18 +19,18 @@ void add_key_handler(u_char key, key_handler *handler, char *desc)
     char *str;
 
     if(key_table[key].handler != NULL)
-        printk("Warning: overwriting handler for key 0x%x\n", key);
+        printk("Warning: overwriting handler for key 0x%x\n", key);
     key_table[key].handler = handler;
 
     str = key_table[key].desc;
     for(i = 0; i < STR_MAX; i++) {
-        if(*desc)
-            *str++ = *desc++;
-        else break;
+        if(*desc)
+            *str++ = *desc++;
+        else break;
     }
     if (i == STR_MAX)
-        key_table[key].desc[STR_MAX-1] = '\0';
+        key_table[key].desc[STR_MAX-1] = '\0';
 
     return;
 }
@@ -47,10 +47,10 @@ void show_handlers(u_char key, void *dev_id, struct pt_regs *regs)
 
     printk("'%c' pressed -> showing installed handlers\n", key);
     for(i=0; i < KEY_MAX; i++)
-        if(key_table[i].handler)
-            printk(" key '%c' (ascii '%02x') => %s\n",
-                   (i<33 || i>126)?(' '):(i),i,
-                   key_table[i].desc);
+        if(key_table[i].handler)
+            printk(" key '%c' (ascii '%02x') => %s\n",
+                   (i<33 || i>126)?(' '):(i),i,
+                   key_table[i].desc);
     return;
 }
 
@@ -94,36 +94,42 @@ void do_task_queues(u_char key, void *dev_id, struct pt_regs *regs)
     p = &idle0_task;
     do {
         printk("Xen: DOM %d, CPU %d [has=%c], state = %s, "
-               "hyp_events = %08x\n",
-               p->domain, p->processor, p->has_cpu ? 'T':'F',
-               task_states[p->state], p->hyp_events);
-        s = p->shared_info;
-        if(!is_idle_task(p)) {
-            printk("Guest: events = %08lx, event_enable = %08lx\n",
-                   s->events, s->events_enable);
-            printk("Notifying guest...\n");
-            set_bit(_EVENT_DEBUG, &s->events);
-        }
+               "hyp_events = %08x\n",
+               p->domain, p->processor, p->has_cpu ? 'T':'F',
+               task_states[p->state], p->hyp_events);
+        s = p->shared_info;
+        if(!is_idle_task(p)) {
+            printk("Guest: events = %08lx, event_enable = %08lx\n",
+                   s->events, s->events_enable);
+            printk("Notifying guest...\n");
+            set_bit(_EVENT_DEBUG, &s->events);
+        }
     } while ( (p = p->next_task) != &idle0_task );
 
     read_unlock_irqrestore(&tasklist_lock, flags);
 }
 
+extern void dump_timerq(u_char key, void *dev_id, struct pt_regs *regs);
+extern void dump_runq(u_char key, void *dev_id, struct pt_regs *regs);
+
+
 void initialize_keytable()
 {
     int i;
 
     /* first initialize key handler table */
     for(i = 0; i < KEY_MAX; i++)
-        key_table[i].handler = (key_handler *)NULL;
-
+        key_table[i].handler = (key_handler *)NULL;
+
     /* setup own handlers */
+    add_key_handler('a', dump_timerq, "dump ac_timer queues");
     add_key_handler('d', dump_registers, "dump registers");
-    add_key_handler('h', show_handlers, "show this message");
+    add_key_handler('h', show_handlers, "show this message");
     add_key_handler('p', perfc_printall, "print performance counters");
     add_key_handler('q', do_task_queues, "dump task queues + guest state");
-    add_key_handler('R', halt_machine, "reboot machine ungracefully");
+    add_key_handler('r', dump_runq, "dump run queue");
+    add_key_handler('R', halt_machine, "reboot machine ungracefully");
+
     return;
 }
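
With the handlers now centralised in initialize_keytable(), adding a debug key is a single add_key_handler() call. A hypothetical extra handler, following the signature used above:

    /* Hypothetical: dump per-CPU scheduler counters on 's'. */
    static void dump_sched_stats(u_char key, void *dev_id, struct pt_regs *regs)
    {
        printk("'%c' pressed -> dumping scheduler stats\n", key);
        /* ... walk schedule_data[] and print whatever is useful ... */
    }

    /* in initialize_keytable():
     *     add_key_handler('s', dump_sched_stats, "dump scheduler statistics");
     */
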
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 2f4ba31c32..ce46069167 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -40,8 +40,8 @@
 #endif
 
 
-#define MCU       (s32)MICROSECS(100) /* Minimum unit */
-#define CTX_ALLOW (s32)MILLISECS(10)  /* context switch allowance */
+#define MCU            (s32)MICROSECS(100)  /* Minimum unit */
+static s32 ctx_allow = (s32)MILLISECS(10);  /* context switch allowance */
 
 /*****************************************************************************
  * per CPU data for the scheduler.
@@ -50,15 +50,15 @@
 typedef struct schedule_data_st
 {
     spinlock_t          lock;           /* lock for protecting this */
     struct list_head    runqueue;       /* runqueue */
-    struct task_struct *prev, *curr;    /* dito */
-
-    long                svt;            /* system virtual time. per CPU??? */
-    struct ac_timer     s_timer;        /* scheduling timer */
+    struct task_struct *prev, *curr;    /* previous and current task */
+    struct task_struct *idle;           /* idle task for this cpu */
+    u32                 svt;            /* system virtual time. per CPU??? */
+    struct ac_timer     s_timer;        /* scheduling timer */
 } __cacheline_aligned schedule_data_t;
 schedule_data_t schedule_data[NR_CPUS];
 
-struct ac_timer     v_timer;            /* scheduling timer */
+struct ac_timer v_timer;                /* scheduling timer */
 
 static void virt_timer(unsigned long foo);
 
@@ -68,7 +68,7 @@ static void virt_timer(unsigned long foo);
 /* add a task to the head of the runqueue */
 static inline void __add_to_runqueue_head(struct task_struct * p)
 {
-    
     list_add(&p->run_list, &schedule_data[p->processor].runqueue);
 }
 /* add a task to the tail of the runqueue */
@@ -97,11 +97,19 @@ static inline int __task_on_runqueue(struct task_struct *p)
  ******************************************************************************/
 void sched_add_domain(struct task_struct *p)
 {
-    p->state    = TASK_UNINTERRUPTIBLE;
-    /* set avt end evt to system virtual time */
-    p->avt      = schedule_data[p->processor].svt;
-    p->evt      = schedule_data[p->processor].svt;
-    /* RN: XXX BVT fill in other bits */
+    p->state       = TASK_UNINTERRUPTIBLE;
+    p->mcu_advance = 10;
+
+    if (p->domain == IDLE_DOMAIN_ID) {
+        p->avt = 0xffffffff;
+        p->evt = 0xffffffff;
+        schedule_data[p->processor].idle = p;
+    } else {
+        /* set avt and evt to system virtual time */
+        p->avt = schedule_data[p->processor].svt;
+        p->evt = schedule_data[p->processor].svt;
+        /* RN: XXX BVT fill in other bits */
+    }
 }
 
 void sched_rem_domain(struct task_struct *p)
@@ -117,16 +125,20 @@ int wake_up(struct task_struct *p)
 {
     unsigned long flags;
     int ret = 0;
+
     spin_lock_irqsave(&schedule_data[p->processor].lock, flags);
+
     if ( __task_on_runqueue(p) ) goto out;
-    p->state = TASK_RUNNING;
 
-    /* set the BVT parameters */
-    if (p->avt < schedule_data[p->processor].svt)
-        p->avt = schedule_data[p->processor].svt;
-    p->evt = p->avt;    /* RN: XXX BVT deal with warping here */
-
+    p->state = TASK_RUNNING;
     __add_to_runqueue_head(p);
+
+    /* set the BVT parameters */
+    if (p->avt < schedule_data[p->processor].svt)
+        p->avt = schedule_data[p->processor].svt;
+
+    p->evt = p->avt;    /* RN: XXX BVT deal with warping here */
+
     ret = 1;
 
  out:
@@ -134,30 +146,56 @@ int wake_up(struct task_struct *p)
     return ret;
 }
 
-/* RN: XXX turn this into do_halt() */
 /****************************************************************************
  * Domain requested scheduling operations
 ****************************************************************************/
 long do_sched_op(void)
 {
+    /* XXX implement properly */
     current->state = TASK_INTERRUPTIBLE;
     schedule();
     return 0;
 }
 
+/****************************************************************************
+ * Control the scheduler
+ ****************************************************************************/
+long sched_bvtctl(unsigned long c_allow)
+{
+    printk("sched: bvtctl %lu\n", c_allow);
+    ctx_allow = c_allow;
+    return 0;
+}
+
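
mcu_advance is the inverse of a domain's weight: every MCU (100us) of real CPU time advances the domain's virtual time by mcu_advance, so smaller values mean a slower virtual clock and a larger CPU share; sched_adjdom() below is what updates it. A freestanding sketch of the charging rule schedule() applies, with invented numbers:

    #define MCU_NS 100000L   /* 100us, matching MCU above */

    /* Charge 'ranfor_ns' of CPU time: ceil(ranfor/MCU) MCUs, each worth
     * mcu_advance units of virtual time (as in schedule()). */
    unsigned int charge_avt(unsigned int avt, unsigned long mcu_advance,
                            long ranfor_ns)
    {
        long mcus = ranfor_ns / MCU_NS;
        if (ranfor_ns % MCU_NS) mcus++;          /* always round up */
        return avt + mcus * mcu_advance;
    }

    /* e.g. 1.05ms at mcu_advance=10 charges 11 MCUs, advancing avt by 110;
     * at mcu_advance=5 the same run advances avt by only 55, so that domain
     * regains the lowest evt sooner. */
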
 /****************************************************************************
  * Adjust scheduling parameter for a given domain
  ****************************************************************************/
 long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp,
-                  unsigned long warpl, unsigned long warpu)
+                  unsigned long warpl, unsigned long warpu)
 {
-    printk("sched: adjdom %02d %lu %lu %lu %lu\n",
-           dom, mcu_adv, warp, warpl, warpu);
-    return 0;
+    struct task_struct *p;
+
+    printk("sched: adjdom %02d %lu %lu %lu %lu\n",
+           dom, mcu_adv, warp, warpl, warpu);
+
+    p = find_domain_by_id(dom);
+    if ( p == NULL ) return -ESRCH;
+
+    spin_lock_irq(&schedule_data[p->processor].lock);
+
+    p->mcu_advance = mcu_adv;
+
+    spin_unlock_irq(&schedule_data[p->processor].lock);
+
+    return 0;
 }
 
 /****************************************************************************
  * cause a run through the scheduler when appropriate
+ * Appropriate is:
+ * - current task is idle task
+ * - the new process's evt is lower than the current one's
+ * - the current task already ran for its context switch allowance
  ****************************************************************************/
 void reschedule(struct task_struct *p)
 {
@@ -166,16 +204,20 @@ void reschedule(struct task_struct *p)
     unsigned long flags;
 
     if (p->has_cpu)
-        return;
+        return;
 
     spin_lock_irqsave(&schedule_data[cpu].lock, flags);
+
     curr = schedule_data[cpu].curr;
-    if (is_idle_task(curr)) {
+
+    if ( is_idle_task(curr) ||
+         (p->evt < curr->evt) ||
+         (curr->lastschd + ctx_allow >= NOW()) ) {
+        /* reschedule */
         set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
         spin_unlock_irqrestore(&schedule_data[cpu].lock, flags);
 #ifdef CONFIG_SMP
         if (cpu != smp_processor_id())
-            smp_send_event_check_cpu(cpu);
+            smp_send_event_check_cpu(cpu);
 #endif
     } else {
         spin_unlock_irqrestore(&schedule_data[cpu].lock, flags);
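
reschedule() now preempts when any of the three listed conditions holds. Written as a standalone predicate it reads as below; note the third test mirrors the patch exactly (lastschd + ctx_allow >= NOW()), even though the comment above describes the task having already used up its allowance:

    /* Sketch: the wake-up preemption test from reschedule(). */
    static int should_preempt(struct task_struct *curr, struct task_struct *p,
                              s_time_t now, s32 ctx_allow)
    {
        return is_idle_task(curr)                    /* anything beats idle  */
            || (p->evt < curr->evt)                  /* earlier virtual time */
            || (curr->lastschd + ctx_allow >= now);  /* as in the patch      */
    }
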
@@ -194,27 +236,26 @@
 asmlinkage void schedule(void)
 {
     struct task_struct *prev, *next, *next_prime, *p;
     struct list_head *tmp;
-    int this_cpu;
-    s_time_t now;
-    s32 r_time;     /* time for new dom to run */
-    s32 ranfor;     /* assume we never run longer than 2.1s! */
-    s32 mcus;
-    u32 next_evt, next_prime_evt;
-
-    perfc_incrc(sched_run1);
+    int                 this_cpu;
+    s_time_t            now;
+    s32                 r_time;  /* time for new dom to run */
+    s32                 ranfor;  /* assume we never run longer than 2.1s! */
+    s32                 mcus;
+    u32                 next_evt, next_prime_evt, min_avt;
+
+    perfc_incrc(sched_run1);
 
  need_resched_back:
-    perfc_incrc(sched_run2);
-
-    now = NOW();
-
-    /* remove timer */
-    rem_ac_timer(&schedule_data[smp_processor_id()].s_timer);
+    perfc_incrc(sched_run2);
 
+    now      = NOW();
     next     = NULL;
     prev     = current;
     this_cpu = prev->processor;
 
-    /*
+    /* remove timer */
+    rem_ac_timer(&schedule_data[this_cpu].s_timer);
+
+    /*
      * deschedule the current domain
      */
 
@@ -223,95 +264,115 @@ asmlinkage void schedule(void)
     ASSERT(!in_interrupt());
     ASSERT(__task_on_runqueue(prev));
 
-    if (is_idle_task(prev))
-        goto deschedule_done;
+    if (is_idle_task(prev))
+        goto deschedule_done;
 
-    /* do some accounting */
-    ranfor = (s32)(now - prev->lastschd);
+    /* do some accounting */
+    ranfor = (s32)(now - prev->lastschd);
     ASSERT((ranfor>0));
-    prev->cpu_time += ranfor;
-
-    /* calculate mcu and update avt */
-    mcus = ranfor/MCU;
-    if (ranfor % MCU) mcus ++;  /* always round up */
-    prev->avt += mcus * prev->mcu_advance;
-    prev->evt = prev->avt;      /* RN: XXX BVT deal with warping here */
-
-    /* dequeue */
-    __del_from_runqueue(prev);
-    switch (prev->state) {
-    case TASK_INTERRUPTIBLE:
-        if (signal_pending(prev)) {
-            prev->state = TASK_RUNNING; /* but has events pending */
-            break;
-        }
-    case TASK_UNINTERRUPTIBLE:
-    case TASK_WAIT:
-    case TASK_DYING:
-    default:
-        /* done if not running. Else, continue */
-        goto deschedule_done;
-    case TASK_RUNNING:;
-    }
-
-    /* requeue */
-    __add_to_runqueue_tail(prev);
-
+    prev->cpu_time += ranfor;
+
+    /* calculate mcu and update avt */
+    mcus = ranfor/MCU;
+    if (ranfor % MCU) mcus ++;  /* always round up */
+    prev->avt += mcus * prev->mcu_advance;
+    prev->evt = prev->avt;      /* RN: XXX BVT deal with warping here */
+
+    /* dequeue */
+    __del_from_runqueue(prev);
+    switch (prev->state) {
+    case TASK_INTERRUPTIBLE:
+        if (signal_pending(prev)) {
+            prev->state = TASK_RUNNING; /* but has events pending */
+            break;
+        }
+    case TASK_UNINTERRUPTIBLE:
+    case TASK_WAIT:
+    case TASK_DYING:
+    default:
+        /* done if not running. Else, continue */
+        goto deschedule_done;
+    case TASK_RUNNING:;
+    }
+
+    /* requeue */
+    __add_to_runqueue_tail(prev);
+
  deschedule_done:
     clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events);
 
-    /*
+    /*
      * Pick a new domain
      */
 
-    /* we should at least have the idle task */
-    ASSERT(!list_empty(&schedule_data[smp_processor_id()].runqueue));
+    /* we should at least have the idle task */
+    ASSERT(!list_empty(&schedule_data[this_cpu].runqueue));
 
-    /*
+    /*
      * scan through the run queue and pick the task with the lowest evt
      * *and* the task the second lowest evt.
-     * this code is O(n) but we expect n to be small.
+     * this code is O(n) but we expect n to be small.
      */
-    next       = NULL;
-    next_prime = NULL;
-
-    next_evt       = 0xffffffff;
-    next_prime_evt = 0xffffffff;
-
-    list_for_each(tmp, &schedule_data[smp_processor_id()].runqueue) {
-        p = list_entry(tmp, struct task_struct, run_list);
-        if (p->evt < next_evt) {
-            next_prime     = next;
-            next_prime_evt = next_evt;
-            next           = p;
-            next_evt       = p->evt;
-        }
-    }
-    ASSERT(next != NULL);   /* we should have at least the idle task */
-
-    if (next == NULL || is_idle_task(next)) {
-        next = &idle0_task; /* to be sure */
-        r_time = CTX_ALLOW;
-        goto sched_done;
-    }
-
-    if (next_prime == NULL || is_idle_task(next_prime)) {
-        /* we have only one runable task besides the idle task */
-        r_time = CTX_ALLOW;  /* RN: XXX should be much larger */
-        goto sched_done;
-    }
-
+    next       = schedule_data[this_cpu].idle;
+    next_prime = NULL;
+
+    next_evt       = 0xffffffff;
+    next_prime_evt = 0xffffffff;
+    min_avt        = 0xffffffff;    /* to calculate svt */
+
+
+    list_for_each(tmp, &schedule_data[this_cpu].runqueue) {
+        p = list_entry(tmp, struct task_struct, run_list);
+        if (p->evt < next_evt) {
+            next_prime     = next;
+            next_prime_evt = next_evt;
+            next           = p;
+            next_evt       = p->evt;
+        } else if (next_prime_evt == 0xffffffff) {
+            next_prime_evt = p->evt;
+            next_prime     = p;
+        } else if (p->evt < next_prime_evt) {
+            next_prime_evt = p->evt;
+            next_prime     = p;
+        }
+        /* determine system virtual time */
+        if (p->avt < min_avt)
+            min_avt = p->avt;
+    }
+    ASSERT(next != NULL);   /* we should have at least the idle task */
+
+    /* update system virtual time */
+    if (min_avt != 0xffffffff) schedule_data[this_cpu].svt = min_avt;
+
+    if (is_idle_task(next)) {
+        r_time = ctx_allow;
+        goto sched_done;
+    }
+
+    if (next_prime == NULL || is_idle_task(next_prime)) {
+        /* we have only one runnable task besides the idle task */
+        r_time = 10 * ctx_allow;    /* RN: random constant */
+        goto sched_done;
+    }
 
-    /*
+    /*
      * if we are here we have two runable tasks.
-     * work out how long 'next' can run till its evt is greater than
+     * work out how long 'next' can run till its evt is greater than
      * 'next_prime's evt. Taking context switch allowance into account.
      */
-    r_time = ((next_prime->evt - next->evt)/next->mcu_advance) + CTX_ALLOW;
+    ASSERT(next_prime->evt > next->evt);
+    r_time = ((next_prime->evt - next->evt)/next->mcu_advance) + ctx_allow;
 
  sched_done:
-    ASSERT(r_time != 0);
-    ASSERT(r_time > 0);
+    ASSERT(r_time != 0);
+    ASSERT(r_time > ctx_allow);
+
+    if ( (r_time==0) || (r_time < ctx_allow)) {
+        printk("[%02d]: %lx\n", this_cpu, r_time);
+        dump_rqueue(&schedule_data[this_cpu].runqueue, "foo");
+    }
+
     prev->has_cpu = 0;
     next->has_cpu = 1;
 
@@ -319,16 +380,16 @@ asmlinkage void schedule(void)
     schedule_data[this_cpu].prev = prev;
     schedule_data[this_cpu].curr = next;
 
-    next->lastschd = now;
+    next->lastschd = now;
 
-    /* reprogramm the timer */
+    /* reprogram the timer */
  timer_redo:
-    schedule_data[this_cpu].s_timer.expires  = now + r_time;
-    if (add_ac_timer(&schedule_data[this_cpu].s_timer) == 1) {
-        printk("SCHED: Shit this shouldn't happen\n");
-        now = NOW();
-        goto timer_redo;
-    }
+    schedule_data[this_cpu].s_timer.expires = now + r_time;
+    if (add_ac_timer(&schedule_data[this_cpu].s_timer) == 1) {
+        printk("SCHED[%02d]: Shit this shouldn't happen\n", this_cpu);
+        now = NOW();
+        goto timer_redo;
+    }
 
     spin_unlock_irq(&schedule_data[this_cpu].lock);
 
@@ -339,6 +400,8 @@ asmlinkage void schedule(void)
         goto same_process;
     }
 
+    perfc_incrc(sched_ctx);
+
     prepare_to_switch();
     switch_to(prev, next);
     prev = schedule_data[this_cpu].prev;
@@ -347,12 +410,12 @@ asmlinkage void schedule(void)
     if ( prev->state == TASK_DYING ) release_task(prev);
 
  same_process:
-    /* update the domains notion of time */
+    /* update the domain's notion of time */
     update_dom_time(current->shared_info);
 
     if ( test_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events) ) {
         goto need_resched_back;
-    }
+    }
     return;
 }
 
 /*
@@ -361,11 +424,11 @@
  */
 static void sched_timer(unsigned long foo)
 {
-    int cpu = smp_processor_id();
+    int                 cpu  = smp_processor_id();
     struct task_struct *curr = schedule_data[cpu].curr;
-    /* cause a reschedule */
-    set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
-    perfc_incrc(sched_irq);
+    /* cause a reschedule */
+    set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
+    perfc_incrc(sched_irq);
 }
 
 /*
@@ -373,23 +436,23 @@
  */
 static void virt_timer(unsigned long foo)
 {
-    unsigned long cpu_mask = 0;
-    struct task_struct *p;
-    s_time_t now;
-    int res;
-
-    /* send virtual timer interrupt */
-    read_lock(&tasklist_lock);
-    p = &idle0_task;
-    do {
-        if ( is_idle_task(p) ) continue;
-        cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
-    }
-    while ( (p = p->next_task) != &idle0_task );
-    read_unlock(&tasklist_lock);
-    guest_event_notify(cpu_mask);
-
- again:
+    unsigned long       cpu_mask = 0;
+    struct task_struct *p;
+    s_time_t            now;
+    int                 res;
+
+    /* send virtual timer interrupt */
+    read_lock(&tasklist_lock);
+    p = &idle0_task;
+    do {
+        if ( is_idle_task(p) ) continue;
+        cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
+    }
+    while ( (p = p->next_task) != &idle0_task );
+    read_unlock(&tasklist_lock);
+    guest_event_notify(cpu_mask);
+
+ again:
     now = NOW();
     v_timer.expires = now + MILLISECS(10);
     res=add_ac_timer(&v_timer);
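
The slice computation in schedule() above gives 'next' just enough time for its evt to catch up with next_prime's, plus the context switch allowance, before s_timer fires and forces a re-run. The same expression as a standalone helper, with invented numbers in the comment:

    #include <stdint.h>

    /* Sketch of the BVT slice from schedule(): with next_evt = 100,
     * next_prime_evt = 160 and mcu_advance = 10, the gap closes after
     * (160 - 100) / 10 = 6 units, and ctx_allow is added on top. */
    static int32_t bvt_slice(uint32_t next_evt, uint32_t next_prime_evt,
                             unsigned long mcu_advance, int32_t ctx_allow)
    {
        return (int32_t)((next_prime_evt - next_evt) / mcu_advance) + ctx_allow;
    }
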
@@ -412,14 +475,15 @@ void __init scheduler_init(void)
         spin_lock_init(&schedule_data[i].lock);
         schedule_data[i].prev = &idle0_task;
         schedule_data[i].curr = &idle0_task;
-        
+
         /* a timer for each CPU */
         init_ac_timer(&schedule_data[i].s_timer);
         schedule_data[i].s_timer.function = &sched_timer;
     }
-    init_ac_timer(&v_timer);
-    v_timer.function = &virt_timer;
+
+    schedule_data[0].idle = &idle0_task; /* idle on CPU 0 is special */
+    init_ac_timer(&v_timer);
+    v_timer.function = &virt_timer;
 }
 
 /*
@@ -427,46 +491,14 @@
  * Start a scheduler for each CPU
  * This has to be done *after* the timers, e.g., APICs, have been initialised
  */
 void schedulers_start(void)
-{   
+{
     printk("Start schedulers\n");
     __cli();
     sched_timer(0);
-    virt_timer(0);
+    virt_timer(0);
     smp_call_function((void *)sched_timer, NULL, 1, 1);
     __sti();
-
-    //add_key_handler('r', dump_run_queues, "dump run queues")
 }
 
-#if 0
-/****************************************************************************
- * Debugging functions
- ****************************************************************************/
-static void dump_run_queues(u_char key, void *dev_id, struct pt_regs *regs)
-{
-    u_long   flags;
-    struct task_struct *p;
-    shared_info_t *s;
-
-    printk("'%c' pressed -> dumping run queues\n", key);
-    read_lock_irqsave(&tasklist_lock, flags);
-    p = &idle0_task;
-    do {
-        printk("Xen: DOM %d, CPU %d [has=%c], state = %s, "
-               "hyp_events = %08x\n",
-               p->domain, p->processor, p->has_cpu ? 'T':'F',
-               task_states[p->state], p->hyp_events);
-        s = p->shared_info;
-        if(!is_idle_task(p)) {
-            printk("Guest: events = %08lx, event_enable = %08lx\n",
-                   s->events, s->events_enable);
-            printk("Notifying guest...\n");
-            set_bit(_EVENT_DEBUG, &s->events);
-        }
-    } while ( (p = p->next_task) != &idle0_task );
-
-    read_unlock_irqrestore(&tasklist_lock, flags);
-}
-#endif
 
 
 /****************************************************************************
@@ -533,3 +565,47 @@ long schedule_timeout(long timeout)
  out:
     return timeout < 0 ? 0 : timeout;
 }
+
+/****************************************************************************
+ * debug function
+ ****************************************************************************/
+
+static void dump_rqueue(struct list_head *queue, char *name)
+{
+    struct list_head   *list;
+    int                 loop = 0;
+    struct task_struct *p;
+
+    printk ("QUEUE %s %lx   n: %lx, p: %lx\n", name, (unsigned long)queue,
+            (unsigned long) queue->next, (unsigned long) queue->prev);
+    list_for_each (list, queue) {
+        p = list_entry(list, struct task_struct, run_list);
+        printk("%3d: %3d has=%c mcua=0x%04X ev=0x%08X av=0x%08X c=0x%X%08X\n",
+               loop++, p->domain,
+               p->has_cpu ? 'T':'F',
+               p->mcu_advance, p->evt, p->avt,
+               (u32)(p->cpu_time>>32), (u32)p->cpu_time);
+        printk("         l: %lx n: %lx  p: %lx\n",
+               (unsigned long)list, (unsigned long)list->next,
+               (unsigned long)list->prev);
+    }
+    return;
+}
+
+void dump_runq(u_char key, void *dev_id, struct pt_regs *regs)
+{
+    u_long   flags;
+    s_time_t now = NOW();
+    int      i;
+
+    printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns NOW=0x%08X%08X\n",
+           (u32)MCU, (u32)ctx_allow, (u32)(now>>32), (u32)now);
+    for (i = 0; i < smp_num_cpus; i++) {
+        spin_lock_irqsave(&schedule_data[i].lock, flags);
+        printk("CPU[%02d] svt=0x%08X ", i, (s32)schedule_data[i].svt);
+        dump_rqueue(&schedule_data[i].runqueue, "rq");
+        spin_unlock_irqrestore(&schedule_data[i].lock, flags);
+    }
+    return;
+}
+
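
The sched.h hunk below keeps the warning that the first task_struct fields have their offsets hard-coded in entry.S (offsets 00, 04, 08, 12, 16). Trees of this vintage had no static_assert; a common guard is a compile-time check via a negative array size, sketched here as a hypothetical addition:

    #include <stddef.h>

    /* Fails to compile if a hard-coded entry.S offset drifts. */
    #define OFFSET_CHECK(tag, type, field, off) \
        typedef char tag[(offsetof(type, field) == (off)) ? 1 : -1]

    /* e.g., for the layout documented below:
     *     OFFSET_CHECK(chk_processor,  struct task_struct, processor,   0);
     *     OFFSET_CHECK(chk_state,      struct task_struct, state,       4);
     *     OFFSET_CHECK(chk_hyp_events, struct task_struct, hyp_events,  8);
     *     OFFSET_CHECK(chk_domain,     struct task_struct, domain,     12);
     */
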
diff --git a/xen/include/xeno/sched.h b/xen/include/xeno/sched.h
index b636c36f31..dbbf6a927e 100644
--- a/xen/include/xeno/sched.h
+++ b/xen/include/xeno/sched.h
@@ -63,20 +63,20 @@ extern struct mm_struct init_mm;
 
 struct task_struct {
 
-    /*
+    /*
      * DO NOT CHANGE THE ORDER OF THE FOLLOWING.
      * There offsets are hardcoded in entry.S
      */
 
     int processor;               /* 00: current processor */
-    int state;                   /* 04: current run state */
-    int hyp_events;              /* 08: pending events */
+    int state;                   /* 04: current run state */
+    int hyp_events;              /* 08: pending events */
     unsigned int domain;         /* 12: domain id */
 
     /* An unsafe pointer into a shared data area. */
     shared_info_t *shared_info;  /* 16: shared data area */
 
-    /*
+    /*
      * From here on things can be added and shuffled without special attention
      */
 
@@ -84,25 +84,25 @@ struct task_struct {
     unsigned int tot_pages;     /* number of pages currently possesed */
     unsigned int max_pages;     /* max number of pages that can be possesed */
 
-    /* scheduling */
-    struct list_head run_list;      /* the run list */
-    int              has_cpu;
-    int              policy;
-    int              counter;
+    /* scheduling */
+    struct list_head run_list;  /* the run list */
+    int              has_cpu;
+    int              policy;
+    int              counter;
 
-    struct ac_timer  blt;           /* blocked timeout */
+    struct ac_timer  blt;       /* blocked timeout */
 
-    s_time_t         lastschd;      /* time this domain was last scheduled */
-    s_time_t         cpu_time;      /* total CPU time received till now */
+    s_time_t         lastschd;  /* time this domain was last scheduled */
+    s_time_t         cpu_time;  /* total CPU time received till now */
 
-    long             mcu_advance;   /* inverse of weight */
-    u32              avt;           /* actual virtual time */
-    u32              evt;           /* effective virtual time */
-    long             warp;          /* virtual time warp */
-    long             warpl;         /* warp limit */
-    long             warpu;         /* unwarp time requirement */
-    long             warped;        /* time it ran warped last time */
-    long             uwarped;       /* time it ran unwarped last time */
+    unsigned long    mcu_advance;   /* inverse of weight */
+    s32              avt;           /* actual virtual time */
+    s32              evt;           /* effective virtual time */
+    long             warp;          /* virtual time warp */
+    long             warpl;         /* warp limit */
+    long             warpu;         /* unwarp time requirement */
+    long             warped;        /* time it ran warped last time */
+    long             uwarped;       /* time it ran unwarped last time */
 
 
     /* Network I/O */
@@ -119,7 +119,7 @@ struct task_struct {
     segment_t *segment_list[XEN_MAX_SEGMENTS];  /* vhd */
     int segment_count;
 
-    /* VM */
+    /* VM */
     struct mm_struct mm;
     /* We need this lock to check page types and frob reference counts. */
     spinlock_t page_lock;
@@ -158,7 +158,7 @@ struct task_struct {
 #define TASK_RUNNING         0
 #define TASK_INTERRUPTIBLE   1
 #define TASK_UNINTERRUPTIBLE 2
-#define TASK_WAIT            4
+#define TASK_WAIT            4
 #define TASK_DYING          16
 /* #define TASK_STOPPED 8 not really used */
 
@@ -172,8 +172,8 @@ struct task_struct {
     domain:      IDLE_DOMAIN_ID, \
     state:       TASK_RUNNING,   \
     has_cpu:     0,              \
-    evt:         0x7fffffff,     \
-    avt:         0x7fffffff,     \
+    evt:         0xffffffff,     \
+    avt:         0xffffffff,     \
     mm:          IDLE0_MM,       \
     addr_limit:  KERNEL_DS,      \
     active_mm:   &idle0_task.mm, \
@@ -186,7 +186,7 @@ struct task_struct {
 #define is_idle_task(_p) ((_p)->domain == IDLE_DOMAIN_ID)
 
 #ifndef IDLE0_TASK_SIZE
-#define IDLE0_TASK_SIZE 2048*sizeof(long)
+#define IDLE0_TASK_SIZE 2048*sizeof(long)
 #endif
 
 union task_union {
@@ -235,8 +235,9 @@ void scheduler_init(void);
 void schedulers_start(void);
 void sched_add_domain(struct task_struct *p);
 void sched_rem_domain(struct task_struct *p);
+long sched_bvtctl(unsigned long ctx_allow);
 long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp,
-                  unsigned long warpl, unsigned long warpu);
+                  unsigned long warpl, unsigned long warpu);
 int wake_up(struct task_struct *p);
 long schedule_timeout(long timeout);
 long do_yield(void);
--
cgit v1.2.3
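
A closing note on the idle task's parameters: IDLE0_TASK's evt/avt move from 0x7fffffff to 0xffffffff, and sched_add_domain() plants the same value for idle domains. Since schedule() compares evt as an unsigned 32-bit quantity, 0xffffffff is the sentinel that can never win; a tiny self-contained demonstration of what is presumably the motivation for the change:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t idle_evt = 0xffffffffu;  /* idle: must always lose  */
        uint32_t dom_evt  = 0x80000000u;  /* a domain past INT32_MAX */

        /* under the old 0x7fffffff sentinel this domain would have
         * sorted after idle; under UINT32_MAX it correctly runs first */
        printf("domain beats idle: %s\n", dom_evt < idle_evt ? "yes" : "no");
        return 0;
    }
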