diff options
-rwxr-xr-x | .bk-to-hg | 2 | ||||
-rwxr-xr-x | .hg-to-bk | 3 | ||||
-rw-r--r-- | .rootkeys | 3 | ||||
-rw-r--r-- | tools/domain_builder/dom0_ops.h | 81 | ||||
-rw-r--r-- | xen/arch/i386/apic.c | 8 | ||||
-rw-r--r-- | xen/common/ac_timer.c | 166 | ||||
-rw-r--r-- | xen/common/dom0_ops.c | 28 | ||||
-rw-r--r-- | xen/common/domain.c | 18 | ||||
-rw-r--r-- | xen/common/keyhandler.c | 15 | ||||
-rw-r--r-- | xen/common/schedule.c | 556 | ||||
-rw-r--r-- | xen/include/xeno/ac_timer.h | 10 | ||||
-rw-r--r-- | xen/include/xeno/dom0_ops.h | 21 | ||||
-rw-r--r-- | xen/include/xeno/perfc_defn.h | 13 | ||||
-rw-r--r-- | xen/include/xeno/sched.h | 58 | ||||
-rw-r--r-- | xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/Makefile | 2 | ||||
-rw-r--r-- | xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h | 33 | ||||
-rw-r--r-- | xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/sched_ops.c | 137 |
17 files changed, 814 insertions, 340 deletions
@@ -2,5 +2,7 @@ set -e test -L old/xenolinux-2.4.16-sparse/include/asm-xeno/hypervisor-ifs rm old/xenolinux-2.4.16-sparse/include/asm-xeno/hypervisor-ifs +test -L tools/domain_builder/dom0_ops.h +rm tools/domain_builder/dom0_ops.h (find -depth -type d -print | xargs -r rmdir 2>/dev/null) || true exit 0 @@ -5,5 +5,8 @@ mkdir -p old/xenolinux-2.4.16-sparse mkdir -p old/xenolinux-2.4.16-sparse/include mkdir -p old/xenolinux-2.4.16-sparse/include/asm-xeno ln -s ../../../xen-2.4.16/include/hypervisor-ifs old/xenolinux-2.4.16-sparse/include/asm-xeno/hypervisor-ifs +mkdir -p tools +mkdir -p tools/domain_builder +ln -s ../../xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h tools/domain_builder/dom0_ops.h (find -depth -type d -print | xargs -r rmdir 2>/dev/null) || true exit 0 @@ -182,7 +182,7 @@ 3e4d00468-FN2VDeEHo96zxrMHK_mA tools/domain_builder/Makefile 3e4d0046SPau_y0sw2WLJz8QkqNoRA tools/domain_builder/README 3e4d0046bbdH0GsI9J_1Eb4ZQHfIiQ tools/domain_builder/dom0_defs.h -3e4d0046RgYCfGOw6qGz_7kYLMV2Vw tools/domain_builder/dom0_ops.h +3e71f9b871pvOAxDrhxpC4N4mHkbww tools/domain_builder/dom0_ops.h 3e4d0046ouLij_CMN_j7-dUHZIBI_A tools/domain_builder/dom_builder.c 3e4d0046EKs06fY0CWDEgZQcn7DYUg tools/domain_builder/dom_kill.c 3e4d0046aPbGiRTtdWxqY5b3ytWurA tools/domain_builder/hypervisor_defs.h @@ -487,6 +487,7 @@ 3e5a4e65BXtftInNHUC2PjDfPhdZZA xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_core.c 3e5a4e65uXAx05p6B1-HU2tijuw8qA xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_memory.c 3e5a4e65EOOLlPwXnhSuX-iVdWLmnA xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h +3e6dba59C8o0kBks7UZ4IW_FY853Aw xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/sched_ops.c 3e5a4e65gfn_ltB8ujHMVFApnTTNRQ xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/vfr.c 3e5a4e65gZBRBB6RsSVg1c9iahigAw xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/network/Makefile 3e5a4e65ZxKrbFetVB84JhrTyZ1YuQ 
xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/network/network.c diff --git a/tools/domain_builder/dom0_ops.h b/tools/domain_builder/dom0_ops.h deleted file mode 100644 index 6c60a93ff6..0000000000 --- a/tools/domain_builder/dom0_ops.h +++ /dev/null @@ -1,81 +0,0 @@ -/****************************************************************************** - * dom0_ops.h - * - * Process command requests from domain-0 guest OS. - * - * Copyright (c) 2002, K A Fraser, B Dragovic - */ - -#define DOM0_NEWDOMAIN 0 -#define DOM0_KILLDOMAIN 1 -#define DOM0_GETMEMLIST 2 -#define DOM0_STARTDOM 4 -#define MAP_DOM_MEM 6 /* Not passed down to Xen */ -#define DO_PGUPDATES 7 /* Not passed down to Xen */ -#define MAX_CMD 8 - -#define MAX_CMD_LEN 256 - -typedef struct dom0_newdomain_st -{ - unsigned int domain; - unsigned int memory_kb; - unsigned int num_vifs; // temporary - unsigned long pg_head; // return parameter -} dom0_newdomain_t; - -typedef struct dom0_killdomain_st -{ - unsigned int domain; - int force; -} dom0_killdomain_t; - -typedef struct dom0_getmemlist_st -{ - unsigned long start_pfn; - unsigned long num_pfns; - void *buffer; -} dom0_getmemlist_t; - -/* This is entirely processed by XenoLinux */ -typedef struct dom_mem -{ - unsigned int domain; - unsigned long vaddr; - unsigned long start_pfn; - int tot_pages; -} dom_mem_t; - -/* This is entirely processed by XenoLinux */ -typedef struct dom_pgupdate -{ - unsigned long pgt_update_arr; - unsigned long num_pgt_updates; -} dom_pgupdate_t; - -typedef struct domain_launch -{ - unsigned int domain; - unsigned long l2_pgt_addr; - unsigned long virt_load_addr; - unsigned long virt_shinfo_addr; - unsigned long virt_startinfo_addr; - unsigned int num_vifs; - char cmd_line[MAX_CMD_LEN]; -} dom_meminfo_t; - -typedef struct dom0_op_st -{ - unsigned long cmd; - union - { - dom0_newdomain_t newdomain; - dom0_killdomain_t killdomain; - dom0_getmemlist_t getmemlist; - dom_mem_t dommem; - dom_pgupdate_t pgupdate; - dom_meminfo_t meminfo; - } - 
u; -} dom0_op_t; - diff --git a/xen/arch/i386/apic.c b/xen/arch/i386/apic.c index 865a279d8c..0acf7067c3 100644 --- a/xen/arch/i386/apic.c +++ b/xen/arch/i386/apic.c @@ -659,6 +659,13 @@ int reprogram_ac_timer(s_time_t timeout) s_time_t expire; u64 apic_tmict; + if (timeout == 0) { + /* XXX RN: not sure if this disables it or cause interruptto + * go off imediately */ + apic_tmict = 0; + goto reprogram; + } + now = NOW(); expire = timeout - now; /* value from now */ @@ -680,6 +687,7 @@ int reprogram_ac_timer(s_time_t timeout) return 0; } + reprogram: /* programm timer */ apic_write(APIC_TMICT, (unsigned long)apic_tmict); diff --git a/xen/common/ac_timer.c b/xen/common/ac_timer.c index dc70de4e0c..73ac893e08 100644 --- a/xen/common/ac_timer.c +++ b/xen/common/ac_timer.c @@ -23,7 +23,6 @@ #include <xeno/errno.h> #include <xeno/sched.h> #include <xeno/lib.h> -#include <xeno/config.h> #include <xeno/smp.h> #include <xeno/perfc.h> @@ -41,10 +40,10 @@ #define TRC(_x) #endif -/* +/***************************************************************************** * We pull handlers off the timer list this far in future, * rather than reprogramming the time hardware. - */ + *****************************************************************************/ #define TIMER_SLOP (50*1000) /* ns */ /* A timer list per CPU */ @@ -58,30 +57,29 @@ static ac_timers_t ac_timers[NR_CPUS]; /* local prototypes */ static int detach_ac_timer(struct ac_timer *timer); -/*static void ac_timer_debug(unsigned long);*/ -/* + +/***************************************************************************** * add a timer. 
* return value: * 0: success * 1: failure, timer in the past or timeout value to small * -1: failure, timer uninitialised * fail - */ + *****************************************************************************/ int add_ac_timer(struct ac_timer *timer) { - int cpu = smp_processor_id(); - unsigned long flags; - s_time_t now; + int cpu = smp_processor_id(); + unsigned long flags; + s_time_t now; /* make sure timeout value is in the future */ + now = NOW(); - TRC(printk("ACT [%02d] add(): now=%lld timo=%lld\n", - cpu, now, timer->expires)); - if (timer->expires <= now) { - printk("ACT[%02d] add_ac_timer: now=0x%08X%08X > expire=0x%08X%08X\n", - cpu, (u32)(now>>32), (u32)now, - (u32)(timer->expires>>32), (u32)timer->expires); + if (timer->expires <= now) { + TRC(printk("ACT[%02d] add_ac_timer:now=0x%08X%08X>expire=0x%08X%08X\n", + cpu, (u32)(now>>32), (u32)now, + (u32)(timer->expires>>32), (u32)timer->expires)); return 1; } spin_lock_irqsave(&ac_timers[cpu].lock, flags); @@ -90,71 +88,57 @@ int add_ac_timer(struct ac_timer *timer) * reprogramm the timer */ if (list_empty(&ac_timers[cpu].timers)) { - /* Reprogramm and add to head of list */ if (!reprogram_ac_timer(timer->expires)) { - /* failed */ - printk("ACT [%02d] add(): add at head failed\n", cpu); + printk("ACT[%02d] add at head failed\n", cpu); spin_unlock_irqrestore(&ac_timers[cpu].lock, flags); - return 1; + return 1; /* failed */ } list_add(&timer->timer_list, &ac_timers[cpu].timers); - TRC(printk("ACT [%02d] add(0x%08X%08X): added at head\n", cpu, - (u32)(timer->expires>>32), (u32)timer->expires)); } else { struct list_head *pos; - struct ac_timer *t; - for (pos = ac_timers[cpu].timers.next; - pos != &ac_timers[cpu].timers; - pos = pos->next) { + struct ac_timer *t; + + list_for_each(pos, &ac_timers[cpu].timers) { t = list_entry(pos, struct ac_timer, timer_list); if (t->expires > timer->expires) break; } + list_add (&(timer->timer_list), pos->prev); - if (pos->prev == &ac_timers[cpu].timers) { - /* added 
to head, reprogramm timer */ + if (timer->timer_list.prev == &ac_timers[cpu].timers) { + /* added at head */ if (!reprogram_ac_timer(timer->expires)) { - /* failed */ - TRC(printk("ACT [%02d] add(): add at head failed\n", cpu)); + printk("ACT[%02d] add at head failed\n", cpu); + detach_ac_timer(timer); spin_unlock_irqrestore(&ac_timers[cpu].lock, flags); - return 1; + return 1; /* failed */ } - list_add (&(timer->timer_list), pos->prev); - TRC(printk("ACT [%02d] add(0x%08X%08X): added at head\n", cpu, - (u32)(timer->expires>>32), (u32)timer->expires)); - } else { - list_add (&(timer->timer_list), pos->prev); - TRC(printk("ACT [%02d] add(0x%08X%08X): add < exp=0x%08X%08X\n", - cpu, - (u32)(timer->expires>>32), (u32)timer->expires, - (u32)(t->expires>>32), (u32)t->expires)); } } spin_unlock_irqrestore(&ac_timers[cpu].lock, flags); return 0; } -/* - * remove a timer +/***************************************************************************** + * detach a timer (no locking) * return values: * 0: success * -1: bogus timer - */ + *****************************************************************************/ static int detach_ac_timer(struct ac_timer *timer) { - TRC(int cpu = smp_processor_id()); TRC(printk("ACT [%02d] detach(): \n", cpu)); list_del(&timer->timer_list); timer->timer_list.next = NULL; return 0; } -/* +/***************************************************************************** * remove a timer * return values: * 0: success * -1: bogus timer - */ + *****************************************************************************/ int rem_ac_timer(struct ac_timer *timer) { int cpu = smp_processor_id(); @@ -163,19 +147,30 @@ int rem_ac_timer(struct ac_timer *timer) TRC(printk("ACT [%02d] remove(): timo=%lld \n", cpu, timer->expires)); spin_lock_irqsave(&ac_timers[cpu].lock, flags); - if (timer->timer_list.next) - res = detach_ac_timer(timer); + if (timer->timer_list.next) { + res = detach_ac_timer(timer); + + if (timer->timer_list.prev == 
&ac_timers[cpu].timers) { + /* just removed the head */ + if (list_empty(&ac_timers[cpu].timers)) { + reprogram_ac_timer((s_time_t) 0); + } + /* XXX should actaully reprogramm APIC to new head */ + } + } else + res = -1; + spin_unlock_irqrestore(&ac_timers[cpu].lock, flags); return res; } -/* +/***************************************************************************** * modify a timer, i.e., set a new timeout value * return value: * 0: sucess * -1: error - */ + *****************************************************************************/ int mod_ac_timer(struct ac_timer *timer, s_time_t new_time) { if (rem_ac_timer(timer) != 0) @@ -186,10 +181,10 @@ int mod_ac_timer(struct ac_timer *timer, s_time_t new_time) return 0; } -/* +/***************************************************************************** * do_ac_timer * deal with timeouts and run the handlers - */ + *****************************************************************************/ void do_ac_timer(void) { int cpu = smp_processor_id(); @@ -206,15 +201,21 @@ void do_ac_timer(void) /* Sanity: is the timer list empty? */ if ( list_empty(&ac_timers[cpu].timers) ) { - printk("ACT[%02d] do_ac_timer(): timer irq without timer\n", cpu); + /* + * XXX RN: This shouldn't happen, but does! Two possibilities: + * - Race condition between removing and reseting APIC + * - setting an APIC timeout value of 0 causes an immediate + * timer interrupt to fire. + * None of these should be critical! + */ + spin_unlock_irqrestore(&ac_timers[cpu].lock, flags); return; } /* Handle all timeouts in the near future. 
*/ while ( !list_empty(&ac_timers[cpu].timers) ) { - t = list_entry(ac_timers[cpu].timers.next, - struct ac_timer, timer_list); + t = list_entry(ac_timers[cpu].timers.next,struct ac_timer, timer_list); if ( t->expires > (NOW() + TIMER_SLOP) ) break; /* do some stats */ @@ -232,8 +233,7 @@ void do_ac_timer(void) /* If list not empty then reprogram timer to new head of list */ if ( !list_empty(&ac_timers[cpu].timers) ) { - t = list_entry(ac_timers[cpu].timers.next, - struct ac_timer, timer_list); + t = list_entry(ac_timers[cpu].timers.next,struct ac_timer, timer_list); if ( t->expires > 0 ) { TRC(printk("ACT [%02d] do(): reprog timo=%lld\n",cpu,t->expires)); @@ -243,16 +243,18 @@ void do_ac_timer(void) goto do_timer_again; } } + } else { + reprogram_ac_timer((s_time_t) 0); } spin_unlock_irqrestore(&ac_timers[cpu].lock, flags); TRC(printk("ACT [%02d] do(): end\n", cpu)); } -/* +/***************************************************************************** * debug dump_queue * arguments: queue head, name of queue - */ + *****************************************************************************/ static void dump_tqueue(struct list_head *queue, char *name) { struct list_head *list; @@ -272,7 +274,6 @@ static void dump_tqueue(struct list_head *queue, char *name) return; } - void dump_timerq(u_char key, void *dev_id, struct pt_regs *regs) { u_long flags; @@ -304,3 +305,50 @@ void __init ac_timer_init(void) spin_lock_init(&ac_timers[i].lock); } } + +/***************************************************************************** + * GRAVEYARD + *****************************************************************************/ + +#if 0 + +#ifdef AC_TIMER_STATS +#define BUCKETS 1000 +#define MAX_STATS +typedef struct act_stats_st +{ + u32 count; + u32 times[2*(BUCKETS)]; +} __cacheline_aligned act_stats_t; +static act_stats_t act_stats[NR_CPUS]; + +#endif + +#ifdef AC_TIMER_STATS + { + XXX this is at the wrong place + s32 diff; + u32 i; + diff = ((s32)(NOW() - t->expires)) / 
1000; /* delta in us */ + if (diff < -BUCKETS) + diff = -BUCKETS; + else if (diff > BUCKETS) + diff = BUCKETS; + act_stats[cpu].times[diff+BUCKETS]++; + act_stats[cpu].count++; + + if (act_stats[cpu].count >= 5000) { + printk("ACT Stats\n"); + for (i=0; i < 2*BUCKETS; i++) { + if (act_stats[cpu].times[i] != 0) + printk("ACT [%02d]: %3dus: %5d\n", + cpu,i-BUCKETS, act_stats[cpu].times[i]); + act_stats[cpu].times[i]=0; + } + act_stats[cpu].count = 0; + printk("\n"); + } + } +#endif + +#endif /* 0 */ diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c index e451a8f3e7..e6d54e9695 100644 --- a/xen/common/dom0_ops.c +++ b/xen/common/dom0_ops.c @@ -126,6 +126,34 @@ long do_dom0_op(dom0_op_t *u_dom0_op) } break; + case DOM0_BVTCTL: + { + unsigned long ctx_allow = op.u.bvtctl.ctx_allow; + ret = sched_bvtctl(ctx_allow); + + } + break; + + case DOM0_ADJUSTDOM: + { + unsigned int dom = op.u.adjustdom.domain; + unsigned long mcu_adv = op.u.adjustdom.mcu_adv; + unsigned long warp = op.u.adjustdom.warp; + unsigned long warpl = op.u.adjustdom.warpl; + unsigned long warpu = op.u.adjustdom.warpu; + + + if ( dom == IDLE_DOMAIN_ID ) + { + ret = -EPERM; + } + else + { + ret = sched_adjdom(dom, mcu_adv, warp, warpl, warpu); + } + } + break; + case DOM0_GETMEMLIST: { int i; diff --git a/xen/common/domain.c b/xen/common/domain.c index 32bf8b7172..5fc4304c01 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -58,10 +58,11 @@ struct task_struct *do_newdomain(unsigned int dom_id, unsigned int cpu) SET_GDT_ADDRESS(p, DEFAULT_GDT_ADDRESS); p->addr_limit = USER_DS; - p->state = TASK_UNINTERRUPTIBLE; p->active_mm = &p->mm; p->num_net_vifs = 0; + sched_add_domain(p); + p->net_ring_base = (net_ring_t *)(p->shared_info + 1); INIT_LIST_HEAD(&p->pg_head); p->max_pages = p->tot_pages = 0; @@ -115,7 +116,8 @@ void kill_domain(void) } printk("Killing domain %d\n", current->domain); - current->state = TASK_DYING; + + sched_rem_domain(current); schedule(); BUG(); /* never get here */ } 
@@ -293,7 +295,7 @@ int final_setup_guestos(struct task_struct * p, dom_meminfo_t * meminfo) /* set up the shared info structure */ update_dom_time(p->shared_info); - p->shared_info->cpu_freq = cpu_freq; + p->shared_info->cpu_freq = cpu_freq; p->shared_info->domain_time = 0; /* we pass start info struct to guest os as function parameter on stack */ @@ -516,8 +518,8 @@ int setup_guestos(struct task_struct *p, dom0_newdomain_t *params) unmap_domain_mem(l1start); /* Set up shared info area. */ - update_dom_time(p->shared_info); - p->shared_info->cpu_freq = cpu_freq; + update_dom_time(p->shared_info); + p->shared_info->cpu_freq = cpu_freq; p->shared_info->domain_time = 0; @@ -555,7 +557,7 @@ int setup_guestos(struct task_struct *p, dom0_newdomain_t *params) #define SHIP2GUEST(_x) (virt_shinfo_address | (((unsigned long)(_x)) & 0xFFF)) virt_startinfo_address->net_rings = - (net_ring_t *)SHIP2GUEST(p->net_ring_base); + (net_ring_t *)SHIP2GUEST(p->net_ring_base); virt_startinfo_address->num_net_rings = p->num_net_vifs; /* Add block io interface */ @@ -597,7 +599,5 @@ int setup_guestos(struct task_struct *p, dom0_newdomain_t *params) void __init domain_init(void) { - printk("Initialising domains\n"); + printk("Initialising domains\n"); } - - diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c index 19943fff3e..dde9e0ff10 100644 --- a/xen/common/keyhandler.c +++ b/xen/common/keyhandler.c @@ -1,9 +1,6 @@ #include <xeno/keyhandler.h> #include <xeno/reboot.h> -extern void perfc_printall (u_char key, void *dev_id, struct pt_regs *regs); -extern void perfc_reset (u_char key, void *dev_id, struct pt_regs *regs); - #define KEY_MAX 256 #define STR_MAX 64 @@ -117,6 +114,12 @@ void do_task_queues(u_char key, void *dev_id, struct pt_regs *regs) } +extern void perfc_printall (u_char key, void *dev_id, struct pt_regs *regs); +extern void perfc_reset (u_char key, void *dev_id, struct pt_regs *regs); +extern void dump_timerq(u_char key, void *dev_id, struct pt_regs *regs); 
+extern void dump_runq(u_char key, void *dev_id, struct pt_regs *regs); + + void initialize_keytable() { int i; @@ -126,13 +129,15 @@ void initialize_keytable() key_table[i].handler = (key_handler *)NULL; /* setup own handlers */ + add_key_handler('a', dump_timerq, "dump ac_timer queues"); add_key_handler('d', dump_registers, "dump registers"); add_key_handler('h', show_handlers, "show this message"); add_key_handler('p', perfc_printall, "print performance counters"); add_key_handler('P', perfc_reset, "reset performance counters"); add_key_handler('q', do_task_queues, "dump task queues + guest state"); - add_key_handler('B', kill_dom0, "reboot machine gracefully"); - add_key_handler('R', halt_machine, "reboot machine ungracefully"); + add_key_handler('r', dump_runq, "dump run queues"); + add_key_handler('B', kill_dom0, "reboot machine gracefully"); + add_key_handler('R', halt_machine, "reboot machine ungracefully"); return; } diff --git a/xen/common/schedule.c b/xen/common/schedule.c index 787b43d900..ce46069167 100644 --- a/xen/common/schedule.c +++ b/xen/common/schedule.c @@ -11,7 +11,8 @@ * * Environment: Xen Hypervisor * Description: CPU scheduling - * partially moved from domain.c + * implements A Borrowed Virtual Time scheduler. + * (see Duda & Cheriton SOSP'99) * **************************************************************************** * $Id: c-insert.c,v 1.7 2002/11/08 16:04:34 rn Exp $ @@ -28,6 +29,9 @@ #include <xeno/ac_timer.h> #include <xeno/interrupt.h> +#include <xeno/perfc.h> + + #undef SCHEDULER_TRACE #ifdef SCHEDULER_TRACE #define TRC(_x) _x @@ -35,80 +39,106 @@ #define TRC(_x) #endif -/* + +#define MCU (s32)MICROSECS(100) /* Minimum unit */ +static s32 ctx_allow=(s32)MILLISECS(10); /* context switch allowance */ + +/***************************************************************************** * per CPU data for the scheduler. 
- */ + *****************************************************************************/ typedef struct schedule_data_st { - spinlock_t lock; - struct list_head runqueue; - struct task_struct *prev, *curr; + spinlock_t lock; /* lock for protecting this */ + struct list_head runqueue; /* runqueue */ + struct task_struct *prev, *curr; /* previous and current task */ + struct task_struct *idle; /* idle task for this cpu */ + u32 svt; /* system virtual time. per CPU??? */ + struct ac_timer s_timer; /* scheduling timer */ + } __cacheline_aligned schedule_data_t; schedule_data_t schedule_data[NR_CPUS]; -static __cacheline_aligned struct ac_timer s_timer[NR_CPUS]; +struct ac_timer v_timer; /* scheduling timer */ +static void virt_timer(unsigned long foo); -/* - * Some convenience functions - */ -static inline void __add_to_runqueue(struct task_struct * p) +/***************************************************************************** + * Some convenience functions + *****************************************************************************/ +/* add a task to the head of the runqueue */ +static inline void __add_to_runqueue_head(struct task_struct * p) { + list_add(&p->run_list, &schedule_data[p->processor].runqueue); } - -static inline void __move_last_runqueue(struct task_struct * p) +/* add a task to the tail of the runqueue */ +static inline void __add_to_runqueue_tail(struct task_struct * p) { - list_del(&p->run_list); list_add_tail(&p->run_list, &schedule_data[p->processor].runqueue); } -static inline void __move_first_runqueue(struct task_struct * p) -{ - list_del(&p->run_list); - list_add(&p->run_list, &schedule_data[p->processor].runqueue); -} - +/* remove a task from runqueue */ static inline void __del_from_runqueue(struct task_struct * p) { list_del(&p->run_list); p->run_list.next = NULL; } - +/* is task on run queue? 
*/ static inline int __task_on_runqueue(struct task_struct *p) { return (p->run_list.next != NULL); } +#define next_domain(p) \\ + list_entry((p)->run_list.next, struct task_struct, run_list) -/* - * Add a new domain to the scheduler - */ +/****************************************************************************** +* Add and remove a domain +******************************************************************************/ void sched_add_domain(struct task_struct *p) { - p->state = TASK_UNINTERRUPTIBLE; + p->state = TASK_UNINTERRUPTIBLE; + p->mcu_advance = 10; + + if (p->domain == IDLE_DOMAIN_ID) { + p->avt = 0xffffffff; + p->evt = 0xffffffff; + schedule_data[p->processor].idle = p; + } else { + /* set avt end evt to system virtual time */ + p->avt = schedule_data[p->processor].svt; + p->evt = schedule_data[p->processor].svt; + /* RN: XXX BVT fill in other bits */ + } } -/* - * Remove domain to the scheduler - */ void sched_rem_domain(struct task_struct *p) { p->state = TASK_DYING; } -/* +/**************************************************************************** * wake up a domain which had been sleeping - */ + ****************************************************************************/ int wake_up(struct task_struct *p) { unsigned long flags; int ret = 0; + spin_lock_irqsave(&schedule_data[p->processor].lock, flags); + if ( __task_on_runqueue(p) ) goto out; + p->state = TASK_RUNNING; - __add_to_runqueue(p); + __add_to_runqueue_head(p); + + /* set the BVT parameters */ + if (p->avt < schedule_data[p->processor].svt) + p->avt = schedule_data[p->processor].svt; + + p->evt = p->avt; /* RN: XXX BVT deal with warping here */ + ret = 1; out: @@ -116,75 +146,57 @@ int wake_up(struct task_struct *p) return ret; } -static void process_timeout(unsigned long __data) +/**************************************************************************** + * Domain requested scheduling operations + ****************************************************************************/ +long 
do_sched_op(void) { - struct task_struct * p = (struct task_struct *) __data; - wake_up(p); + /* XXX implement proper */ + current->state = TASK_INTERRUPTIBLE; + schedule(); + return 0; } -long schedule_timeout(long timeout) +/**************************************************************************** + * Control the scheduler + ****************************************************************************/ +long sched_bvtctl(unsigned long c_allow) { - struct timer_list timer; - unsigned long expire; - - switch (timeout) - { - case MAX_SCHEDULE_TIMEOUT: - /* - * These two special cases are useful to be comfortable in the caller. - * Nothing more. We could take MAX_SCHEDULE_TIMEOUT from one of the - * negative value but I' d like to return a valid offset (>=0) to allow - * the caller to do everything it want with the retval. - */ - schedule(); - goto out; - default: - /* - * Another bit of PARANOID. Note that the retval will be 0 since no - * piece of kernel is supposed to do a check for a negative retval of - * schedule_timeout() (since it should never happens anyway). You just - * have the printk() that will tell you if something is gone wrong and - * where. - */ - if (timeout < 0) - { - printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx from %p\n", timeout, - __builtin_return_address(0)); - current->state = TASK_RUNNING; - goto out; - } - } - - expire = timeout + jiffies; - - init_timer(&timer); - timer.expires = expire; - timer.data = (unsigned long) current; - timer.function = process_timeout; - - add_timer(&timer); - schedule(); - del_timer_sync(&timer); - - timeout = expire - jiffies; - - out: - return timeout < 0 ? 
0 : timeout; + printk("sched: bvtctl %lu\n", c_allow); + ctx_allow = c_allow; + return 0; } -/* RN: XXX turn this into do_halt() */ -/* - * yield the current process - */ -long do_sched_op(void) +/**************************************************************************** + * Adjust scheduling parameter for a given domain + ****************************************************************************/ +long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp, + unsigned long warpl, unsigned long warpu) { - current->state = TASK_INTERRUPTIBLE; - schedule(); + struct task_struct *p; + + printk("sched: adjdom %02d %lu %lu %lu %lu\n", + dom, mcu_adv, warp, warpl, warpu); + + p = find_domain_by_id(dom); + if ( p == NULL ) return -ESRCH; + + spin_lock_irq(&schedule_data[p->processor].lock); + + p->mcu_advance = mcu_adv; + + spin_unlock_irq(&schedule_data[p->processor].lock); + return 0; } - +/**************************************************************************** + * cause a run through the scheduler when appropriate + * Appropriate is: + * - current task is idle task + * - new processes evt is lower than current one + * - the current task already ran for it's context switch allowance + ****************************************************************************/ void reschedule(struct task_struct *p) { int cpu = p->processor; @@ -192,16 +204,20 @@ void reschedule(struct task_struct *p) unsigned long flags; if (p->has_cpu) - return; + return; spin_lock_irqsave(&schedule_data[cpu].lock, flags); curr = schedule_data[cpu].curr; - if (is_idle_task(curr)) { + + if ( is_idle_task(curr) || + (p->evt < curr->evt) || + (curr->lastschd + ctx_allow >= NOW()) ) { + /* reschedule */ set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events); spin_unlock_irqrestore(&schedule_data[cpu].lock, flags); #ifdef CONFIG_SMP if (cpu != smp_processor_id()) - smp_send_event_check_cpu(cpu); + smp_send_event_check_cpu(cpu); #endif } else { spin_unlock_irqrestore(&schedule_data[cpu].lock, 
flags); @@ -209,47 +225,154 @@ void reschedule(struct task_struct *p) } -/* - * Pick the next domain to run - */ - +/**************************************************************************** + * The main function + * - deschedule the current domain. + * - pick a new domain. + * i.e., the domain with lowest EVT. + * The runqueue should be ordered by EVT so that is easy. + ****************************************************************************/ asmlinkage void schedule(void) { - struct task_struct *prev, *next, *p; - struct list_head *tmp; - int this_cpu; - + struct task_struct *prev, *next, *next_prime, *p; + struct list_head *tmp; + int this_cpu; + s_time_t now; + s32 r_time; /* time for new dom to run */ + s32 ranfor; /* assume we never run longer than 2.1s! */ + s32 mcus; + u32 next_evt, next_prime_evt, min_avt; + + perfc_incrc(sched_run1); need_resched_back: + perfc_incrc(sched_run2); + + now = NOW(); + next = NULL; prev = current; this_cpu = prev->processor; + /* remove timer */ + rem_ac_timer(&schedule_data[this_cpu].s_timer); + + /* + * deschedule the current domain + */ + spin_lock_irq(&schedule_data[this_cpu].lock); ASSERT(!in_interrupt()); ASSERT(__task_on_runqueue(prev)); - __move_last_runqueue(prev); + if (is_idle_task(prev)) + goto deschedule_done; - switch ( prev->state ) - { + /* do some accounting */ + ranfor = (s32)(now - prev->lastschd); + ASSERT((ranfor>0)); + prev->cpu_time += ranfor; + + /* calculate mcu and update avt */ + mcus = ranfor/MCU; + if (ranfor % MCU) mcus ++; /* always round up */ + prev->avt += mcus * prev->mcu_advance; + prev->evt = prev->avt; /* RN: XXX BVT deal with warping here */ + + /* dequeue */ + __del_from_runqueue(prev); + switch (prev->state) { case TASK_INTERRUPTIBLE: - if ( signal_pending(prev) ) - { - prev->state = TASK_RUNNING; + if (signal_pending(prev)) { + prev->state = TASK_RUNNING; /* but has events pending */ break; } + case TASK_UNINTERRUPTIBLE: + case TASK_WAIT: + case TASK_DYING: default: - 
__del_from_runqueue(prev); + /* done if not running. Else, continue */ + goto deschedule_done; case TASK_RUNNING:; } + + /* requeue */ + __add_to_runqueue_tail(prev); + + + deschedule_done: clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events); - next = NULL; - list_for_each(tmp, &schedule_data[smp_processor_id()].runqueue) { + /* + * Pick a new domain + */ + + /* we should at least have the idle task */ + ASSERT(!list_empty(&schedule_data[this_cpu].runqueue)); + + /* + * scan through the run queue and pick the task with the lowest evt + * *and* the task the second lowest evt. + * this code is O(n) but we expect n to be small. + */ + next = schedule_data[this_cpu].idle; + next_prime = NULL; + + next_evt = 0xffffffff; + next_prime_evt = 0xffffffff; + min_avt = 0xffffffff; /* to calculate svt */ + + + list_for_each(tmp, &schedule_data[this_cpu].runqueue) { p = list_entry(tmp, struct task_struct, run_list); - next = p; - if ( !is_idle_task(next) ) break; + if (p->evt < next_evt) { + next_prime = next; + next_prime_evt = next_evt; + next = p; + next_evt = p->evt; + } else if (next_prime_evt == 0xffffffff) { + next_prime_evt = p->evt; + next_prime = p; + } else if (p->evt < next_prime_evt) { + next_prime_evt = p->evt; + next_prime = p; + } + /* determine system virtual time */ + if (p->avt < min_avt) + min_avt = p->avt; } + ASSERT(next != NULL); /* we should have at least the idle task */ + + /* update system virtual time */ + if (min_avt != 0xffffffff) schedule_data[this_cpu].svt = min_avt; + + if (is_idle_task(next)) { + r_time = ctx_allow; + goto sched_done; + } + + if (next_prime == NULL || is_idle_task(next_prime)) { + /* we have only one runable task besides the idle task */ + r_time = 10 * ctx_allow; /* RN: random constant */ + goto sched_done; + } + + /* + * if we are here we have two runable tasks. + * work out how long 'next' can run till its evt is greater than + * 'next_prime's evt. Taking context switch allowance into account. 
+ */ + ASSERT(next_prime->evt > next->evt); + r_time = ((next_prime->evt - next->evt)/next->mcu_advance) + ctx_allow; + + sched_done: + ASSERT(r_time != 0); + ASSERT(r_time > ctx_allow); + + if ( (r_time==0) || (r_time < ctx_allow)) { + printk("[%02d]: %lx\n", this_cpu, r_time); + dump_rqueue(&schedule_data[this_cpu].runqueue, "foo"); + } + prev->has_cpu = 0; next->has_cpu = 1; @@ -257,6 +380,17 @@ asmlinkage void schedule(void) schedule_data[this_cpu].prev = prev; schedule_data[this_cpu].curr = next; + next->lastschd = now; + + /* reprogramm the timer */ + timer_redo: + schedule_data[this_cpu].s_timer.expires = now + r_time; + if (add_ac_timer(&schedule_data[this_cpu].s_timer) == 1) { + printk("SCHED[%02d]: Shit this shouldn't happen\n", this_cpu); + now = NOW(); + goto timer_redo; + } + spin_unlock_irq(&schedule_data[this_cpu].lock); if ( unlikely(prev == next) ) @@ -266,6 +400,8 @@ asmlinkage void schedule(void) goto same_process; } + perfc_incrc(sched_ctx); + prepare_to_switch(); switch_to(prev, next); prev = schedule_data[this_cpu].prev; @@ -274,67 +410,56 @@ asmlinkage void schedule(void) if ( prev->state == TASK_DYING ) release_task(prev); same_process: + /* update the domains notion of time */ update_dom_time(current->shared_info); - if ( test_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events) ) + if ( test_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events) ) { goto need_resched_back; + } return; } /* - * The scheduling timer. + * The scheduler timer. 
*/ -static __cacheline_aligned int count[NR_CPUS]; static void sched_timer(unsigned long foo) { - int cpu = smp_processor_id(); + int cpu = smp_processor_id(); struct task_struct *curr = schedule_data[cpu].curr; - s_time_t now; - int res; - - /* reschedule after each 5 ticks */ - if (count[cpu] >= 5) { - set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events); - count[cpu] = 0; - } - count[cpu]++; + /* cause a reschedule */ + set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events); + perfc_incrc(sched_irq); +} - /* - * deliver virtual timer interrups to domains if we are CPU 0 XXX RN: We - * don't have a per CPU list of domains yet. Otherwise would use that. - * Plus, this should be removed anyway once Domains "know" about virtual - * time and timeouts. But, it's better here then where it was before. - */ - if (cpu == 0) { - struct task_struct *p; - unsigned long cpu_mask = 0; - - /* send virtual timer interrupt */ - read_lock(&tasklist_lock); - p = &idle0_task; - do { - if ( is_idle_task(p) ) continue; - cpu_mask |= mark_guest_event(p, _EVENT_TIMER); - } - while ( (p = p->next_task) != &idle0_task ); - read_unlock(&tasklist_lock); - guest_event_notify(cpu_mask); +/* + * The Domain virtual time timer + */ +static void virt_timer(unsigned long foo) +{ + unsigned long cpu_mask = 0; + struct task_struct *p; + s_time_t now; + int res; + + /* send virtual timer interrupt */ + read_lock(&tasklist_lock); + p = &idle0_task; + do { + if ( is_idle_task(p) ) continue; + cpu_mask |= mark_guest_event(p, _EVENT_TIMER); } + while ( (p = p->next_task) != &idle0_task ); + read_unlock(&tasklist_lock); + guest_event_notify(cpu_mask); - again: + again: now = NOW(); - s_timer[cpu].expires = now + MILLISECS(10); - res=add_ac_timer(&s_timer[cpu]); - - TRC(printk("SCHED[%02d] timer(): now=0x%08X%08X timo=0x%08X%08X\n", - cpu, (u32)(now>>32), (u32)now, - (u32)(s_timer[cpu].expires>>32), (u32)s_timer[cpu].expires)); + v_timer.expires = now + MILLISECS(10); + res=add_ac_timer(&v_timer); if (res==1) 
goto again; - } - /* * Initialise the data structures */ @@ -350,11 +475,15 @@ void __init scheduler_init(void) spin_lock_init(&schedule_data[i].lock); schedule_data[i].prev = &idle0_task; schedule_data[i].curr = &idle0_task; - + /* a timer for each CPU */ - init_ac_timer(&s_timer[i]); - s_timer[i].function = &sched_timer; + init_ac_timer(&schedule_data[i].s_timer); + schedule_data[i].s_timer.function = &sched_timer; + } + schedule_data[0].idle = &idle0_task; /* idle on CPU 0 is special */ + init_ac_timer(&v_timer); + v_timer.function = &virt_timer; } /* @@ -362,10 +491,121 @@ void __init scheduler_init(void) * This has to be done *after* the timers, e.g., APICs, have been initialised */ void schedulers_start(void) -{ +{ printk("Start schedulers\n"); __cli(); sched_timer(0); + virt_timer(0); smp_call_function((void *)sched_timer, NULL, 1, 1); __sti(); } + + +/**************************************************************************** + * Functions for legacy support. + * Schedule timeout is used at a number of places and is a bit meaningless + * in the context of Xen, as Domains are not able to call these and all + * there entry points into Xen should be asynchronous. If a domain wishes + * to block for a while it should use Xen's sched_op entry point. + ****************************************************************************/ + +static void process_timeout(unsigned long __data) +{ + struct task_struct * p = (struct task_struct *) __data; + wake_up(p); +} + +long schedule_timeout(long timeout) +{ + struct timer_list timer; + unsigned long expire; + + switch (timeout) + { + case MAX_SCHEDULE_TIMEOUT: + /* + * These two special cases are useful to be comfortable in the caller. + * Nothing more. We could take MAX_SCHEDULE_TIMEOUT from one of the + * negative value but I' d like to return a valid offset (>=0) to allow + * the caller to do everything it want with the retval. + */ + schedule(); + goto out; + default: + /* + * Another bit of PARANOID. 
Note that the retval will be 0 since no + * piece of kernel is supposed to do a check for a negative retval of + * schedule_timeout() (since it should never happens anyway). You just + * have the printk() that will tell you if something is gone wrong and + * where. + */ + if (timeout < 0) + { + printk(KERN_ERR "schedule_timeout: wrong timeout " + "value %lx from %p\n", timeout, + __builtin_return_address(0)); + current->state = TASK_RUNNING; + goto out; + } + } + + expire = timeout + jiffies; + + init_timer(&timer); + timer.expires = expire; + timer.data = (unsigned long) current; + timer.function = process_timeout; + + add_timer(&timer); + schedule(); + del_timer_sync(&timer); + + timeout = expire - jiffies; + + out: + return timeout < 0 ? 0 : timeout; +} + +/**************************************************************************** + * debug function + ****************************************************************************/ + +static void dump_rqueue(struct list_head *queue, char *name) +{ + struct list_head *list; + int loop = 0; + struct task_struct *p; + + printk ("QUEUE %s %lx n: %lx, p: %lx\n", name, (unsigned long)queue, + (unsigned long) queue->next, (unsigned long) queue->prev); + list_for_each (list, queue) { + p = list_entry(list, struct task_struct, run_list); + printk("%3d: %3d has=%c mcua=0x%04X ev=0x%08X av=0x%08X c=0x%X%08X\n", + loop++, p->domain, + p->has_cpu ? 
'T':'F', + p->mcu_advance, p->evt, p->avt, + (u32)(p->cpu_time>>32), (u32)p->cpu_time); + printk(" l: %lx n: %lx p: %lx\n", + (unsigned long)list, (unsigned long)list->next, + (unsigned long)list->prev); + } + return; +} + +void dump_runq(u_char key, void *dev_id, struct pt_regs *regs) +{ + u_long flags; + s_time_t now = NOW(); + int i; + + printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns NOW=0x%08X%08X\n", + (u32)MCU, (u32)ctx_allow, (u32)(now>>32), (u32)now); + for (i = 0; i < smp_num_cpus; i++) { + spin_lock_irqsave(&schedule_data[i].lock, flags); + printk("CPU[%02d] svt=0x%08X ", i, (s32)schedule_data[i].svt); + dump_rqueue(&schedule_data[i].runqueue, "rq"); + spin_unlock_irqrestore(&schedule_data[i].lock, flags); + } + return; +} + diff --git a/xen/include/xeno/ac_timer.h b/xen/include/xeno/ac_timer.h index 7cf568d2fc..280f377d17 100644 --- a/xen/include/xeno/ac_timer.h +++ b/xen/include/xeno/ac_timer.h @@ -43,10 +43,10 @@ */ struct ac_timer { - struct list_head timer_list; - s_time_t expires; /* system time time out value */ - unsigned long data; - void (*function)(unsigned long); + struct list_head timer_list; + s_time_t expires; /* system time time out value */ + unsigned long data; + void (*function)(unsigned long); }; /* interface for "clients" */ @@ -55,7 +55,7 @@ extern int rem_ac_timer(struct ac_timer *timer); extern int mod_ac_timer(struct ac_timer *timer, s_time_t new_time); static inline void init_ac_timer(struct ac_timer *timer) { - //timer->next = NULL; + timer->timer_list.next = NULL; } /* interface used by programmable timer, implemented hardware dependent */ diff --git a/xen/include/xeno/dom0_ops.h b/xen/include/xeno/dom0_ops.h index 5e498de1bc..c0159d12cc 100644 --- a/xen/include/xeno/dom0_ops.h +++ b/xen/include/xeno/dom0_ops.h @@ -4,8 +4,11 @@ * Process command requests from domain-0 guest OS. 
* * Copyright (c) 2002, K A Fraser, B Dragovic + * + * MUST BE KEPT IN SYNC WITH xenolinux<*>/arch/xeno/drivers/dom0/dom0_ops.h */ + #ifndef __DOM0_OPS_H__ #define __DOM0_OPS_H__ @@ -13,6 +16,8 @@ #define DOM0_KILLDOMAIN 1 #define DOM0_GETMEMLIST 2 #define DOM0_STARTDOM 4 +#define DOM0_BVTCTL 6 +#define DOM0_ADJUSTDOM 7 #define MAX_CMD_LEN 256 @@ -48,6 +53,20 @@ typedef struct domain_launch char cmd_line[MAX_CMD_LEN]; } dom_meminfo_t; +typedef struct dom0_bvtctl_st +{ + unsigned long ctx_allow; /* context switch allowance */ +} dom0_bvtctl_t; + +typedef struct dom0_adjustdom_st +{ + unsigned int domain; /* domain id */ + unsigned long mcu_adv; /* mcu advance: inverse of weight */ + unsigned long warp; /* time warp */ + unsigned long warpl; /* warp limit */ + unsigned long warpu; /* unwarp time requirement */ +} dom0_adjustdom_t; + typedef struct dom0_op_st { unsigned long cmd; @@ -56,6 +75,8 @@ typedef struct dom0_op_st dom0_newdomain_t newdomain; dom0_killdomain_t killdomain; dom0_getmemlist_t getmemlist; + dom0_bvtctl_t bvtctl; + dom0_adjustdom_t adjustdom; dom_meminfo_t meminfo; } u; diff --git a/xen/include/xeno/perfc_defn.h b/xen/include/xeno/perfc_defn.h index 41dd48f73b..f006079085 100644 --- a/xen/include/xeno/perfc_defn.h +++ b/xen/include/xeno/perfc_defn.h @@ -1,10 +1,13 @@ - PERFCOUNTER_CPU( irqs, "#interrupts" ) PERFCOUNTER_CPU( irq_time, "cycles spent in irq handler" ) -PERFCOUNTER( blockio_tx, "block io: messages received from tx queue" ) -PERFCOUNTER( blockio_rx, "block io: messages sent on rx queue" ) +PERFCOUNTER( blockio_tx, "block io: messages received from tx queue" ) +PERFCOUNTER( blockio_rx, "block io: messages sent on rx queue" ) -PERFCOUNTER_CPU( apic_timer, "apic timer interrupts" ) -PERFCOUNTER_CPU( ac_timer_max, "ac_timer max error" ) +PERFCOUNTER_CPU( apic_timer, "apic timer interrupts" ) +PERFCOUNTER_CPU( ac_timer_max, "ac_timer max error (ns)" ) +PERFCOUNTER_CPU( sched_irq, "sched: timer" ) +PERFCOUNTER_CPU( sched_run1, "sched: calls 
to schedule" ) +PERFCOUNTER_CPU( sched_run2, "sched: runs through scheduler" ) +PERFCOUNTER_CPU( sched_ctx, "sched: context switches" ) diff --git a/xen/include/xeno/sched.h b/xen/include/xeno/sched.h index 6d1842a2ea..dbbf6a927e 100644 --- a/xen/include/xeno/sched.h +++ b/xen/include/xeno/sched.h @@ -12,6 +12,10 @@ #include <hypervisor-ifs/hypervisor-if.h> #include <xeno/dom0_ops.h> +#include <xeno/list.h> +#include <xeno/time.h> +#include <xeno/ac_timer.h> + extern unsigned long volatile jiffies; extern rwlock_t tasklist_lock; @@ -59,18 +63,48 @@ extern struct mm_struct init_mm; struct task_struct { - int processor; - int state; - int hyp_events; - unsigned int domain; + /* + * DO NOT CHANGE THE ORDER OF THE FOLLOWING. + * There offsets are hardcoded in entry.S + */ + + int processor; /* 00: current processor */ + int state; /* 04: current run state */ + int hyp_events; /* 08: pending events */ + unsigned int domain; /* 12: domain id */ /* An unsafe pointer into a shared data area. */ - shared_info_t *shared_info; + shared_info_t *shared_info; /* 16: shared data area */ + + /* + * From here on things can be added and shuffled without special attention + */ struct list_head pg_head; unsigned int tot_pages; /* number of pages currently possesed */ unsigned int max_pages; /* max number of pages that can be possesed */ + /* scheduling */ + struct list_head run_list; /* the run list */ + int has_cpu; + int policy; + int counter; + + struct ac_timer blt; /* blocked timeout */ + + s_time_t lastschd; /* time this domain was last scheduled */ + s_time_t cpu_time; /* total CPU time received till now */ + + unsigned long mcu_advance; /* inverse of weight */ + s32 avt; /* actual virtual time */ + s32 evt; /* effective virtual time */ + long warp; /* virtual time warp */ + long warpl; /* warp limit */ + long warpu; /* unwarp time requirement */ + long warped; /* time it ran warped last time */ + long uwarped; /* time it ran unwarped last time */ + + /* Network I/O */ 
net_ring_t *net_ring_base; net_vif_t *net_vif_list[MAX_GUEST_VIFS]; @@ -85,10 +119,7 @@ struct task_struct { segment_t *segment_list[XEN_MAX_SEGMENTS]; /* vhd */ int segment_count; - int has_cpu, policy, counter; - - struct list_head run_list; - + /* VM */ struct mm_struct mm; /* We need this lock to check page types and frob reference counts. */ spinlock_t page_lock; @@ -127,7 +158,7 @@ struct task_struct { #define TASK_RUNNING 0 #define TASK_INTERRUPTIBLE 1 #define TASK_UNINTERRUPTIBLE 2 -#define TASK_WAIT 4 +#define TASK_WAIT 4 #define TASK_DYING 16 /* #define TASK_STOPPED 8 not really used */ @@ -141,6 +172,8 @@ struct task_struct { domain: IDLE_DOMAIN_ID, \ state: TASK_RUNNING, \ has_cpu: 0, \ + evt: 0xffffffff, \ + avt: 0xffffffff, \ mm: IDLE0_MM, \ addr_limit: KERNEL_DS, \ active_mm: &idle0_task.mm, \ @@ -153,7 +186,7 @@ struct task_struct { #define is_idle_task(_p) ((_p)->domain == IDLE_DOMAIN_ID) #ifndef IDLE0_TASK_SIZE -#define IDLE0_TASK_SIZE 2048*sizeof(long) +#define IDLE0_TASK_SIZE 2048*sizeof(long) #endif union task_union { @@ -202,6 +235,9 @@ void scheduler_init(void); void schedulers_start(void); void sched_add_domain(struct task_struct *p); void sched_rem_domain(struct task_struct *p); +long sched_bvtctl(unsigned long ctx_allow); +long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp, + unsigned long warpl, unsigned long warpu); int wake_up(struct task_struct *p); long schedule_timeout(long timeout); long do_yield(void); diff --git a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/Makefile b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/Makefile index 4738fc0ba4..eeb3413842 100644 --- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/Makefile +++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/Makefile @@ -1,3 +1,3 @@ O_TARGET := dom0.o -obj-y := dom0_memory.o dom0_core.o vfr.o +obj-y := dom0_memory.o dom0_core.o vfr.o sched_ops.o include $(TOPDIR)/Rules.make diff --git 
a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h index 6c60a93ff6..22ebd7aba0 100644 --- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h +++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h @@ -4,15 +4,20 @@ * Process command requests from domain-0 guest OS. * * Copyright (c) 2002, K A Fraser, B Dragovic + * + * MUST BE KEPT IN SYNC WITH xen/include/xeno/dom0_ops.h + * MUST BE KEPT IN SYNC WITH tools/domain_builder/dom0_ops.h */ #define DOM0_NEWDOMAIN 0 #define DOM0_KILLDOMAIN 1 #define DOM0_GETMEMLIST 2 #define DOM0_STARTDOM 4 -#define MAP_DOM_MEM 6 /* Not passed down to Xen */ -#define DO_PGUPDATES 7 /* Not passed down to Xen */ -#define MAX_CMD 8 +#define DOM0_BVTCTL 6 +#define DOM0_ADJUSTDOM 7 +#define MAP_DOM_MEM 8 /* Not passed down to Xen */ +#define DO_PGUPDATES 9 /* Not passed down to Xen */ +#define MAX_CMD 10 #define MAX_CMD_LEN 256 @@ -20,8 +25,8 @@ typedef struct dom0_newdomain_st { unsigned int domain; unsigned int memory_kb; - unsigned int num_vifs; // temporary - unsigned long pg_head; // return parameter + unsigned int num_vifs; /* temporary */ + unsigned long pg_head; /* return parameter */ } dom0_newdomain_t; typedef struct dom0_killdomain_st @@ -37,6 +42,20 @@ typedef struct dom0_getmemlist_st void *buffer; } dom0_getmemlist_t; +typedef struct dom0_bvtctl_st +{ + unsigned long ctx_allow; /* context switch allowance */ +} dom0_bvtctl_t; + +typedef struct dom0_adjustdom_st +{ + unsigned int domain; /* domain id */ + unsigned long mcu_adv; /* mcu advance: inverse of weight */ + unsigned long warp; /* time warp */ + unsigned long warpl; /* warp limit */ + unsigned long warpu; /* unwarp time requirement */ +} dom0_adjustdom_t; + /* This is entirely processed by XenoLinux */ typedef struct dom_mem { @@ -64,6 +83,8 @@ typedef struct domain_launch char cmd_line[MAX_CMD_LEN]; } dom_meminfo_t; + + typedef struct dom0_op_st { unsigned long cmd; 
@@ -72,6 +93,8 @@ typedef struct dom0_op_st dom0_newdomain_t newdomain; dom0_killdomain_t killdomain; dom0_getmemlist_t getmemlist; + dom0_bvtctl_t bvtctl; + dom0_adjustdom_t adjustdom; dom_mem_t dommem; dom_pgupdate_t pgupdate; dom_meminfo_t meminfo; diff --git a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/sched_ops.c b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/sched_ops.c new file mode 100644 index 0000000000..9c5fce7857 --- /dev/null +++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/sched_ops.c @@ -0,0 +1,137 @@ +/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*- + **************************************************************************** + * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge + **************************************************************************** + * + * File: sched_ops.c + * Author: Rolf Neugebauer (neugebar@dcs.gla.ac.uk) + * Changes: + * + * Date: Mar 2003 + * + * Environment: XenoLinux + * Description: Dom0 Control interface to scheduler in Xen + * + * code based on Andy's vfr parsing code + * + * Commands understood by the interface: + * + * C <context swith allowance> + * S <did> <mcu advance> <warp> <warp limit> <unwarp limit> + * + **************************************************************************** + * $Id: c-insert.c,v 1.7 2002/11/08 16:04:34 rn Exp $ + **************************************************************************** + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/ctype.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/proc_fs.h> + +#include "dom0_ops.h" + +#define SCHED_ENTRY "sched" +extern struct proc_dir_entry *xeno_base; +static struct proc_dir_entry *sched_pde; + +static unsigned char readbuf[1024]; + +static int sched_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + strcpy(page, readbuf); + *readbuf = '\0'; + *eof = 1; + *start = 
page; + return strlen(page); +} + + +static int sched_write_proc(struct file *file, const char *buffer, + u_long count, void *data) +{ + dom0_op_t op; + + int ret, len; + int ts, te, tl; /* token start, end, and length */ + + /* Only admin can adjust scheduling parameters */ + if ( !capable(CAP_SYS_ADMIN) ) + return -EPERM; + + /* parse the commands */ + len = count; + ts = te = 0; + + while ( count && isspace(buffer[ts]) ) { ts++; count--; } /*skip spaces*/ + te = ts; + while ( count && !isspace(buffer[te]) ) { te++; count--; } /*command end*/ + if ( te <= ts ) goto bad; + tl = te - ts; + + if ( strncmp(&buffer[ts], "C", tl) == 0 ) { + op.cmd = DOM0_BVTCTL; + } else if ( strncmp(&buffer[ts], "S", tl) == 0 ) { + op.cmd = DOM0_ADJUSTDOM; + } else + goto bad; + + /* skip whitspaces and get first parameter */ + ts = te; while ( count && isspace(buffer[ts]) ) { ts++; count--; } + te = ts; while ( count && !isspace(buffer[te]) ) { te++; count--; } + if ( te <= ts ) goto bad; + tl = te - ts; + if ( !isdigit(buffer[ts]) ) goto bad; + + if (op.cmd == DOM0_BVTCTL) { + /* get context switch allowance */ + sscanf(&buffer[ts], "%lu", &op.u.bvtctl.ctx_allow); + } else if (op.cmd == DOM0_ADJUSTDOM) { + sscanf(&buffer[ts], "%u %lu %lu %lu %lu", + &op.u.adjustdom.domain, + &op.u.adjustdom.mcu_adv, + &op.u.adjustdom.warp, + &op.u.adjustdom.warpl, + &op.u.adjustdom.warpu); + } + ret = HYPERVISOR_dom0_op(&op); + return sizeof(op); + + bad: + return -EINVAL; + +} + + +/* + * main scheduler interface driver driver initialization function. 
+ */ +static int __init init_module(void) +{ + printk(KERN_ALERT "Starting Domain Scheduler Control Interface\n"); + + sched_pde = create_proc_entry(SCHED_ENTRY, 0600, xeno_base); + if ( sched_pde == NULL ) + { + printk(KERN_ALERT "Unable to create dom scheduler proc entry!"); + return -1; + } + + sched_pde->read_proc = sched_read_proc; + sched_pde->write_proc = sched_write_proc; + + return 0; +} + +static void __exit cleanup_module(void) +{ +} + +module_init(init_module); +module_exit(cleanup_module); + |