-rwxr-xr-x  .bk-to-hg                                                             2
-rwxr-xr-x  .hg-to-bk                                                             3
-rw-r--r--  .rootkeys                                                             3
-rw-r--r--  tools/domain_builder/dom0_ops.h                                      81
-rw-r--r--  xen/arch/i386/apic.c                                                  8
-rw-r--r--  xen/common/ac_timer.c                                               166
-rw-r--r--  xen/common/dom0_ops.c                                                28
-rw-r--r--  xen/common/domain.c                                                  18
-rw-r--r--  xen/common/keyhandler.c                                              15
-rw-r--r--  xen/common/schedule.c                                               556
-rw-r--r--  xen/include/xeno/ac_timer.h                                          10
-rw-r--r--  xen/include/xeno/dom0_ops.h                                          21
-rw-r--r--  xen/include/xeno/perfc_defn.h                                        13
-rw-r--r--  xen/include/xeno/sched.h                                             58
-rw-r--r--  xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/Makefile          2
-rw-r--r--  xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h       33
-rw-r--r--  xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/sched_ops.c     137
17 files changed, 814 insertions(+), 340 deletions(-)
diff --git a/.bk-to-hg b/.bk-to-hg
index a1e10bd62c..5bd381ab73 100755
--- a/.bk-to-hg
+++ b/.bk-to-hg
@@ -2,5 +2,7 @@
set -e
test -L old/xenolinux-2.4.16-sparse/include/asm-xeno/hypervisor-ifs
rm old/xenolinux-2.4.16-sparse/include/asm-xeno/hypervisor-ifs
+test -L tools/domain_builder/dom0_ops.h
+rm tools/domain_builder/dom0_ops.h
(find -depth -type d -print | xargs -r rmdir 2>/dev/null) || true
exit 0
diff --git a/.hg-to-bk b/.hg-to-bk
index 817e56037e..dc30e75316 100755
--- a/.hg-to-bk
+++ b/.hg-to-bk
@@ -5,5 +5,8 @@ mkdir -p old/xenolinux-2.4.16-sparse
mkdir -p old/xenolinux-2.4.16-sparse/include
mkdir -p old/xenolinux-2.4.16-sparse/include/asm-xeno
ln -s ../../../xen-2.4.16/include/hypervisor-ifs old/xenolinux-2.4.16-sparse/include/asm-xeno/hypervisor-ifs
+mkdir -p tools
+mkdir -p tools/domain_builder
+ln -s ../../xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h tools/domain_builder/dom0_ops.h
(find -depth -type d -print | xargs -r rmdir 2>/dev/null) || true
exit 0
diff --git a/.rootkeys b/.rootkeys
index dc1f07ed27..cb721ac57b 100644
--- a/.rootkeys
+++ b/.rootkeys
@@ -182,7 +182,7 @@
3e4d00468-FN2VDeEHo96zxrMHK_mA tools/domain_builder/Makefile
3e4d0046SPau_y0sw2WLJz8QkqNoRA tools/domain_builder/README
3e4d0046bbdH0GsI9J_1Eb4ZQHfIiQ tools/domain_builder/dom0_defs.h
-3e4d0046RgYCfGOw6qGz_7kYLMV2Vw tools/domain_builder/dom0_ops.h
+3e71f9b871pvOAxDrhxpC4N4mHkbww tools/domain_builder/dom0_ops.h
3e4d0046ouLij_CMN_j7-dUHZIBI_A tools/domain_builder/dom_builder.c
3e4d0046EKs06fY0CWDEgZQcn7DYUg tools/domain_builder/dom_kill.c
3e4d0046aPbGiRTtdWxqY5b3ytWurA tools/domain_builder/hypervisor_defs.h
@@ -487,6 +487,7 @@
3e5a4e65BXtftInNHUC2PjDfPhdZZA xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_core.c
3e5a4e65uXAx05p6B1-HU2tijuw8qA xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_memory.c
3e5a4e65EOOLlPwXnhSuX-iVdWLmnA xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h
+3e6dba59C8o0kBks7UZ4IW_FY853Aw xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/sched_ops.c
3e5a4e65gfn_ltB8ujHMVFApnTTNRQ xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/vfr.c
3e5a4e65gZBRBB6RsSVg1c9iahigAw xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/network/Makefile
3e5a4e65ZxKrbFetVB84JhrTyZ1YuQ xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/network/network.c
diff --git a/tools/domain_builder/dom0_ops.h b/tools/domain_builder/dom0_ops.h
deleted file mode 100644
index 6c60a93ff6..0000000000
--- a/tools/domain_builder/dom0_ops.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/******************************************************************************
- * dom0_ops.h
- *
- * Process command requests from domain-0 guest OS.
- *
- * Copyright (c) 2002, K A Fraser, B Dragovic
- */
-
-#define DOM0_NEWDOMAIN 0
-#define DOM0_KILLDOMAIN 1
-#define DOM0_GETMEMLIST 2
-#define DOM0_STARTDOM 4
-#define MAP_DOM_MEM 6 /* Not passed down to Xen */
-#define DO_PGUPDATES 7 /* Not passed down to Xen */
-#define MAX_CMD 8
-
-#define MAX_CMD_LEN 256
-
-typedef struct dom0_newdomain_st
-{
- unsigned int domain;
- unsigned int memory_kb;
- unsigned int num_vifs; // temporary
- unsigned long pg_head; // return parameter
-} dom0_newdomain_t;
-
-typedef struct dom0_killdomain_st
-{
- unsigned int domain;
- int force;
-} dom0_killdomain_t;
-
-typedef struct dom0_getmemlist_st
-{
- unsigned long start_pfn;
- unsigned long num_pfns;
- void *buffer;
-} dom0_getmemlist_t;
-
-/* This is entirely processed by XenoLinux */
-typedef struct dom_mem
-{
- unsigned int domain;
- unsigned long vaddr;
- unsigned long start_pfn;
- int tot_pages;
-} dom_mem_t;
-
-/* This is entirely processed by XenoLinux */
-typedef struct dom_pgupdate
-{
- unsigned long pgt_update_arr;
- unsigned long num_pgt_updates;
-} dom_pgupdate_t;
-
-typedef struct domain_launch
-{
- unsigned int domain;
- unsigned long l2_pgt_addr;
- unsigned long virt_load_addr;
- unsigned long virt_shinfo_addr;
- unsigned long virt_startinfo_addr;
- unsigned int num_vifs;
- char cmd_line[MAX_CMD_LEN];
-} dom_meminfo_t;
-
-typedef struct dom0_op_st
-{
- unsigned long cmd;
- union
- {
- dom0_newdomain_t newdomain;
- dom0_killdomain_t killdomain;
- dom0_getmemlist_t getmemlist;
- dom_mem_t dommem;
- dom_pgupdate_t pgupdate;
- dom_meminfo_t meminfo;
- }
- u;
-} dom0_op_t;
-
diff --git a/xen/arch/i386/apic.c b/xen/arch/i386/apic.c
index 865a279d8c..0acf7067c3 100644
--- a/xen/arch/i386/apic.c
+++ b/xen/arch/i386/apic.c
@@ -659,6 +659,13 @@ int reprogram_ac_timer(s_time_t timeout)
s_time_t expire;
u64 apic_tmict;
+ if (timeout == 0) {
+ /* XXX RN: not sure if this disables it or causes the interrupt to
+ * go off immediately */
+ apic_tmict = 0;
+ goto reprogram;
+ }
+
now = NOW();
expire = timeout - now; /* value from now */
@@ -680,6 +687,7 @@ int reprogram_ac_timer(s_time_t timeout)
return 0;
}
+ reprogram:
/* programm timer */
apic_write(APIC_TMICT, (unsigned long)apic_tmict);
diff --git a/xen/common/ac_timer.c b/xen/common/ac_timer.c
index dc70de4e0c..73ac893e08 100644
--- a/xen/common/ac_timer.c
+++ b/xen/common/ac_timer.c
@@ -23,7 +23,6 @@
#include <xeno/errno.h>
#include <xeno/sched.h>
#include <xeno/lib.h>
-#include <xeno/config.h>
#include <xeno/smp.h>
#include <xeno/perfc.h>
@@ -41,10 +40,10 @@
#define TRC(_x)
#endif
-/*
+/*****************************************************************************
* We pull handlers off the timer list this far in future,
* rather than reprogramming the time hardware.
- */
+ *****************************************************************************/
#define TIMER_SLOP (50*1000) /* ns */
/* A timer list per CPU */
@@ -58,30 +57,29 @@ static ac_timers_t ac_timers[NR_CPUS];
/* local prototypes */
static int detach_ac_timer(struct ac_timer *timer);
-/*static void ac_timer_debug(unsigned long);*/
-/*
+
+/*****************************************************************************
* add a timer.
* return value:
* 0: success
* 1: failure, timer in the past or timeout value to small
* -1: failure, timer uninitialised
* fail
- */
+ *****************************************************************************/
int add_ac_timer(struct ac_timer *timer)
{
- int cpu = smp_processor_id();
- unsigned long flags;
- s_time_t now;
+ int cpu = smp_processor_id();
+ unsigned long flags;
+ s_time_t now;
/* make sure timeout value is in the future */
+
now = NOW();
- TRC(printk("ACT [%02d] add(): now=%lld timo=%lld\n",
- cpu, now, timer->expires));
- if (timer->expires <= now) {
- printk("ACT[%02d] add_ac_timer: now=0x%08X%08X > expire=0x%08X%08X\n",
- cpu, (u32)(now>>32), (u32)now,
- (u32)(timer->expires>>32), (u32)timer->expires);
+ if (timer->expires <= now) {
+ TRC(printk("ACT[%02d] add_ac_timer:now=0x%08X%08X>expire=0x%08X%08X\n",
+ cpu, (u32)(now>>32), (u32)now,
+ (u32)(timer->expires>>32), (u32)timer->expires));
return 1;
}
spin_lock_irqsave(&ac_timers[cpu].lock, flags);
@@ -90,71 +88,57 @@ int add_ac_timer(struct ac_timer *timer)
* reprogramm the timer
*/
if (list_empty(&ac_timers[cpu].timers)) {
- /* Reprogramm and add to head of list */
if (!reprogram_ac_timer(timer->expires)) {
- /* failed */
- printk("ACT [%02d] add(): add at head failed\n", cpu);
+ printk("ACT[%02d] add at head failed\n", cpu);
spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
- return 1;
+ return 1; /* failed */
}
list_add(&timer->timer_list, &ac_timers[cpu].timers);
- TRC(printk("ACT [%02d] add(0x%08X%08X): added at head\n", cpu,
- (u32)(timer->expires>>32), (u32)timer->expires));
} else {
struct list_head *pos;
- struct ac_timer *t;
- for (pos = ac_timers[cpu].timers.next;
- pos != &ac_timers[cpu].timers;
- pos = pos->next) {
+ struct ac_timer *t;
+
+ list_for_each(pos, &ac_timers[cpu].timers) {
t = list_entry(pos, struct ac_timer, timer_list);
if (t->expires > timer->expires)
break;
}
+ list_add (&(timer->timer_list), pos->prev);
- if (pos->prev == &ac_timers[cpu].timers) {
- /* added to head, reprogramm timer */
+ if (timer->timer_list.prev == &ac_timers[cpu].timers) {
+ /* added at head */
if (!reprogram_ac_timer(timer->expires)) {
- /* failed */
- TRC(printk("ACT [%02d] add(): add at head failed\n", cpu));
+ printk("ACT[%02d] add at head failed\n", cpu);
+ detach_ac_timer(timer);
spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
- return 1;
+ return 1; /* failed */
}
- list_add (&(timer->timer_list), pos->prev);
- TRC(printk("ACT [%02d] add(0x%08X%08X): added at head\n", cpu,
- (u32)(timer->expires>>32), (u32)timer->expires));
- } else {
- list_add (&(timer->timer_list), pos->prev);
- TRC(printk("ACT [%02d] add(0x%08X%08X): add < exp=0x%08X%08X\n",
- cpu,
- (u32)(timer->expires>>32), (u32)timer->expires,
- (u32)(t->expires>>32), (u32)t->expires));
}
}
spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
return 0;
}
-/*
- * remove a timer
+/*****************************************************************************
+ * detach a timer (no locking)
* return values:
* 0: success
* -1: bogus timer
- */
+ *****************************************************************************/
static int detach_ac_timer(struct ac_timer *timer)
{
- TRC(int cpu = smp_processor_id());
TRC(printk("ACT [%02d] detach(): \n", cpu));
list_del(&timer->timer_list);
timer->timer_list.next = NULL;
return 0;
}
-/*
+/*****************************************************************************
* remove a timer
* return values:
* 0: success
* -1: bogus timer
- */
+ *****************************************************************************/
int rem_ac_timer(struct ac_timer *timer)
{
int cpu = smp_processor_id();
@@ -163,19 +147,30 @@ int rem_ac_timer(struct ac_timer *timer)
TRC(printk("ACT [%02d] remove(): timo=%lld \n", cpu, timer->expires));
spin_lock_irqsave(&ac_timers[cpu].lock, flags);
- if (timer->timer_list.next)
- res = detach_ac_timer(timer);
+ if (timer->timer_list.next) {
+ res = detach_ac_timer(timer);
+
+ if (timer->timer_list.prev == &ac_timers[cpu].timers) {
+ /* just removed the head */
+ if (list_empty(&ac_timers[cpu].timers)) {
+ reprogram_ac_timer((s_time_t) 0);
+ }
+ /* XXX should actually reprogram APIC to new head */
+ }
+ } else
+ res = -1;
+
spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
return res;
}
-/*
+/*****************************************************************************
* modify a timer, i.e., set a new timeout value
* return value:
* 0: sucess
* -1: error
- */
+ *****************************************************************************/
int mod_ac_timer(struct ac_timer *timer, s_time_t new_time)
{
if (rem_ac_timer(timer) != 0)
@@ -186,10 +181,10 @@ int mod_ac_timer(struct ac_timer *timer, s_time_t new_time)
return 0;
}
-/*
+/*****************************************************************************
* do_ac_timer
* deal with timeouts and run the handlers
- */
+ *****************************************************************************/
void do_ac_timer(void)
{
int cpu = smp_processor_id();
@@ -206,15 +201,21 @@ void do_ac_timer(void)
/* Sanity: is the timer list empty? */
if ( list_empty(&ac_timers[cpu].timers) ) {
- printk("ACT[%02d] do_ac_timer(): timer irq without timer\n", cpu);
+ /*
+ * XXX RN: This shouldn't happen, but does! Two possibilities:
+ * - Race condition between removing and resetting APIC
+ * - setting an APIC timeout value of 0 causes an immediate
+ * timer interrupt to fire.
+ * None of these should be critical!
+ */
+ spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
return;
}
/* Handle all timeouts in the near future. */
while ( !list_empty(&ac_timers[cpu].timers) )
{
- t = list_entry(ac_timers[cpu].timers.next,
- struct ac_timer, timer_list);
+ t = list_entry(ac_timers[cpu].timers.next,struct ac_timer, timer_list);
if ( t->expires > (NOW() + TIMER_SLOP) ) break;
/* do some stats */
@@ -232,8 +233,7 @@ void do_ac_timer(void)
/* If list not empty then reprogram timer to new head of list */
if ( !list_empty(&ac_timers[cpu].timers) )
{
- t = list_entry(ac_timers[cpu].timers.next,
- struct ac_timer, timer_list);
+ t = list_entry(ac_timers[cpu].timers.next,struct ac_timer, timer_list);
if ( t->expires > 0 )
{
TRC(printk("ACT [%02d] do(): reprog timo=%lld\n",cpu,t->expires));
@@ -243,16 +243,18 @@ void do_ac_timer(void)
goto do_timer_again;
}
}
+ } else {
+ reprogram_ac_timer((s_time_t) 0);
}
spin_unlock_irqrestore(&ac_timers[cpu].lock, flags);
TRC(printk("ACT [%02d] do(): end\n", cpu));
}
-/*
+/*****************************************************************************
* debug dump_queue
* arguments: queue head, name of queue
- */
+ *****************************************************************************/
static void dump_tqueue(struct list_head *queue, char *name)
{
struct list_head *list;
@@ -272,7 +274,6 @@ static void dump_tqueue(struct list_head *queue, char *name)
return;
}
-
void dump_timerq(u_char key, void *dev_id, struct pt_regs *regs)
{
u_long flags;
@@ -304,3 +305,50 @@ void __init ac_timer_init(void)
spin_lock_init(&ac_timers[i].lock);
}
}
+
+/*****************************************************************************
+ * GRAVEYARD
+ *****************************************************************************/
+
+#if 0
+
+#ifdef AC_TIMER_STATS
+#define BUCKETS 1000
+#define MAX_STATS
+typedef struct act_stats_st
+{
+ u32 count;
+ u32 times[2*(BUCKETS)];
+} __cacheline_aligned act_stats_t;
+static act_stats_t act_stats[NR_CPUS];
+
+#endif
+
+#ifdef AC_TIMER_STATS
+ {
+ XXX this is at the wrong place
+ s32 diff;
+ u32 i;
+ diff = ((s32)(NOW() - t->expires)) / 1000; /* delta in us */
+ if (diff < -BUCKETS)
+ diff = -BUCKETS;
+ else if (diff > BUCKETS)
+ diff = BUCKETS;
+ act_stats[cpu].times[diff+BUCKETS]++;
+ act_stats[cpu].count++;
+
+ if (act_stats[cpu].count >= 5000) {
+ printk("ACT Stats\n");
+ for (i=0; i < 2*BUCKETS; i++) {
+ if (act_stats[cpu].times[i] != 0)
+ printk("ACT [%02d]: %3dus: %5d\n",
+ cpu,i-BUCKETS, act_stats[cpu].times[i]);
+ act_stats[cpu].times[i]=0;
+ }
+ act_stats[cpu].count = 0;
+ printk("\n");
+ }
+ }
+#endif
+
+#endif /* 0 */
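
The ac_timer interface above (init_ac_timer, add_ac_timer, rem_ac_timer, mod_ac_timer) is what the new scheduler relies on for its per-CPU s_timer. A minimal, hypothetical caller is sketched below; it is not part of the patch, and the header names and the NOW()/MILLISECS() helpers are simply taken from their use elsewhere in this changeset.

/* Sketch only: arm a one-shot timer 10ms from now, much as schedule()
 * arms schedule_data[cpu].s_timer further down in this changeset. */
#include <xeno/ac_timer.h>
#include <xeno/time.h>

static void example_handler(unsigned long data)
{
    /* called from do_ac_timer() once the expiry time has passed */
}

static struct ac_timer example_timer;

static void arm_example(void)
{
    init_ac_timer(&example_timer);            /* timer_list.next = NULL */
    example_timer.function = &example_handler;
    example_timer.data     = 0;
    example_timer.expires  = NOW() + MILLISECS(10);

    /* add_ac_timer() returns 1 if the expiry is already in the past */
    while ( add_ac_timer(&example_timer) == 1 )
        example_timer.expires = NOW() + MILLISECS(10);
}
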
diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c
index e451a8f3e7..e6d54e9695 100644
--- a/xen/common/dom0_ops.c
+++ b/xen/common/dom0_ops.c
@@ -126,6 +126,34 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
}
break;
+ case DOM0_BVTCTL:
+ {
+ unsigned long ctx_allow = op.u.bvtctl.ctx_allow;
+ ret = sched_bvtctl(ctx_allow);
+
+ }
+ break;
+
+ case DOM0_ADJUSTDOM:
+ {
+ unsigned int dom = op.u.adjustdom.domain;
+ unsigned long mcu_adv = op.u.adjustdom.mcu_adv;
+ unsigned long warp = op.u.adjustdom.warp;
+ unsigned long warpl = op.u.adjustdom.warpl;
+ unsigned long warpu = op.u.adjustdom.warpu;
+
+
+ if ( dom == IDLE_DOMAIN_ID )
+ {
+ ret = -EPERM;
+ }
+ else
+ {
+ ret = sched_adjdom(dom, mcu_adv, warp, warpl, warpu);
+ }
+ }
+ break;
+
case DOM0_GETMEMLIST:
{
int i;
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 32bf8b7172..5fc4304c01 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -58,10 +58,11 @@ struct task_struct *do_newdomain(unsigned int dom_id, unsigned int cpu)
SET_GDT_ADDRESS(p, DEFAULT_GDT_ADDRESS);
p->addr_limit = USER_DS;
- p->state = TASK_UNINTERRUPTIBLE;
p->active_mm = &p->mm;
p->num_net_vifs = 0;
+ sched_add_domain(p);
+
p->net_ring_base = (net_ring_t *)(p->shared_info + 1);
INIT_LIST_HEAD(&p->pg_head);
p->max_pages = p->tot_pages = 0;
@@ -115,7 +116,8 @@ void kill_domain(void)
}
printk("Killing domain %d\n", current->domain);
- current->state = TASK_DYING;
+
+ sched_rem_domain(current);
schedule();
BUG(); /* never get here */
}
@@ -293,7 +295,7 @@ int final_setup_guestos(struct task_struct * p, dom_meminfo_t * meminfo)
/* set up the shared info structure */
update_dom_time(p->shared_info);
- p->shared_info->cpu_freq = cpu_freq;
+ p->shared_info->cpu_freq = cpu_freq;
p->shared_info->domain_time = 0;
/* we pass start info struct to guest os as function parameter on stack */
@@ -516,8 +518,8 @@ int setup_guestos(struct task_struct *p, dom0_newdomain_t *params)
unmap_domain_mem(l1start);
/* Set up shared info area. */
- update_dom_time(p->shared_info);
- p->shared_info->cpu_freq = cpu_freq;
+ update_dom_time(p->shared_info);
+ p->shared_info->cpu_freq = cpu_freq;
p->shared_info->domain_time = 0;
@@ -555,7 +557,7 @@ int setup_guestos(struct task_struct *p, dom0_newdomain_t *params)
#define SHIP2GUEST(_x) (virt_shinfo_address | (((unsigned long)(_x)) & 0xFFF))
virt_startinfo_address->net_rings =
- (net_ring_t *)SHIP2GUEST(p->net_ring_base);
+ (net_ring_t *)SHIP2GUEST(p->net_ring_base);
virt_startinfo_address->num_net_rings = p->num_net_vifs;
/* Add block io interface */
@@ -597,7 +599,5 @@ int setup_guestos(struct task_struct *p, dom0_newdomain_t *params)
void __init domain_init(void)
{
- printk("Initialising domains\n");
+ printk("Initialising domains\n");
}
-
-
diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c
index 19943fff3e..dde9e0ff10 100644
--- a/xen/common/keyhandler.c
+++ b/xen/common/keyhandler.c
@@ -1,9 +1,6 @@
#include <xeno/keyhandler.h>
#include <xeno/reboot.h>
-extern void perfc_printall (u_char key, void *dev_id, struct pt_regs *regs);
-extern void perfc_reset (u_char key, void *dev_id, struct pt_regs *regs);
-
#define KEY_MAX 256
#define STR_MAX 64
@@ -117,6 +114,12 @@ void do_task_queues(u_char key, void *dev_id, struct pt_regs *regs)
}
+extern void perfc_printall (u_char key, void *dev_id, struct pt_regs *regs);
+extern void perfc_reset (u_char key, void *dev_id, struct pt_regs *regs);
+extern void dump_timerq(u_char key, void *dev_id, struct pt_regs *regs);
+extern void dump_runq(u_char key, void *dev_id, struct pt_regs *regs);
+
+
void initialize_keytable()
{
int i;
@@ -126,13 +129,15 @@ void initialize_keytable()
key_table[i].handler = (key_handler *)NULL;
/* setup own handlers */
+ add_key_handler('a', dump_timerq, "dump ac_timer queues");
add_key_handler('d', dump_registers, "dump registers");
add_key_handler('h', show_handlers, "show this message");
add_key_handler('p', perfc_printall, "print performance counters");
add_key_handler('P', perfc_reset, "reset performance counters");
add_key_handler('q', do_task_queues, "dump task queues + guest state");
- add_key_handler('B', kill_dom0, "reboot machine gracefully");
- add_key_handler('R', halt_machine, "reboot machine ungracefully");
+ add_key_handler('r', dump_runq, "dump run queues");
+ add_key_handler('B', kill_dom0, "reboot machine gracefully");
+ add_key_handler('R', halt_machine, "reboot machine ungracefully");
return;
}
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 787b43d900..ce46069167 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -11,7 +11,8 @@
*
* Environment: Xen Hypervisor
* Description: CPU scheduling
- * partially moved from domain.c
+ * implements A Borrowed Virtual Time scheduler.
+ * (see Duda & Cheriton SOSP'99)
*
****************************************************************************
* $Id: c-insert.c,v 1.7 2002/11/08 16:04:34 rn Exp $
@@ -28,6 +29,9 @@
#include <xeno/ac_timer.h>
#include <xeno/interrupt.h>
+#include <xeno/perfc.h>
+
+
#undef SCHEDULER_TRACE
#ifdef SCHEDULER_TRACE
#define TRC(_x) _x
@@ -35,80 +39,106 @@
#define TRC(_x)
#endif
-/*
+
+#define MCU (s32)MICROSECS(100) /* Minimum unit */
+static s32 ctx_allow=(s32)MILLISECS(10); /* context switch allowance */
+
+/*****************************************************************************
* per CPU data for the scheduler.
- */
+ *****************************************************************************/
typedef struct schedule_data_st
{
- spinlock_t lock;
- struct list_head runqueue;
- struct task_struct *prev, *curr;
+ spinlock_t lock; /* lock for protecting this */
+ struct list_head runqueue; /* runqueue */
+ struct task_struct *prev, *curr; /* previous and current task */
+ struct task_struct *idle; /* idle task for this cpu */
+ u32 svt; /* system virtual time. per CPU??? */
+ struct ac_timer s_timer; /* scheduling timer */
+
} __cacheline_aligned schedule_data_t;
schedule_data_t schedule_data[NR_CPUS];
-static __cacheline_aligned struct ac_timer s_timer[NR_CPUS];
+struct ac_timer v_timer; /* scheduling timer */
+static void virt_timer(unsigned long foo);
-/*
- * Some convenience functions
- */
-static inline void __add_to_runqueue(struct task_struct * p)
+/*****************************************************************************
+ * Some convenience functions
+ *****************************************************************************/
+/* add a task to the head of the runqueue */
+static inline void __add_to_runqueue_head(struct task_struct * p)
{
+
list_add(&p->run_list, &schedule_data[p->processor].runqueue);
}
-
-static inline void __move_last_runqueue(struct task_struct * p)
+/* add a task to the tail of the runqueue */
+static inline void __add_to_runqueue_tail(struct task_struct * p)
{
- list_del(&p->run_list);
list_add_tail(&p->run_list, &schedule_data[p->processor].runqueue);
}
-static inline void __move_first_runqueue(struct task_struct * p)
-{
- list_del(&p->run_list);
- list_add(&p->run_list, &schedule_data[p->processor].runqueue);
-}
-
+/* remove a task from runqueue */
static inline void __del_from_runqueue(struct task_struct * p)
{
list_del(&p->run_list);
p->run_list.next = NULL;
}
-
+/* is task on run queue? */
static inline int __task_on_runqueue(struct task_struct *p)
{
return (p->run_list.next != NULL);
}
+#define next_domain(p) \
+ list_entry((p)->run_list.next, struct task_struct, run_list)
-/*
- * Add a new domain to the scheduler
- */
+/******************************************************************************
+* Add and remove a domain
+******************************************************************************/
void sched_add_domain(struct task_struct *p)
{
- p->state = TASK_UNINTERRUPTIBLE;
+ p->state = TASK_UNINTERRUPTIBLE;
+ p->mcu_advance = 10;
+
+ if (p->domain == IDLE_DOMAIN_ID) {
+ p->avt = 0xffffffff;
+ p->evt = 0xffffffff;
+ schedule_data[p->processor].idle = p;
+ } else {
+ /* set avt end evt to system virtual time */
+ p->avt = schedule_data[p->processor].svt;
+ p->evt = schedule_data[p->processor].svt;
+ /* RN: XXX BVT fill in other bits */
+ }
}
-/*
- * Remove domain to the scheduler
- */
void sched_rem_domain(struct task_struct *p)
{
p->state = TASK_DYING;
}
-/*
+/****************************************************************************
* wake up a domain which had been sleeping
- */
+ ****************************************************************************/
int wake_up(struct task_struct *p)
{
unsigned long flags;
int ret = 0;
+
spin_lock_irqsave(&schedule_data[p->processor].lock, flags);
+
if ( __task_on_runqueue(p) ) goto out;
+
p->state = TASK_RUNNING;
- __add_to_runqueue(p);
+ __add_to_runqueue_head(p);
+
+ /* set the BVT parameters */
+ if (p->avt < schedule_data[p->processor].svt)
+ p->avt = schedule_data[p->processor].svt;
+
+ p->evt = p->avt; /* RN: XXX BVT deal with warping here */
+
ret = 1;
out:
@@ -116,75 +146,57 @@ int wake_up(struct task_struct *p)
return ret;
}
-static void process_timeout(unsigned long __data)
+/****************************************************************************
+ * Domain requested scheduling operations
+ ****************************************************************************/
+long do_sched_op(void)
{
- struct task_struct * p = (struct task_struct *) __data;
- wake_up(p);
+ /* XXX implement proper */
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ return 0;
}
-long schedule_timeout(long timeout)
+/****************************************************************************
+ * Control the scheduler
+ ****************************************************************************/
+long sched_bvtctl(unsigned long c_allow)
{
- struct timer_list timer;
- unsigned long expire;
-
- switch (timeout)
- {
- case MAX_SCHEDULE_TIMEOUT:
- /*
- * These two special cases are useful to be comfortable in the caller.
- * Nothing more. We could take MAX_SCHEDULE_TIMEOUT from one of the
- * negative value but I' d like to return a valid offset (>=0) to allow
- * the caller to do everything it want with the retval.
- */
- schedule();
- goto out;
- default:
- /*
- * Another bit of PARANOID. Note that the retval will be 0 since no
- * piece of kernel is supposed to do a check for a negative retval of
- * schedule_timeout() (since it should never happens anyway). You just
- * have the printk() that will tell you if something is gone wrong and
- * where.
- */
- if (timeout < 0)
- {
- printk(KERN_ERR "schedule_timeout: wrong timeout "
- "value %lx from %p\n", timeout,
- __builtin_return_address(0));
- current->state = TASK_RUNNING;
- goto out;
- }
- }
-
- expire = timeout + jiffies;
-
- init_timer(&timer);
- timer.expires = expire;
- timer.data = (unsigned long) current;
- timer.function = process_timeout;
-
- add_timer(&timer);
- schedule();
- del_timer_sync(&timer);
-
- timeout = expire - jiffies;
-
- out:
- return timeout < 0 ? 0 : timeout;
+ printk("sched: bvtctl %lu\n", c_allow);
+ ctx_allow = c_allow;
+ return 0;
}
-/* RN: XXX turn this into do_halt() */
-/*
- * yield the current process
- */
-long do_sched_op(void)
+/****************************************************************************
+ * Adjust scheduling parameter for a given domain
+ ****************************************************************************/
+long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp,
+ unsigned long warpl, unsigned long warpu)
{
- current->state = TASK_INTERRUPTIBLE;
- schedule();
+ struct task_struct *p;
+
+ printk("sched: adjdom %02d %lu %lu %lu %lu\n",
+ dom, mcu_adv, warp, warpl, warpu);
+
+ p = find_domain_by_id(dom);
+ if ( p == NULL ) return -ESRCH;
+
+ spin_lock_irq(&schedule_data[p->processor].lock);
+
+ p->mcu_advance = mcu_adv;
+
+ spin_unlock_irq(&schedule_data[p->processor].lock);
+
return 0;
}
-
+/****************************************************************************
+ * cause a run through the scheduler when appropriate
+ * Appropriate is:
+ * - current task is idle task
+ * - the new process's evt is lower than the current one's
+ * - the current task already ran for its context switch allowance
+ ****************************************************************************/
void reschedule(struct task_struct *p)
{
int cpu = p->processor;
@@ -192,16 +204,20 @@ void reschedule(struct task_struct *p)
unsigned long flags;
if (p->has_cpu)
- return;
+ return;
spin_lock_irqsave(&schedule_data[cpu].lock, flags);
curr = schedule_data[cpu].curr;
- if (is_idle_task(curr)) {
+
+ if ( is_idle_task(curr) ||
+ (p->evt < curr->evt) ||
+ (curr->lastschd + ctx_allow >= NOW()) ) {
+ /* reschedule */
set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
spin_unlock_irqrestore(&schedule_data[cpu].lock, flags);
#ifdef CONFIG_SMP
if (cpu != smp_processor_id())
- smp_send_event_check_cpu(cpu);
+ smp_send_event_check_cpu(cpu);
#endif
} else {
spin_unlock_irqrestore(&schedule_data[cpu].lock, flags);
@@ -209,47 +225,154 @@ void reschedule(struct task_struct *p)
}
-/*
- * Pick the next domain to run
- */
-
+/****************************************************************************
+ * The main function
+ * - deschedule the current domain.
+ * - pick a new domain.
+ * i.e., the domain with lowest EVT.
+ * The runqueue should be ordered by EVT so that this is easy.
+ ****************************************************************************/
asmlinkage void schedule(void)
{
- struct task_struct *prev, *next, *p;
- struct list_head *tmp;
- int this_cpu;
-
+ struct task_struct *prev, *next, *next_prime, *p;
+ struct list_head *tmp;
+ int this_cpu;
+ s_time_t now;
+ s32 r_time; /* time for new dom to run */
+ s32 ranfor; /* assume we never run longer than 2.1s! */
+ s32 mcus;
+ u32 next_evt, next_prime_evt, min_avt;
+
+ perfc_incrc(sched_run1);
need_resched_back:
+ perfc_incrc(sched_run2);
+
+ now = NOW();
+ next = NULL;
prev = current;
this_cpu = prev->processor;
+ /* remove timer */
+ rem_ac_timer(&schedule_data[this_cpu].s_timer);
+
+ /*
+ * deschedule the current domain
+ */
+
spin_lock_irq(&schedule_data[this_cpu].lock);
ASSERT(!in_interrupt());
ASSERT(__task_on_runqueue(prev));
- __move_last_runqueue(prev);
+ if (is_idle_task(prev))
+ goto deschedule_done;
- switch ( prev->state )
- {
+ /* do some accounting */
+ ranfor = (s32)(now - prev->lastschd);
+ ASSERT((ranfor>0));
+ prev->cpu_time += ranfor;
+
+ /* calculate mcu and update avt */
+ mcus = ranfor/MCU;
+ if (ranfor % MCU) mcus ++; /* always round up */
+ prev->avt += mcus * prev->mcu_advance;
+ prev->evt = prev->avt; /* RN: XXX BVT deal with warping here */
+
+ /* dequeue */
+ __del_from_runqueue(prev);
+ switch (prev->state) {
case TASK_INTERRUPTIBLE:
- if ( signal_pending(prev) )
- {
- prev->state = TASK_RUNNING;
+ if (signal_pending(prev)) {
+ prev->state = TASK_RUNNING; /* but has events pending */
break;
}
+ case TASK_UNINTERRUPTIBLE:
+ case TASK_WAIT:
+ case TASK_DYING:
default:
- __del_from_runqueue(prev);
+ /* done if not running. Else, continue */
+ goto deschedule_done;
case TASK_RUNNING:;
}
+
+ /* requeue */
+ __add_to_runqueue_tail(prev);
+
+
+ deschedule_done:
clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events);
- next = NULL;
- list_for_each(tmp, &schedule_data[smp_processor_id()].runqueue) {
+ /*
+ * Pick a new domain
+ */
+
+ /* we should at least have the idle task */
+ ASSERT(!list_empty(&schedule_data[this_cpu].runqueue));
+
+ /*
+ * scan through the run queue and pick the task with the lowest evt
+ * *and* the task the second lowest evt.
+ * this code is O(n) but we expect n to be small.
+ */
+ next = schedule_data[this_cpu].idle;
+ next_prime = NULL;
+
+ next_evt = 0xffffffff;
+ next_prime_evt = 0xffffffff;
+ min_avt = 0xffffffff; /* to calculate svt */
+
+
+ list_for_each(tmp, &schedule_data[this_cpu].runqueue) {
p = list_entry(tmp, struct task_struct, run_list);
- next = p;
- if ( !is_idle_task(next) ) break;
+ if (p->evt < next_evt) {
+ next_prime = next;
+ next_prime_evt = next_evt;
+ next = p;
+ next_evt = p->evt;
+ } else if (next_prime_evt == 0xffffffff) {
+ next_prime_evt = p->evt;
+ next_prime = p;
+ } else if (p->evt < next_prime_evt) {
+ next_prime_evt = p->evt;
+ next_prime = p;
+ }
+ /* determine system virtual time */
+ if (p->avt < min_avt)
+ min_avt = p->avt;
}
+ ASSERT(next != NULL); /* we should have at least the idle task */
+
+ /* update system virtual time */
+ if (min_avt != 0xffffffff) schedule_data[this_cpu].svt = min_avt;
+
+ if (is_idle_task(next)) {
+ r_time = ctx_allow;
+ goto sched_done;
+ }
+
+ if (next_prime == NULL || is_idle_task(next_prime)) {
+ /* we have only one runnable task besides the idle task */
+ r_time = 10 * ctx_allow; /* RN: random constant */
+ goto sched_done;
+ }
+
+ /*
+ * if we are here we have two runable tasks.
+ * work out how long 'next' can run till its evt is greater than
+ * 'next_prime's evt. Taking context switch allowance into account.
+ */
+ ASSERT(next_prime->evt > next->evt);
+ r_time = ((next_prime->evt - next->evt)/next->mcu_advance) + ctx_allow;
+
+ sched_done:
+ ASSERT(r_time != 0);
+ ASSERT(r_time > ctx_allow);
+
+ if ( (r_time==0) || (r_time < ctx_allow)) {
+ printk("[%02d]: %lx\n", this_cpu, r_time);
+ dump_rqueue(&schedule_data[this_cpu].runqueue, "foo");
+ }
+
prev->has_cpu = 0;
next->has_cpu = 1;
@@ -257,6 +380,17 @@ asmlinkage void schedule(void)
schedule_data[this_cpu].prev = prev;
schedule_data[this_cpu].curr = next;
+ next->lastschd = now;
+
+ /* reprogramm the timer */
+ timer_redo:
+ schedule_data[this_cpu].s_timer.expires = now + r_time;
+ if (add_ac_timer(&schedule_data[this_cpu].s_timer) == 1) {
+ printk("SCHED[%02d]: Shit this shouldn't happen\n", this_cpu);
+ now = NOW();
+ goto timer_redo;
+ }
+
spin_unlock_irq(&schedule_data[this_cpu].lock);
if ( unlikely(prev == next) )
@@ -266,6 +400,8 @@ asmlinkage void schedule(void)
goto same_process;
}
+ perfc_incrc(sched_ctx);
+
prepare_to_switch();
switch_to(prev, next);
prev = schedule_data[this_cpu].prev;
@@ -274,67 +410,56 @@ asmlinkage void schedule(void)
if ( prev->state == TASK_DYING ) release_task(prev);
same_process:
+ /* update the domains notion of time */
update_dom_time(current->shared_info);
- if ( test_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events) )
+ if ( test_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events) ) {
goto need_resched_back;
+ }
return;
}
/*
- * The scheduling timer.
+ * The scheduler timer.
*/
-static __cacheline_aligned int count[NR_CPUS];
static void sched_timer(unsigned long foo)
{
- int cpu = smp_processor_id();
+ int cpu = smp_processor_id();
struct task_struct *curr = schedule_data[cpu].curr;
- s_time_t now;
- int res;
-
- /* reschedule after each 5 ticks */
- if (count[cpu] >= 5) {
- set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
- count[cpu] = 0;
- }
- count[cpu]++;
+ /* cause a reschedule */
+ set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
+ perfc_incrc(sched_irq);
+}
- /*
- * deliver virtual timer interrups to domains if we are CPU 0 XXX RN: We
- * don't have a per CPU list of domains yet. Otherwise would use that.
- * Plus, this should be removed anyway once Domains "know" about virtual
- * time and timeouts. But, it's better here then where it was before.
- */
- if (cpu == 0) {
- struct task_struct *p;
- unsigned long cpu_mask = 0;
-
- /* send virtual timer interrupt */
- read_lock(&tasklist_lock);
- p = &idle0_task;
- do {
- if ( is_idle_task(p) ) continue;
- cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
- }
- while ( (p = p->next_task) != &idle0_task );
- read_unlock(&tasklist_lock);
- guest_event_notify(cpu_mask);
+/*
+ * The Domain virtual time timer
+ */
+static void virt_timer(unsigned long foo)
+{
+ unsigned long cpu_mask = 0;
+ struct task_struct *p;
+ s_time_t now;
+ int res;
+
+ /* send virtual timer interrupt */
+ read_lock(&tasklist_lock);
+ p = &idle0_task;
+ do {
+ if ( is_idle_task(p) ) continue;
+ cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
}
+ while ( (p = p->next_task) != &idle0_task );
+ read_unlock(&tasklist_lock);
+ guest_event_notify(cpu_mask);
- again:
+ again:
now = NOW();
- s_timer[cpu].expires = now + MILLISECS(10);
- res=add_ac_timer(&s_timer[cpu]);
-
- TRC(printk("SCHED[%02d] timer(): now=0x%08X%08X timo=0x%08X%08X\n",
- cpu, (u32)(now>>32), (u32)now,
- (u32)(s_timer[cpu].expires>>32), (u32)s_timer[cpu].expires));
+ v_timer.expires = now + MILLISECS(10);
+ res=add_ac_timer(&v_timer);
if (res==1)
goto again;
-
}
-
/*
* Initialise the data structures
*/
@@ -350,11 +475,15 @@ void __init scheduler_init(void)
spin_lock_init(&schedule_data[i].lock);
schedule_data[i].prev = &idle0_task;
schedule_data[i].curr = &idle0_task;
-
+
/* a timer for each CPU */
- init_ac_timer(&s_timer[i]);
- s_timer[i].function = &sched_timer;
+ init_ac_timer(&schedule_data[i].s_timer);
+ schedule_data[i].s_timer.function = &sched_timer;
+
}
+ schedule_data[0].idle = &idle0_task; /* idle on CPU 0 is special */
+ init_ac_timer(&v_timer);
+ v_timer.function = &virt_timer;
}
/*
@@ -362,10 +491,121 @@ void __init scheduler_init(void)
* This has to be done *after* the timers, e.g., APICs, have been initialised
*/
void schedulers_start(void)
-{
+{
printk("Start schedulers\n");
__cli();
sched_timer(0);
+ virt_timer(0);
smp_call_function((void *)sched_timer, NULL, 1, 1);
__sti();
}
+
+
+/****************************************************************************
+ * Functions for legacy support.
+ * Schedule timeout is used at a number of places and is a bit meaningless
+ * in the context of Xen, as Domains are not able to call these and all
+ * their entry points into Xen should be asynchronous. If a domain wishes
+ * to block for a while it should use Xen's sched_op entry point.
+ ****************************************************************************/
+
+static void process_timeout(unsigned long __data)
+{
+ struct task_struct * p = (struct task_struct *) __data;
+ wake_up(p);
+}
+
+long schedule_timeout(long timeout)
+{
+ struct timer_list timer;
+ unsigned long expire;
+
+ switch (timeout)
+ {
+ case MAX_SCHEDULE_TIMEOUT:
+ /*
+ * These two special cases are useful to be comfortable in the caller.
+ * Nothing more. We could take MAX_SCHEDULE_TIMEOUT from one of the
+ * negative value but I' d like to return a valid offset (>=0) to allow
+ * the caller to do everything it want with the retval.
+ */
+ schedule();
+ goto out;
+ default:
+ /*
+ * Another bit of PARANOID. Note that the retval will be 0 since no
+ * piece of kernel is supposed to do a check for a negative retval of
+ * schedule_timeout() (since it should never happens anyway). You just
+ * have the printk() that will tell you if something is gone wrong and
+ * where.
+ */
+ if (timeout < 0)
+ {
+ printk(KERN_ERR "schedule_timeout: wrong timeout "
+ "value %lx from %p\n", timeout,
+ __builtin_return_address(0));
+ current->state = TASK_RUNNING;
+ goto out;
+ }
+ }
+
+ expire = timeout + jiffies;
+
+ init_timer(&timer);
+ timer.expires = expire;
+ timer.data = (unsigned long) current;
+ timer.function = process_timeout;
+
+ add_timer(&timer);
+ schedule();
+ del_timer_sync(&timer);
+
+ timeout = expire - jiffies;
+
+ out:
+ return timeout < 0 ? 0 : timeout;
+}
+
+/****************************************************************************
+ * debug function
+ ****************************************************************************/
+
+static void dump_rqueue(struct list_head *queue, char *name)
+{
+ struct list_head *list;
+ int loop = 0;
+ struct task_struct *p;
+
+ printk ("QUEUE %s %lx n: %lx, p: %lx\n", name, (unsigned long)queue,
+ (unsigned long) queue->next, (unsigned long) queue->prev);
+ list_for_each (list, queue) {
+ p = list_entry(list, struct task_struct, run_list);
+ printk("%3d: %3d has=%c mcua=0x%04X ev=0x%08X av=0x%08X c=0x%X%08X\n",
+ loop++, p->domain,
+ p->has_cpu ? 'T':'F',
+ p->mcu_advance, p->evt, p->avt,
+ (u32)(p->cpu_time>>32), (u32)p->cpu_time);
+ printk(" l: %lx n: %lx p: %lx\n",
+ (unsigned long)list, (unsigned long)list->next,
+ (unsigned long)list->prev);
+ }
+ return;
+}
+
+void dump_runq(u_char key, void *dev_id, struct pt_regs *regs)
+{
+ u_long flags;
+ s_time_t now = NOW();
+ int i;
+
+ printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns NOW=0x%08X%08X\n",
+ (u32)MCU, (u32)ctx_allow, (u32)(now>>32), (u32)now);
+ for (i = 0; i < smp_num_cpus; i++) {
+ spin_lock_irqsave(&schedule_data[i].lock, flags);
+ printk("CPU[%02d] svt=0x%08X ", i, (s32)schedule_data[i].svt);
+ dump_rqueue(&schedule_data[i].runqueue, "rq");
+ spin_unlock_irqrestore(&schedule_data[i].lock, flags);
+ }
+ return;
+}
+
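
The accounting in schedule() above is the core of the BVT policy: real run time is rounded up to whole MCUs, the domain's actual virtual time avt advances by mcus * mcu_advance (so a larger mcu_advance means a smaller CPU share), and the winner's timeslice r_time is the virtual-time gap to the runner-up converted back to real time, plus the context switch allowance. A standalone sketch of that arithmetic, with made-up numbers, purely for illustration:

/* Standalone sketch of the BVT accounting used in schedule() above.
 * Constants and field names mirror the patch; the numbers are made up. */
#include <stdio.h>
#include <stdint.h>

typedef int32_t s32;
typedef int64_t s_time_t;

#define MICROSECS(us) ((s_time_t)(us) * 1000LL)     /* ns */
#define MILLISECS(ms) ((s_time_t)(ms) * 1000000LL)  /* ns */
#define MCU ((s32)MICROSECS(100))                   /* minimum charging unit */

int main(void)
{
    s32 ctx_allow = (s32)MILLISECS(10);   /* context switch allowance */
    s32 ranfor    = (s32)MILLISECS(7);    /* how long 'prev' just ran */
    unsigned long mcu_advance = 10;       /* inverse of the domain's weight */

    /* charge prev: round elapsed real time up to whole MCUs */
    s32 mcus = ranfor / MCU;
    if (ranfor % MCU) mcus++;

    uint32_t avt = 1000;                  /* prev's actual virtual time */
    avt += mcus * mcu_advance;
    uint32_t evt = avt;                   /* no warping in this changeset */

    /* timeslice for the winner: run until it would overtake the runner-up */
    uint32_t next_evt = 1500, next_prime_evt = 1800;
    s32 r_time = (s32)((next_prime_evt - next_evt) / mcu_advance) + ctx_allow;

    printf("mcus=%d avt=%u evt=%u r_time=%dns\n",
           (int)mcus, (unsigned)avt, (unsigned)evt, (int)r_time);
    return 0;
}
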
diff --git a/xen/include/xeno/ac_timer.h b/xen/include/xeno/ac_timer.h
index 7cf568d2fc..280f377d17 100644
--- a/xen/include/xeno/ac_timer.h
+++ b/xen/include/xeno/ac_timer.h
@@ -43,10 +43,10 @@
*/
struct ac_timer {
- struct list_head timer_list;
- s_time_t expires; /* system time time out value */
- unsigned long data;
- void (*function)(unsigned long);
+ struct list_head timer_list;
+ s_time_t expires; /* system time time out value */
+ unsigned long data;
+ void (*function)(unsigned long);
};
/* interface for "clients" */
@@ -55,7 +55,7 @@ extern int rem_ac_timer(struct ac_timer *timer);
extern int mod_ac_timer(struct ac_timer *timer, s_time_t new_time);
static inline void init_ac_timer(struct ac_timer *timer)
{
- //timer->next = NULL;
+ timer->timer_list.next = NULL;
}
/* interface used by programmable timer, implemented hardware dependent */
diff --git a/xen/include/xeno/dom0_ops.h b/xen/include/xeno/dom0_ops.h
index 5e498de1bc..c0159d12cc 100644
--- a/xen/include/xeno/dom0_ops.h
+++ b/xen/include/xeno/dom0_ops.h
@@ -4,8 +4,11 @@
* Process command requests from domain-0 guest OS.
*
* Copyright (c) 2002, K A Fraser, B Dragovic
+ *
+ * MUST BE KEPT IN SYNC WITH xenolinux<*>/arch/xeno/drivers/dom0/dom0_ops.h
*/
+
#ifndef __DOM0_OPS_H__
#define __DOM0_OPS_H__
@@ -13,6 +16,8 @@
#define DOM0_KILLDOMAIN 1
#define DOM0_GETMEMLIST 2
#define DOM0_STARTDOM 4
+#define DOM0_BVTCTL 6
+#define DOM0_ADJUSTDOM 7
#define MAX_CMD_LEN 256
@@ -48,6 +53,20 @@ typedef struct domain_launch
char cmd_line[MAX_CMD_LEN];
} dom_meminfo_t;
+typedef struct dom0_bvtctl_st
+{
+ unsigned long ctx_allow; /* context switch allowance */
+} dom0_bvtctl_t;
+
+typedef struct dom0_adjustdom_st
+{
+ unsigned int domain; /* domain id */
+ unsigned long mcu_adv; /* mcu advance: inverse of weight */
+ unsigned long warp; /* time warp */
+ unsigned long warpl; /* warp limit */
+ unsigned long warpu; /* unwarp time requirement */
+} dom0_adjustdom_t;
+
typedef struct dom0_op_st
{
unsigned long cmd;
@@ -56,6 +75,8 @@ typedef struct dom0_op_st
dom0_newdomain_t newdomain;
dom0_killdomain_t killdomain;
dom0_getmemlist_t getmemlist;
+ dom0_bvtctl_t bvtctl;
+ dom0_adjustdom_t adjustdom;
dom_meminfo_t meminfo;
}
u;
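
On the Xen side these requests are dispatched by do_dom0_op() (see xen/common/dom0_ops.c above); on the XenoLinux side they are packed into a dom0_op_t and passed down with HYPERVISOR_dom0_op(), as sched_ops.c does further below. A hypothetical direct caller, sketched here only to show how the two new union members are filled in (the HYPERVISOR_dom0_op prototype is assumed):

/* Sketch, not part of the patch: fill in the new scheduler ops directly. */
#include "dom0_ops.h"

extern int HYPERVISOR_dom0_op(dom0_op_t *op);   /* hypercall wrapper (assumed) */

static int set_ctx_allow(unsigned long allow)
{
    dom0_op_t op;
    op.cmd = DOM0_BVTCTL;
    op.u.bvtctl.ctx_allow = allow;      /* context switch allowance */
    return HYPERVISOR_dom0_op(&op);
}

static int set_dom_weight(unsigned int dom, unsigned long mcu_adv)
{
    dom0_op_t op;
    op.cmd = DOM0_ADJUSTDOM;
    op.u.adjustdom.domain  = dom;
    op.u.adjustdom.mcu_adv = mcu_adv;   /* inverse of the domain's weight */
    op.u.adjustdom.warp    = 0;         /* warping is not used by this patch */
    op.u.adjustdom.warpl   = 0;
    op.u.adjustdom.warpu   = 0;
    return HYPERVISOR_dom0_op(&op);
}
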
diff --git a/xen/include/xeno/perfc_defn.h b/xen/include/xeno/perfc_defn.h
index 41dd48f73b..f006079085 100644
--- a/xen/include/xeno/perfc_defn.h
+++ b/xen/include/xeno/perfc_defn.h
@@ -1,10 +1,13 @@
-
PERFCOUNTER_CPU( irqs, "#interrupts" )
PERFCOUNTER_CPU( irq_time, "cycles spent in irq handler" )
-PERFCOUNTER( blockio_tx, "block io: messages received from tx queue" )
-PERFCOUNTER( blockio_rx, "block io: messages sent on rx queue" )
+PERFCOUNTER( blockio_tx, "block io: messages received from tx queue" )
+PERFCOUNTER( blockio_rx, "block io: messages sent on rx queue" )
-PERFCOUNTER_CPU( apic_timer, "apic timer interrupts" )
-PERFCOUNTER_CPU( ac_timer_max, "ac_timer max error" )
+PERFCOUNTER_CPU( apic_timer, "apic timer interrupts" )
+PERFCOUNTER_CPU( ac_timer_max, "ac_timer max error (ns)" )
+PERFCOUNTER_CPU( sched_irq, "sched: timer" )
+PERFCOUNTER_CPU( sched_run1, "sched: calls to schedule" )
+PERFCOUNTER_CPU( sched_run2, "sched: runs through scheduler" )
+PERFCOUNTER_CPU( sched_ctx, "sched: context switches" )
diff --git a/xen/include/xeno/sched.h b/xen/include/xeno/sched.h
index 6d1842a2ea..dbbf6a927e 100644
--- a/xen/include/xeno/sched.h
+++ b/xen/include/xeno/sched.h
@@ -12,6 +12,10 @@
#include <hypervisor-ifs/hypervisor-if.h>
#include <xeno/dom0_ops.h>
+#include <xeno/list.h>
+#include <xeno/time.h>
+#include <xeno/ac_timer.h>
+
extern unsigned long volatile jiffies;
extern rwlock_t tasklist_lock;
@@ -59,18 +63,48 @@ extern struct mm_struct init_mm;
struct task_struct {
- int processor;
- int state;
- int hyp_events;
- unsigned int domain;
+ /*
+ * DO NOT CHANGE THE ORDER OF THE FOLLOWING.
+ * Their offsets are hardcoded in entry.S
+ */
+
+ int processor; /* 00: current processor */
+ int state; /* 04: current run state */
+ int hyp_events; /* 08: pending events */
+ unsigned int domain; /* 12: domain id */
/* An unsafe pointer into a shared data area. */
- shared_info_t *shared_info;
+ shared_info_t *shared_info; /* 16: shared data area */
+
+ /*
+ * From here on things can be added and shuffled without special attention
+ */
struct list_head pg_head;
unsigned int tot_pages; /* number of pages currently possesed */
unsigned int max_pages; /* max number of pages that can be possesed */
+ /* scheduling */
+ struct list_head run_list; /* the run list */
+ int has_cpu;
+ int policy;
+ int counter;
+
+ struct ac_timer blt; /* blocked timeout */
+
+ s_time_t lastschd; /* time this domain was last scheduled */
+ s_time_t cpu_time; /* total CPU time received till now */
+
+ unsigned long mcu_advance; /* inverse of weight */
+ s32 avt; /* actual virtual time */
+ s32 evt; /* effective virtual time */
+ long warp; /* virtual time warp */
+ long warpl; /* warp limit */
+ long warpu; /* unwarp time requirement */
+ long warped; /* time it ran warped last time */
+ long uwarped; /* time it ran unwarped last time */
+
+
/* Network I/O */
net_ring_t *net_ring_base;
net_vif_t *net_vif_list[MAX_GUEST_VIFS];
@@ -85,10 +119,7 @@ struct task_struct {
segment_t *segment_list[XEN_MAX_SEGMENTS]; /* vhd */
int segment_count;
- int has_cpu, policy, counter;
-
- struct list_head run_list;
-
+ /* VM */
struct mm_struct mm;
/* We need this lock to check page types and frob reference counts. */
spinlock_t page_lock;
@@ -127,7 +158,7 @@ struct task_struct {
#define TASK_RUNNING 0
#define TASK_INTERRUPTIBLE 1
#define TASK_UNINTERRUPTIBLE 2
-#define TASK_WAIT 4
+#define TASK_WAIT 4
#define TASK_DYING 16
/* #define TASK_STOPPED 8 not really used */
@@ -141,6 +172,8 @@ struct task_struct {
domain: IDLE_DOMAIN_ID, \
state: TASK_RUNNING, \
has_cpu: 0, \
+ evt: 0xffffffff, \
+ avt: 0xffffffff, \
mm: IDLE0_MM, \
addr_limit: KERNEL_DS, \
active_mm: &idle0_task.mm, \
@@ -153,7 +186,7 @@ struct task_struct {
#define is_idle_task(_p) ((_p)->domain == IDLE_DOMAIN_ID)
#ifndef IDLE0_TASK_SIZE
-#define IDLE0_TASK_SIZE 2048*sizeof(long)
+#define IDLE0_TASK_SIZE 2048*sizeof(long)
#endif
union task_union {
@@ -202,6 +235,9 @@ void scheduler_init(void);
void schedulers_start(void);
void sched_add_domain(struct task_struct *p);
void sched_rem_domain(struct task_struct *p);
+long sched_bvtctl(unsigned long ctx_allow);
+long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp,
+ unsigned long warpl, unsigned long warpu);
int wake_up(struct task_struct *p);
long schedule_timeout(long timeout);
long do_yield(void);
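
The warp, warpl, warpu, warped and uwarped fields are placeholders in this changeset: sched_add_domain() and wake_up() both carry "RN: XXX BVT deal with warping here" and evt is simply set equal to avt. For reference, in Duda & Cheriton's BVT a warping (latency-sensitive) domain is scheduled on an effective virtual time that lags its actual virtual time by its warp value, roughly as in this hypothetical helper:

/* Sketch of the BVT warping rule (not implemented by this patch):
 * while a domain is warping, its effective virtual time is its actual
 * virtual time minus its warp, subject to the warpl/warpu time limits. */
static inline s32 calc_evt(struct task_struct *p, int warping)
{
    return warping ? (s32)(p->avt - p->warp) : (s32)p->avt;
}
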
diff --git a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/Makefile b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/Makefile
index 4738fc0ba4..eeb3413842 100644
--- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/Makefile
+++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/Makefile
@@ -1,3 +1,3 @@
O_TARGET := dom0.o
-obj-y := dom0_memory.o dom0_core.o vfr.o
+obj-y := dom0_memory.o dom0_core.o vfr.o sched_ops.o
include $(TOPDIR)/Rules.make
diff --git a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h
index 6c60a93ff6..22ebd7aba0 100644
--- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h
+++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/dom0_ops.h
@@ -4,15 +4,20 @@
* Process command requests from domain-0 guest OS.
*
* Copyright (c) 2002, K A Fraser, B Dragovic
+ *
+ * MUST BE KEPT IN SYNC WITH xen/include/xeno/dom0_ops.h
+ * MUST BE KEPT IN SYNC WITH tools/domain_builder/dom0_ops.h
*/
#define DOM0_NEWDOMAIN 0
#define DOM0_KILLDOMAIN 1
#define DOM0_GETMEMLIST 2
#define DOM0_STARTDOM 4
-#define MAP_DOM_MEM 6 /* Not passed down to Xen */
-#define DO_PGUPDATES 7 /* Not passed down to Xen */
-#define MAX_CMD 8
+#define DOM0_BVTCTL 6
+#define DOM0_ADJUSTDOM 7
+#define MAP_DOM_MEM 8 /* Not passed down to Xen */
+#define DO_PGUPDATES 9 /* Not passed down to Xen */
+#define MAX_CMD 10
#define MAX_CMD_LEN 256
@@ -20,8 +25,8 @@ typedef struct dom0_newdomain_st
{
unsigned int domain;
unsigned int memory_kb;
- unsigned int num_vifs; // temporary
- unsigned long pg_head; // return parameter
+ unsigned int num_vifs; /* temporary */
+ unsigned long pg_head; /* return parameter */
} dom0_newdomain_t;
typedef struct dom0_killdomain_st
@@ -37,6 +42,20 @@ typedef struct dom0_getmemlist_st
void *buffer;
} dom0_getmemlist_t;
+typedef struct dom0_bvtctl_st
+{
+ unsigned long ctx_allow; /* context switch allowance */
+} dom0_bvtctl_t;
+
+typedef struct dom0_adjustdom_st
+{
+ unsigned int domain; /* domain id */
+ unsigned long mcu_adv; /* mcu advance: inverse of weight */
+ unsigned long warp; /* time warp */
+ unsigned long warpl; /* warp limit */
+ unsigned long warpu; /* unwarp time requirement */
+} dom0_adjustdom_t;
+
/* This is entirely processed by XenoLinux */
typedef struct dom_mem
{
@@ -64,6 +83,8 @@ typedef struct domain_launch
char cmd_line[MAX_CMD_LEN];
} dom_meminfo_t;
+
+
typedef struct dom0_op_st
{
unsigned long cmd;
@@ -72,6 +93,8 @@ typedef struct dom0_op_st
dom0_newdomain_t newdomain;
dom0_killdomain_t killdomain;
dom0_getmemlist_t getmemlist;
+ dom0_bvtctl_t bvtctl;
+ dom0_adjustdom_t adjustdom;
dom_mem_t dommem;
dom_pgupdate_t pgupdate;
dom_meminfo_t meminfo;
diff --git a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/sched_ops.c b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/sched_ops.c
new file mode 100644
index 0000000000..9c5fce7857
--- /dev/null
+++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/dom0/sched_ops.c
@@ -0,0 +1,137 @@
+/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
+ ****************************************************************************
+ * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
+ ****************************************************************************
+ *
+ * File: sched_ops.c
+ * Author: Rolf Neugebauer (neugebar@dcs.gla.ac.uk)
+ * Changes:
+ *
+ * Date: Mar 2003
+ *
+ * Environment: XenoLinux
+ * Description: Dom0 Control interface to scheduler in Xen
+ *
+ * code based on Andy's vfr parsing code
+ *
+ * Commands understood by the interface:
+ *
+ * C <context switch allowance>
+ * S <did> <mcu advance> <warp> <warp limit> <unwarp limit>
+ *
+ ****************************************************************************
+ * $Id: c-insert.c,v 1.7 2002/11/08 16:04:34 rn Exp $
+ ****************************************************************************
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+
+#include "dom0_ops.h"
+
+#define SCHED_ENTRY "sched"
+extern struct proc_dir_entry *xeno_base;
+static struct proc_dir_entry *sched_pde;
+
+static unsigned char readbuf[1024];
+
+static int sched_read_proc(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ strcpy(page, readbuf);
+ *readbuf = '\0';
+ *eof = 1;
+ *start = page;
+ return strlen(page);
+}
+
+
+static int sched_write_proc(struct file *file, const char *buffer,
+ u_long count, void *data)
+{
+ dom0_op_t op;
+
+ int ret, len;
+ int ts, te, tl; /* token start, end, and length */
+
+ /* Only admin can adjust scheduling parameters */
+ if ( !capable(CAP_SYS_ADMIN) )
+ return -EPERM;
+
+ /* parse the commands */
+ len = count;
+ ts = te = 0;
+
+ while ( count && isspace(buffer[ts]) ) { ts++; count--; } /*skip spaces*/
+ te = ts;
+ while ( count && !isspace(buffer[te]) ) { te++; count--; } /*command end*/
+ if ( te <= ts ) goto bad;
+ tl = te - ts;
+
+ if ( strncmp(&buffer[ts], "C", tl) == 0 ) {
+ op.cmd = DOM0_BVTCTL;
+ } else if ( strncmp(&buffer[ts], "S", tl) == 0 ) {
+ op.cmd = DOM0_ADJUSTDOM;
+ } else
+ goto bad;
+
+ /* skip whitespace and get first parameter */
+ ts = te; while ( count && isspace(buffer[ts]) ) { ts++; count--; }
+ te = ts; while ( count && !isspace(buffer[te]) ) { te++; count--; }
+ if ( te <= ts ) goto bad;
+ tl = te - ts;
+ if ( !isdigit(buffer[ts]) ) goto bad;
+
+ if (op.cmd == DOM0_BVTCTL) {
+ /* get context switch allowance */
+ sscanf(&buffer[ts], "%lu", &op.u.bvtctl.ctx_allow);
+ } else if (op.cmd == DOM0_ADJUSTDOM) {
+ sscanf(&buffer[ts], "%u %lu %lu %lu %lu",
+ &op.u.adjustdom.domain,
+ &op.u.adjustdom.mcu_adv,
+ &op.u.adjustdom.warp,
+ &op.u.adjustdom.warpl,
+ &op.u.adjustdom.warpu);
+ }
+ ret = HYPERVISOR_dom0_op(&op);
+ return sizeof(op);
+
+ bad:
+ return -EINVAL;
+
+}
+
+
+/*
+ * main scheduler interface driver initialization function.
+ */
+static int __init init_module(void)
+{
+ printk(KERN_ALERT "Starting Domain Scheduler Control Interface\n");
+
+ sched_pde = create_proc_entry(SCHED_ENTRY, 0600, xeno_base);
+ if ( sched_pde == NULL )
+ {
+ printk(KERN_ALERT "Unable to create dom scheduler proc entry!");
+ return -1;
+ }
+
+ sched_pde->read_proc = sched_read_proc;
+ sched_pde->write_proc = sched_write_proc;
+
+ return 0;
+}
+
+static void __exit cleanup_module(void)
+{
+}
+
+module_init(init_module);
+module_exit(cleanup_module);
+
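
With the module loaded, scheduling parameters can be changed from dom0 user space by writing the "C" and "S" commands described in the header comment to the new proc entry. A hypothetical user-space snippet (the proc path /proc/xeno/sched is assumed from xeno_base plus SCHED_ENTRY):

/* Sketch only: set the context switch allowance and adjust domain 1's
 * mcu_advance via the proc interface added above. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/proc/xeno/sched", O_WRONLY);   /* path assumed */
    if (fd < 0) { perror("open"); return 1; }

    const char *bvtctl = "C 10000000";   /* value passed straight to sched_bvtctl() */
    const char *adjdom = "S 1 10 0 0 0"; /* dom 1: mcu_adv=10, no warping */

    if (write(fd, bvtctl, strlen(bvtctl)) < 0) perror("write bvtctl");
    if (write(fd, adjdom, strlen(adjdom)) < 0) perror("write adjdom");

    close(fd);
    return 0;
}
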