author:    mwilli2@equilibrium.research.intel-research.net    2004-03-17 18:31:06 +0000
committer: mwilli2@equilibrium.research.intel-research.net    2004-03-17 18:31:06 +0000
commit     bee5b0bb130f42dabd8cbdcd035d8f737e725dbc (patch)
tree       99ac0cc05ceea17ead1d618190f88dfa33ea7f86
parent     8306baac6f817aea60eb6e7acfac96cbb007ed5a (diff)
download   xen-bee5b0bb130f42dabd8cbdcd035d8f737e725dbc.tar.gz
           xen-bee5b0bb130f42dabd8cbdcd035d8f737e725dbc.tar.bz2
           xen-bee5b0bb130f42dabd8cbdcd035d8f737e725dbc.zip
bitkeeper revision 1.808 (4058996anVCLQRr3o_Adf9GqJybYSg)
Various updates related to the new generic scheduler API. The BVT scheduler has been ported to this API and a simple Round Robin scheduler has been added. There's a new generic control interface for setting scheduling parameters from userspace. Use the sched=xxx option at boot time to choose the scheduler. Default is BVT. The possibilities are "bvt" and "rrobin".
-rw-r--r--  .rootkeys                                   6
-rw-r--r--  docs/interface.tex                        303
-rw-r--r--  tools/xc/lib/xc.h                           7
-rw-r--r--  tools/xc/lib/xc_atropos.c                  38
-rw-r--r--  tools/xc/lib/xc_bvtsched.c                 19
-rw-r--r--  tools/xc/lib/xc_private.h                   1
-rw-r--r--  tools/xc/lib/xc_rrobin.c                   20
-rw-r--r--  tools/xc/py/Xc.c                           71
-rw-r--r--  xen/common/dom0_ops.c                      16
-rw-r--r--  xen/common/domain.c                         3
-rw-r--r--  xen/common/kernel.c                         3
-rw-r--r--  xen/common/keyhandler.c                    10
-rw-r--r--  xen/common/sched_bvt.c                    427
-rw-r--r--  xen/common/sched_rrobin.c                  56
-rw-r--r--  xen/common/schedule.c                     450
-rw-r--r--  xen/include/hypervisor-ifs/dom0_ops.h      25
-rw-r--r--  xen/include/hypervisor-ifs/sched-ctl.h     68
-rw-r--r--  xen/include/xeno/sched-if.h                90
-rw-r--r--  xen/include/xeno/sched.h                   33
19 files changed, 1339 insertions, 307 deletions
diff --git a/.rootkeys b/.rootkeys
index b000ea2a90..d302f37329 100644
--- a/.rootkeys
+++ b/.rootkeys
@@ -71,6 +71,7 @@
3fbba6dbDfYvJSsw9500b4SZyUhxjQ tools/xc/lib/Makefile
3fbba6dc1uU7U3IFeF6A-XEOYF2MkQ tools/xc/lib/rpm.spec
3fbba6dcrNxtygEcgJYAJJ1gCQqfsA tools/xc/lib/xc.h
+40589968oCfoUlXd460CjVAkBE8IBA tools/xc/lib/xc_atropos.c
3fbba6dbEVkVMX0JuDFzap9jeaucGA tools/xc/lib/xc_bvtsched.c
3fbba6dbasJQV-MVElDC0DGSHMiL5w tools/xc/lib/xc_domain.c
40278d99BLsfUv3qxv0I8C1sClZ0ow tools/xc/lib/xc_elf.h
@@ -83,6 +84,7 @@
4051bce6CHAsYh8P5t2OHDtRWOP9og tools/xc/lib/xc_physdev.c
3fbba6dctWRWlFJkYb6hdix2X4WMuw tools/xc/lib/xc_private.c
3fbba6dcbVrG2hPzEzwdeV_UC8kydQ tools/xc/lib/xc_private.h
+40589968UQFnJeOMn8UIFLbXBuwXjw tools/xc/lib/xc_rrobin.c
3fbba6dcoGq9hQlksrBUfC2P5F6sGg tools/xc/lib/xc_vbd.c
3fbba6dc38q-ioRlwSR_quw4G3qUeQ tools/xc/lib/xc_vif.c
3fbd0a3dTwnDcfdw0-v46dPbX98zDw tools/xc/py/Makefile
@@ -169,6 +171,8 @@
4051bcecFeq4DE70p4zGO5setf47CA xen/common/physdev.c
4006e659i9j-doVxY7DKOGU4XVin1Q xen/common/rbtree.c
3ddb79bdHqdQpATqC0rmUZNbsb6L6A xen/common/resource.c
+40589968dD2D1aejwSOvrROg7fOvGQ xen/common/sched_bvt.c
+40589968be_t_n0-w6ggceW7h-sx0w xen/common/sched_rrobin.c
3e397e6619PgAfBbw2XFbXkewvUWgw xen/common/schedule.c
3ddb79bdB9RNMnkQnUyZ5C9hhMSQQw xen/common/slab.c
3ddb79bd0gVQYmL2zvuJnldvD0AGxQ xen/common/softirq.c
@@ -497,6 +501,7 @@
3ead095dE_VF-QA88rl_5cWYRWtRVQ xen/include/hypervisor-ifs/kbd.h
3ddb79c2oRPrzClk3zbTkRHlpumzKA xen/include/hypervisor-ifs/network.h
4051db79512nOCGweabrFWO2M2h5ng xen/include/hypervisor-ifs/physdev.h
+40589968wmhPmV5-ENbBYmMjnedgKw xen/include/hypervisor-ifs/sched-ctl.h
404f3d2eR2Owk-ZcGOx9ULGHg3nrww xen/include/hypervisor-ifs/trace.h
3f0d22cbroqp_BkoDPwkfRJhaw1LiQ xen/include/hypervisor-ifs/vbd.h
3ddb79c4qbCoOFHrv9sCGshbWzBVlQ xen/include/scsi/scsi.h
@@ -562,6 +567,7 @@
3ddb79c04nQVR3EYM5L4zxDV_MCo1g xen/include/xeno/prefetch.h
4006e65fWMwLqcocgik6wbF0Eeh0Og xen/include/xeno/rbtree.h
3e4540ccU1sgCx8seIMGlahmMfv7yQ xen/include/xeno/reboot.h
+40589969nPq3DMzv24RDb5LXE9brHw xen/include/xeno/sched-if.h
3ddb79c0LzqqS0LhAQ50ekgj4oGl7Q xen/include/xeno/sched.h
403a06a7H0hpHcKpAiDe5BPnaXWTlA xen/include/xeno/serial.h
3ddb79c0VDeD-Oft5eNfMneTU3D1dQ xen/include/xeno/skbuff.h
diff --git a/docs/interface.tex b/docs/interface.tex
index 2736a0412d..84003de1b6 100644
--- a/docs/interface.tex
+++ b/docs/interface.tex
@@ -353,7 +353,7 @@ create ``virtual disks'' on demand.
\subsection{Virtual Disk Management}
The VD management code consists of a set of python libraries. It can therefore
be accessed by custom scripts as well as the convenience scripts provided. The
-VD database is a SQLite database in /var/db/xen\_vdisk.sqlite.
+VD database is a SQLite database in /var/db/xen\_vdisks.sqlite.
The VD scripts and general VD usage are documented in the VBD-HOWTO.txt.
@@ -379,6 +379,307 @@ giving the page back to the hypervisor, or to use them for storing page tables.
and providing control interfaces for managing scheduling, networking, and
blocks.
+\chapter{CPU Scheduler}
+
+Xen offers a uniform API for CPU schedulers. It is possible to choose
+from a number of schedulers at boot and it should be easy to add more.
+
+\paragraph*{Note: SMP host support}
+Xen has always supported SMP host systems. Domains are statically assigned to
+CPUs, either at creation time or by manually pinning them to a particular CPU.
+The current schedulers then run locally on each CPU to decide which of the
+assigned domains should be run there.
+
+\section{Standard Schedulers}
+
+The BVT and Round Robin schedulers are part of the standard Xen
+distribution. A port of the Atropos scheduler from the Nemesis
+operating system is almost complete and will be added shortly.
+
+\subsection{Borrowed Virtual Time (BVT)}
+
+This was the original Xen scheduler. BVT is designed for general-purpose
+environments but also provides support for latency-sensitive threads. It
+provides long-term weighted sharing but allows tasks a limited ability to
+``warp back'' in virtual time so that they are dispatched earlier.
+
+BVT can be activated by specifying {\tt sched=bvt} as a boot argument to Xen.
+
+\subsection{Round Robin}
+
+The round robin scheduler is a very simple demonstration of the basic parts
+of the scheduler API.
+
+Round robin can be activated by specifying {\tt sched=rrobin} as a boot
+argument to Xen.
+
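+For illustration, a GRUB entry selecting the round robin scheduler might look
+as follows (a sketch only: the kernel image names, memory size and other
+arguments are assumptions and will vary between installations):
+
+\begin{verbatim}
+title Xen (round robin scheduler)
+    kernel /boot/xen.gz dom0_mem=131072 sched=rrobin
+    module /boot/xenolinux.gz root=/dev/sda1 console=tty0
+\end{verbatim}
+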
+\section{Scheduling API}
+
+The scheduling API is used by both the schedulers described above and should
+also be used by any new schedulers. It provides a generic interface and also
+implements much of the ``boilerplate'' code.
+
+\paragraph*{Note:} The scheduler API is still under active development, so
+there may be some further changes to it, although these are expected to be small.
+
+Schedulers conforming to this API are described by the following
+structure:
+
+\begin{verbatim}
+struct scheduler
+{
+ char *name; /* full name for this scheduler */
+ char *opt_name; /* option name for this scheduler */
+ unsigned int sched_id; /* ID for this scheduler */
+
+ int (*init_scheduler) ();
+ int (*alloc_task) (struct task_struct *);
+ void (*add_task) (struct task_struct *);
+ void (*free_task) (struct task_struct *);
+ void (*rem_task) (struct task_struct *);
+ void (*wake_up) (struct task_struct *);
+ long (*do_block) (struct task_struct *);
+ task_slice_t (*do_schedule) (s_time_t);
+ int (*control) (struct sched_ctl_cmd *);
+ int (*adjdom) (struct task_struct *,
+ struct sched_adjdom_cmd *);
+ s32 (*reschedule) (struct task_struct *);
+ void (*dump_settings) (void);
+ void (*dump_cpu_state) (int);
+ void (*dump_runq_el) (struct task_struct *);
+};
+\end{verbatim}
+
+The only method that {\em must} be implemented is
+{\tt do\_schedule()}. Note, however, that if the {\tt wake\_up()} method is
+not implemented then woken tasks will never be placed on the runqueue!
+
+The fields of the above structure are described in more detail below.
+
+\subsubsection{name}
+
+The name field is an arbitrary descriptive ASCII string.
+
+\subsubsection{opt\_name}
+
+This field is the value of the {\tt sched=} boot-time option that will select
+this scheduler.
+
+\subsubsection{sched\_id}
+
+This is an integer that uniquely identifies this scheduler. There should be a
+macro corresponding to this scheduler ID in {\tt <hypervisor-ifs/sched-ctl.h>}.
+
+\subsubsection{init\_scheduler}
+
+\paragraph*{Purpose}
+
+This is a function for performing any scheduler-specific initialisation. For
+instance, it might allocate memory for per-CPU scheduler data and initialise it
+appropriately.
+
+\paragraph*{Call environment}
+
+This function is called after the initialisation performed by the generic
+layer. The function is called exactly once, for the scheduler that has been
+selected.
+
+\paragraph*{Return values}
+
+This should return negative on failure --- failure to initialise the scheduler
+will cause an immediate panic.
+
+\subsubsection{alloc\_task}
+
+\paragraph*{Purpose}
+This is called when a {\tt task\_struct} is allocated by the generic scheduler
+layer. A particular scheduler implementation may use this method to allocate
+per-task data for this task. It may use the {\tt sched\_priv} pointer in the
+{\tt task\_struct} to point to this data.
+
+\paragraph*{Call environment}
+The generic layer guarantees that the {\tt sched\_priv} field will
+remain intact from the time this method is called until the task is
+deallocated (so long as the scheduler implementation does not change
+it!).
+
+\paragraph*{Return values}
+Negative on failure.
+
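+As an illustrative sketch (modelled on the BVT scheduler in this revision,
+with hypothetical names), an {\tt alloc\_task} implementation might simply
+allocate the private structure from a SLAB cache and attach it to the task:
+
+\begin{verbatim}
+static int my_alloc_task(struct task_struct *p)
+{
+    /* allocate per-task private data and hang it off sched_priv */
+    p->sched_priv = kmem_cache_alloc(dom_info_cache, GFP_KERNEL);
+
+    /* a negative return value reports failure to the generic layer */
+    return (p->sched_priv == NULL) ? -1 : 0;
+}
+\end{verbatim}
+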
+\subsubsection{add\_task}
+
+\paragraph*{Purpose}
+
+Called when a task is initially added by the generic layer.
+
+\paragraph*{Call environment}
+
+The fields in the {\tt task\_struct} are now filled out and available for use.
+Schedulers should implement appropriate initialisation of any per-task private
+information in this method.
+
+\subsubsection{free\_task}
+
+\paragraph*{Purpose}
+
+Schedulers should free the space used by any associated private data
+structures.
+
+\paragraph*{Call environment}
+
+This is called when a {\tt task\_struct} is about to be deallocated.
+The generic layer will have done generic task removal operations and
+(if implemented) called the scheduler's {\tt rem\_task} method before
+this method is called.
+
+\subsubsection{rem\_task}
+
+\paragraph*{Purpose}
+
+This is called when a task is being removed from scheduling.
+
+\subsubsection{wake\_up}
+
+\paragraph*{Purpose}
+
+Called when a task is woken up, this method should put the task on the runqueue
+(or do the scheduler-specific equivalent action).
+
+\paragraph*{Call environment}
+
+The generic layer guarantees that the task is already in state
+{\tt TASK\_RUNNING}.
+
+\subsubsection{do\_block}
+
+\paragraph*{Purpose}
+
+This function is called when a task is blocked. It should not remove the
+task from the runqueue.
+
+\paragraph*{Call environment}
+
+On entry to this method, the {\tt EVENTS\_MASTER\_ENABLE\_BIT} is already set
+and the task state has been changed to {\tt TASK\_INTERRUPTIBLE}.
+
+\subsubsection{do\_schedule}
+
+This method must be implemented.
+
+\paragraph*{Purpose}
+
+The method is called each time a new task must be chosen for scheduling on the
+current CPU. The current time is passed as the single argument (the current
+task can be found using the {\tt current} variable).
+
+This method should select the next task to run on this CPU and set its
+minimum time to run, as well as returning the data described below.
+
+This method should also take the appropriate action if the previous
+task has blocked, e.g. removing it from the runqueue.
+
+\paragraph*{Call environment}
+
+The other fields in the {\tt task\_struct} are updated by the generic layer,
+which also performs all Xen-specific tasks and performs the actual task switch
+(unless the previous task has been chosen again).
+
+This method is called with the {\tt schedule\_lock} held for the current CPU
+and with interrupts disabled.
+
+\paragraph*{Return values}
+
+Must return a {\tt struct task\_slice} describing which task to run next and
+the maximum length of time for which it should run.
+
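+As a minimal sketch (based on the round robin scheduler in this revision,
+with hypothetical names), a {\tt do\_schedule} implementation might requeue
+the previously-running task and return the head of the runqueue together with
+a fixed slice:
+
+\begin{verbatim}
+static task_slice_t my_do_schedule(s_time_t now)
+{
+    struct task_struct *prev = current;
+    int cpu = prev->processor;
+    task_slice_t ret;
+
+    __del_from_runqueue(prev);
+
+    /* still runnable? put it at the back of the queue */
+    if ( prev->state == TASK_RUNNING )
+        __add_to_runqueue_tail(prev);
+
+    ret.task = list_entry(schedule_data[cpu].runqueue.next,
+                          struct task_struct, run_list);
+    ret.time = MILLISECS(10); /* arbitrary fixed slice */
+
+    return ret;
+}
+\end{verbatim}
+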
+\subsubsection{control}
+
+\paragraph*{Purpose}
+
+This method is called for global scheduler control operations. It takes a
+pointer to a {\tt struct sched\_ctl\_cmd}, from which it should select the
+appropriate command data.
+
+\paragraph*{Call environment}
+
+The generic layer guarantees that when this method is called, the caller was
+using the same control interface version and that the caller selected the
+correct scheduler ID, hence the scheduler's implementation does not need to
+sanity-check these parts of the call.
+
+\paragraph*{Return values}
+
+This function should return the value to be passed back to user space, hence it
+should either be 0 or an appropriate errno value.
+
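+For example (mirroring the round robin scheduler in this revision, with
+hypothetical names), the {\tt control} method of a scheduler with a single
+global parameter can be very short:
+
+\begin{verbatim}
+static s_time_t my_slice = MILLISECS(10);
+
+static int my_ctl(struct sched_ctl_cmd *cmd)
+{
+    my_slice = cmd->u.rrobin.slice; /* select data from the union */
+    return 0;                       /* passed back to user space */
+}
+\end{verbatim}
+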
+\subsubsection{adjdom}
+
+\paragraph*{Purpose}
+
+This method is called to adjust the scheduling parameters of a particular
+domain.
+
+\paragraph*{Call environment}
+
+The generic layer guarantees that the caller has specified the correct
+control interface version and scheduler ID and that the supplied {\tt
+task\_struct} will not be deallocated during the call (hence it is not
+necessary to {\tt get\_task\_struct}).
+
+\paragraph*{Return values}
+
+This function should return the value to be passed back to user space, hence it
+should either be 0 or an appropriate errno value.
+
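+As a condensed sketch of the BVT scheduler's {\tt adjdom} method from this
+revision (the name is hypothetical), the parameters are taken from the
+appropriate union member and the private data is updated under the schedule
+lock:
+
+\begin{verbatim}
+static int my_adjdom(struct task_struct *p, struct sched_adjdom_cmd *cmd)
+{
+    struct bvt_adjdom *params = &cmd->u.bvt;
+
+    spin_lock_irq(&schedule_lock[p->processor]);
+    ((struct bvt_dom_info *)p->sched_priv)->mcu_advance = params->mcu_adv;
+    spin_unlock_irq(&schedule_lock[p->processor]);
+
+    return 0;
+}
+\end{verbatim}
+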
+\subsubsection{reschedule}
+
+\paragraph*{Purpose}
+
+This method is called to determine if a reschedule is required as a result of a
+particular task.
+
+\paragraph*{Call environment}
+The generic layer will itself cause a reschedule if the current domain is the
+idle task or has already exceeded its minimum time slice. The generic layer
+guarantees that the task passed is not currently running but is on the
+runqueue.
+
+\paragraph*{Return values}
+
+Should return a mask of CPUs to cause a reschedule on.
+
+\subsubsection{dump\_settings}
+
+\paragraph*{Purpose}
+
+If implemented, this should dump any private global settings for this
+scheduler to the console.
+
+\paragraph*{Call environment}
+
+This function is called with interrupts enabled.
+
+\subsubsection{dump\_cpu\_state}
+
+\paragraph*{Purpose}
+
+This method should dump any private settings for the specified CPU.
+
+\paragraph*{Call environment}
+
+This function is called with interrupts disabled and the {\tt schedule\_lock}
+for the specified CPU held.
+
+\subsubsection{dump\_runq\_el}
+
+\paragraph*{Purpose}
+
+This method should dump any private settings for the specified task.
+
+\paragraph*{Call environment}
+
+This function is called with interrupts disabled and the {\tt schedule\_lock}
+for the task's CPU held.
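+
+Finally, a trivial scheduler conforming to this API can be declared by filling
+in only the methods it needs; unimplemented methods are left {\tt NULL} and
+skipped by the generic layer. This sketch mirrors the round robin declaration
+shipped in this revision (a real scheduler would register its own ID):
+
+\begin{verbatim}
+struct scheduler sched_example_def = {
+    .name        = "Example Scheduler",
+    .opt_name    = "example",        /* selected with sched=example */
+    .sched_id    = SCHED_RROBIN,     /* placeholder: define a new ID */
+
+    .wake_up     = __add_to_runqueue_head,
+    .do_schedule = my_do_schedule,   /* as sketched above */
+};
+\end{verbatim}
+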
\chapter{Debugging}
diff --git a/tools/xc/lib/xc.h b/tools/xc/lib/xc.h
index fd1494dc13..aba2906842 100644
--- a/tools/xc/lib/xc.h
+++ b/tools/xc/lib/xc.h
@@ -81,6 +81,13 @@ int xc_bvtsched_domain_set(int xc_handle,
unsigned long warpl,
unsigned long warpu);
+int xc_atropos_domain_set(int xc_handle,
+ u64 domid,
+ int xtratime);
+
+int xc_rrobin_global_set(int xc_handle,
+ u64 slice);
+
typedef struct {
unsigned long credit_bytes;
unsigned long credit_usec;
diff --git a/tools/xc/lib/xc_atropos.c b/tools/xc/lib/xc_atropos.c
new file mode 100644
index 0000000000..b9ee719b0f
--- /dev/null
+++ b/tools/xc/lib/xc_atropos.c
@@ -0,0 +1,38 @@
+/******************************************************************************
+ * xc_atropos.c
+ *
+ * API for manipulating parameters of the Atropos scheduler.
+ *
+ * by Mark Williamson, Copyright (c) 2004 Intel Research Cambridge.
+ */
+
+#include "xc_private.h"
+
+int xc_atropos_global_set(int xc_handle,
+ unsigned long ctx_allow)
+{
+ dom0_op_t op;
+ op.cmd = DOM0_SCHEDCTL;
+ op.u.schedctl.if_ver = SCHED_CTL_IF_VER;
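+    /* XXX sched-ctl.h has no Atropos control member yet, so this reuses BVT */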
+ op.u.schedctl.sched_id = SCHED_BVT;
+
+ op.u.schedctl.u.bvt.ctx_allow = ctx_allow;
+ return do_dom0_op(xc_handle, &op);
+}
+
+int xc_atropos_domain_set(int xc_handle,
+ u64 domid, int xtratime)
+{
+ dom0_op_t op;
+ op.cmd = DOM0_ADJUSTDOM;
+
+ op.u.adjustdom.domain = (domid_t)domid;
+ op.u.adjustdom.if_ver = SCHED_CTL_IF_VER;
+ op.u.adjustdom.sched_id = SCHED_ATROPOS;
+
+ op.u.adjustdom.u.atropos.xtratime = xtratime;
+
+ printf("Doing dom0 op!\n");
+
+ return do_dom0_op(xc_handle, &op);
+}
diff --git a/tools/xc/lib/xc_bvtsched.c b/tools/xc/lib/xc_bvtsched.c
index 57554cd2e6..e38f9cf09e 100644
--- a/tools/xc/lib/xc_bvtsched.c
+++ b/tools/xc/lib/xc_bvtsched.c
@@ -12,8 +12,10 @@ int xc_bvtsched_global_set(int xc_handle,
unsigned long ctx_allow)
{
dom0_op_t op;
- op.cmd = DOM0_BVTCTL;
- op.u.bvtctl.ctx_allow = ctx_allow;
+ op.cmd = DOM0_SCHEDCTL;
+ op.u.schedctl.sched_id = SCHED_BVT;
+ op.u.schedctl.if_ver = SCHED_CTL_IF_VER;
+ op.u.schedctl.u.bvt.ctx_allow = ctx_allow;
return do_dom0_op(xc_handle, &op);
}
@@ -25,11 +27,16 @@ int xc_bvtsched_domain_set(int xc_handle,
unsigned long warpu)
{
dom0_op_t op;
+ struct bvt_adjdom *adjptr = &op.u.adjustdom.u.bvt;
+
op.cmd = DOM0_ADJUSTDOM;
+ op.u.adjustdom.sched_id = SCHED_BVT;
+ op.u.adjustdom.if_ver = SCHED_CTL_IF_VER;
op.u.adjustdom.domain = (domid_t)domid;
- op.u.adjustdom.mcu_adv = mcuadv;
- op.u.adjustdom.warp = warp;
- op.u.adjustdom.warpl = warpl;
- op.u.adjustdom.warpu = warpu;
+
+ adjptr->mcu_adv = mcuadv;
+ adjptr->warp = warp;
+ adjptr->warpl = warpl;
+ adjptr->warpu = warpu;
return do_dom0_op(xc_handle, &op);
}
diff --git a/tools/xc/lib/xc_private.h b/tools/xc/lib/xc_private.h
index d5ce8947ea..859d3a7740 100644
--- a/tools/xc/lib/xc_private.h
+++ b/tools/xc/lib/xc_private.h
@@ -23,6 +23,7 @@
#include <dom0_ops.h>
#include <vbd.h>
#include <event_channel.h>
+#include <sched-ctl.h>
#define _PAGE_PRESENT 0x001
#define _PAGE_RW 0x002
diff --git a/tools/xc/lib/xc_rrobin.c b/tools/xc/lib/xc_rrobin.c
new file mode 100644
index 0000000000..4d986cee83
--- /dev/null
+++ b/tools/xc/lib/xc_rrobin.c
@@ -0,0 +1,20 @@
+/******************************************************************************
+ * xc_rrobin.c
+ *
+ * API for manipulating parameters of the Round Robin scheduler
+ *
+ * by Mark Williamson, Copyright (c) 2004 Intel Research Cambridge.
+ */
+
+#include "xc_private.h"
+
+int xc_rrobin_global_set(int xc_handle, u64 slice)
+{
+ dom0_op_t op;
+ op.cmd = DOM0_SCHEDCTL;
+ op.u.schedctl.if_ver = SCHED_CTL_IF_VER;
+ op.u.schedctl.sched_id = SCHED_RROBIN;
+
+ op.u.schedctl.u.rrobin.slice = slice;
+ return do_dom0_op(xc_handle, &op);
+}
diff --git a/tools/xc/py/Xc.c b/tools/xc/py/Xc.c
index 8ed16ee505..cef2a046ef 100644
--- a/tools/xc/py/Xc.c
+++ b/tools/xc/py/Xc.c
@@ -290,10 +290,10 @@ static PyObject *pyxc_bvtsched_domain_set(PyObject *self,
u64 dom;
unsigned long mcuadv, warp, warpl, warpu;
- static char *kwd_list[] = { "dom", "mcuadv", "warp", "warpl",
+ static char *kwd_list[] = { "dom", "mcuadv", "warp", "warpl",
"warpu", NULL };
- if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Lllll", kwd_list,
+ if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Lllll", kwd_list,
&dom, &mcuadv, &warp, &warpl, &warpu) )
return NULL;
@@ -862,6 +862,49 @@ static PyObject *pyxc_physinfo(PyObject *self,
"cpu_khz", info.cpu_khz);
}
+static PyObject *pyxc_atropos_domain_set(PyObject *self,
+ PyObject *args,
+ PyObject *kwds)
+{
+ XcObject *xc = (XcObject *)self;
+ PyObject *ret_obj;
+ int xtratime;
+ u64 domid;
+
+ static char *kwd_list[] = { "dom", "xtratime", NULL };
+
+ if( !PyArg_ParseTupleAndKeywords(args, kwds, "Li", kwd_list, &domid,
+ &xtratime) )
+ return NULL;
+
+ if ( xc_atropos_domain_set(xc->xc_handle, domid, xtratime) != 0 )
+ return PyErr_SetFromErrno(xc_error);
+
+ Py_INCREF(zero);
+ return zero;
+}
+
+static PyObject *pyxc_rrobin_global_set(PyObject *self,
+ PyObject *args,
+ PyObject *kwds)
+{
+ XcObject *xc = (XcObject *)self;
+ PyObject *ret_obj;
+ u64 slice;
+
+ static char *kwd_list[] = { "slice", NULL };
+
+ if( !PyArg_ParseTupleAndKeywords(args, kwds, "L", kwd_list, &slice) )
+ return NULL;
+
+ if ( xc_rrobin_global_set(xc->xc_handle, slice) != 0 )
+ return PyErr_SetFromErrno(xc_error);
+
+ Py_INCREF(zero);
+ return zero;
+}
+
+
static PyMethodDef pyxc_methods[] = {
{ "domain_create",
(PyCFunction)pyxc_domain_create,
@@ -955,15 +998,15 @@ static PyMethodDef pyxc_methods[] = {
" cmdline [str, n/a]: Kernel parameters, if any.\n\n"
"Returns: [int] 0 on success; -1 on error.\n" },
- { "bvtsched_global_set",
- (PyCFunction)pyxc_bvtsched_global_set,
+ { "bvtsched_global_set",
+ (PyCFunction)pyxc_bvtsched_global_set,
METH_VARARGS | METH_KEYWORDS, "\n"
"Set global tuning parameters for Borrowed Virtual Time scheduler.\n"
" ctx_allow [int]: Minimal guaranteed quantum (I think!).\n\n"
"Returns: [int] 0 on success; -1 on error.\n" },
- { "bvtsched_domain_set",
- (PyCFunction)pyxc_bvtsched_domain_set,
+ { "bvtsched_domain_set",
+ (PyCFunction)pyxc_bvtsched_domain_set,
METH_VARARGS | METH_KEYWORDS, "\n"
"Set per-domain tuning parameters for Borrowed Virtual Time scheduler.\n"
" dom [long]: Identifier of domain to be tuned.\n"
@@ -973,6 +1016,22 @@ static PyMethodDef pyxc_methods[] = {
" warpu [int]: Internal BVT parameter.\n\n"
"Returns: [int] 0 on success; -1 on error.\n" },
+ { "atropos_domain_set",
+ (PyCFunction)pyxc_atropos_domain_set,
+ METH_VARARGS | METH_KEYWORDS, "\n"
+ "Set the extra time flag for a domain when running with Atropos.\n"
+ " dom [long]: domain to set\n"
+ " xtratime [int]: boolean\n"
+ "Returns: [int] 0 on success; -1 on error.\n" },
+
+ { "rrobin_global_set",
+ (PyCFunction)pyxc_rrobin_global_set,
+      METH_VARARGS | METH_KEYWORDS, "\n"
+ "Set Round Robin scheduler slice.\n"
+ " slice [long]: Round Robin scheduler slice\n"
+ "Returns: [int] 0 on success, throws an exception on failure\n"
+ },
+
{ "vif_scheduler_set",
(PyCFunction)pyxc_vif_scheduler_set,
METH_VARARGS | METH_KEYWORDS, "\n"
diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c
index b39ead491c..1d69f35bf3 100644
--- a/xen/common/dom0_ops.c
+++ b/xen/common/dom0_ops.c
@@ -18,6 +18,7 @@
#include <asm/pdb.h>
#include <xeno/trace.h>
#include <xeno/console.h>
+#include <hypervisor-ifs/sched-ctl.h>
extern unsigned int alloc_new_dom_mem(struct task_struct *, unsigned int);
@@ -196,22 +197,15 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
}
break;
- case DOM0_BVTCTL:
+ case DOM0_SCHEDCTL:
{
- unsigned long ctx_allow = op->u.bvtctl.ctx_allow;
- ret = sched_bvtctl(ctx_allow);
+ ret = sched_ctl(&op->u.schedctl);
}
break;
case DOM0_ADJUSTDOM:
{
- domid_t dom = op->u.adjustdom.domain;
- unsigned long mcu_adv = op->u.adjustdom.mcu_adv;
- unsigned long warp = op->u.adjustdom.warp;
- unsigned long warpl = op->u.adjustdom.warpl;
- unsigned long warpu = op->u.adjustdom.warpu;
-
- ret = sched_adjdom(dom, mcu_adv, warp, warpl, warpu);
+ ret = sched_adjdom(&op->u.adjustdom);
}
break;
@@ -281,7 +275,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
if ( (p->state == TASK_STOPPED) || (p->state == TASK_DYING) )
op->u.getdomaininfo.state = DOMSTATE_STOPPED;
op->u.getdomaininfo.hyp_events = p->hyp_events;
- op->u.getdomaininfo.mcu_advance = p->mcu_advance;
+// op->u.getdomaininfo.mcu_advance = p->mcu_advance;
op->u.getdomaininfo.tot_pages = p->tot_pages;
op->u.getdomaininfo.cpu_time = p->cpu_time;
op->u.getdomaininfo.shared_info_frame =
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 8921ee246d..e61f02a26a 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -43,7 +43,6 @@ struct task_struct *do_createdomain(domid_t dom_id, unsigned int cpu)
if ( (p = alloc_task_struct()) == NULL )
return NULL;
- memset(p, 0, sizeof(*p));
atomic_set(&p->refcnt, 1);
@@ -496,7 +495,7 @@ void release_task(struct task_struct *p)
UNSHARE_PFN(virt_to_page(p->shared_info));
free_all_dom_mem(p);
- kmem_cache_free(task_struct_cachep, p);
+ free_task_struct(p);
}
diff --git a/xen/common/kernel.c b/xen/common/kernel.c
index f99f3fac32..5f2f27728f 100644
--- a/xen/common/kernel.c
+++ b/xen/common/kernel.c
@@ -71,6 +71,8 @@ int opt_watchdog=0;
unsigned char opt_pdb[10] = "none";
/* opt_tbuf_size: trace buffer size (in pages) */
unsigned int opt_tbuf_size = 1;
+/* opt_sched: scheduler - default to Borrowed Virtual Time */
+char opt_sched[10] = "bvt";
static struct {
unsigned char *name;
@@ -91,6 +93,7 @@ static struct {
{ "watchdog", OPT_BOOL, &opt_watchdog },
{ "pdb", OPT_STR, &opt_pdb },
{ "tbuf_size", OPT_UINT, &opt_tbuf_size },
+ { "sched", OPT_STR, &opt_sched },
{ NULL, 0, NULL }
};
diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c
index 32786fdfd1..75f1e38c56 100644
--- a/xen/common/keyhandler.c
+++ b/xen/common/keyhandler.c
@@ -86,7 +86,15 @@ static char *task_states[] =
NULL,
NULL,
NULL,
- "Dying ",
+ "Dying ",
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ "Sched priv"
};
void do_task_queues(u_char key, void *dev_id, struct pt_regs *regs)
diff --git a/xen/common/sched_bvt.c b/xen/common/sched_bvt.c
new file mode 100644
index 0000000000..f473e3f760
--- /dev/null
+++ b/xen/common/sched_bvt.c
@@ -0,0 +1,427 @@
+/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
+ ****************************************************************************
+ * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
+ * (C) 2002-2003 University of Cambridge
+ * (C) 2004 - Mark Williamson - Intel Research Cambridge
+ ****************************************************************************
+ *
+ * File: common/schedule.c
+ * Author: Rolf Neugebauer & Keir Fraser
+ * Updated for generic API by Mark Williamson
+ *
+ * Description: CPU scheduling
+ * implements A Borrowed Virtual Time scheduler.
+ * (see Duda & Cheriton SOSP'99)
+ */
+
+#include <xeno/config.h>
+#include <xeno/init.h>
+#include <xeno/lib.h>
+#include <xeno/sched.h>
+#include <xeno/delay.h>
+#include <xeno/event.h>
+#include <xeno/time.h>
+#include <xeno/ac_timer.h>
+#include <xeno/interrupt.h>
+#include <xeno/timer.h>
+#include <xeno/perfc.h>
+#include <xeno/sched-if.h>
+#include <xeno/slab.h>
+
+/* all per-domain BVT-specific scheduling info is stored here */
+struct bvt_dom_info
+{
+ unsigned long mcu_advance; /* inverse of weight */
+ u32 avt; /* actual virtual time */
+ u32 evt; /* effective virtual time */
+ int warpback; /* warp? */
+ long warp; /* virtual time warp */
+ long warpl; /* warp limit */
+ long warpu; /* unwarp time requirement */
+ s_time_t warped; /* time it ran warped last time */
+ s_time_t uwarped; /* time it ran unwarped last time */
+};
+
+struct bvt_cpu_info
+{
+ unsigned long svt; /* XXX check this is unsigned long! */
+};
+
+
+#define DOM_INF(p) ((struct bvt_dom_info *)(p)->sched_priv)
+#define CPU_INF(cpu) ((struct bvt_cpu_info *)(schedule_data[cpu]).sched_priv)
+#define CPU_SVT(cpu) (CPU_INF(cpu)->svt)
+
+#define MCU (s32)MICROSECS(100) /* Minimum unit */
+#define MCU_ADVANCE 10 /* default weight */
+#define TIME_SLOP (s32)MICROSECS(50) /* allow time to slip a bit */
+static s32 ctx_allow = (s32)MILLISECS(5); /* context switch allowance */
+
+/* SLAB cache for struct bvt_dom_info objects */
+static kmem_cache_t *dom_info_cache;
+
+/*
+ * Calculate the effective virtual time for a domain. Take into account
+ * warping limits
+ */
+static void __calc_evt(struct bvt_dom_info *inf)
+{
+ s_time_t now = NOW();
+
+ if ( inf->warpback )
+ {
+ if ( ((now - inf->warped) < inf->warpl) &&
+ ((now - inf->uwarped) > inf->warpu) )
+ {
+ /* allowed to warp */
+ inf->evt = inf->avt - inf->warp;
+ }
+ else
+ {
+ /* warped for too long -> unwarp */
+ inf->evt = inf->avt;
+ inf->uwarped = now;
+ inf->warpback = 0;
+ }
+ }
+ else
+ {
+ inf->evt = inf->avt;
+ }
+}
+
+/**
+ * bvt_alloc_task - allocate BVT private structures for a task
+ * @p: task to allocate private structures for
+ *
+ * Returns non-zero on failure.
+ */
+int bvt_alloc_task(struct task_struct *p)
+{
+ DOM_INF(p)
+ = (struct bvt_dom_info *)kmem_cache_alloc(dom_info_cache,GFP_KERNEL);
+
+ if ( DOM_INF(p) == NULL )
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Add and remove a domain
+ */
+void bvt_add_task(struct task_struct *p)
+{
+ struct bvt_dom_info *inf = DOM_INF(p);
+
+ ASSERT(inf != NULL);
+ ASSERT(p != NULL);
+
+ inf->mcu_advance = MCU_ADVANCE;
+
+ if ( p->domain == IDLE_DOMAIN_ID )
+ {
+ inf->avt = inf->evt = ~0U;
+ }
+ else
+ {
+ /* Set avt and evt to system virtual time. */
+ inf->avt = CPU_SVT(p->processor);
+ inf->evt = CPU_SVT(p->processor);
+ /* Set some default values here. */
+ inf->warpback = 0;
+ inf->warp = 0;
+ inf->warpl = 0;
+ inf->warpu = 0;
+ }
+
+ return;
+}
+
+/**
+ * bvt_free_task - free BVT private structures for a task
+ * @p: task
+ */
+void bvt_free_task(struct task_struct *p)
+{
+ ASSERT( p->sched_priv != NULL );
+ kmem_cache_free( dom_info_cache, p->sched_priv );
+}
+
+
+void bvt_wake_up(struct task_struct *p)
+{
+ struct bvt_dom_info *inf = DOM_INF(p);
+
+ ASSERT(inf != NULL);
+
+
+ /* set the BVT parameters */
+ if (inf->avt < CPU_SVT(p->processor))
+ inf->avt = CPU_SVT(p->processor);
+
+ /* deal with warping here */
+ inf->warpback = 1;
+ inf->warped = NOW();
+ __calc_evt(inf);
+ __add_to_runqueue_head(p);
+}
+
+/*
+ * Block the currently-executing domain until a pertinent event occurs.
+ */
+static long bvt_do_block(struct task_struct *p)
+{
+ DOM_INF(p)->warpback = 0;
+ return 0;
+}
+
+/* Control the scheduler. */
+int bvt_ctl(struct sched_ctl_cmd *cmd)
+{
+ struct bvt_ctl *params = &cmd->u.bvt;
+
+ ctx_allow = params->ctx_allow;
+
+ return 0;
+}
+
+/* Adjust scheduling parameter for a given domain. */
+int bvt_adjdom(struct task_struct *p,
+ struct sched_adjdom_cmd *cmd)
+{
+ struct bvt_adjdom *params = &cmd->u.bvt;
+ unsigned long mcu_adv = params->mcu_adv,
+ warp = params->warp,
+ warpl = params->warpl,
+ warpu = params->warpu;
+
+ struct bvt_dom_info *inf = DOM_INF(p);
+
+ /* Sanity -- this can avoid divide-by-zero. */
+ if ( mcu_adv == 0 )
+ return -EINVAL;
+
+ spin_lock_irq(&schedule_lock[p->processor]);
+ inf->mcu_advance = mcu_adv;
+ inf->warp = warp;
+ inf->warpl = warpl;
+ inf->warpu = warpu;
+ spin_unlock_irq(&schedule_lock[p->processor]);
+
+ return 0;
+}
+
+
+/*
+ * The main function
+ * - deschedule the current domain.
+ * - pick a new domain.
+ * i.e., the domain with lowest EVT.
+ * The runqueue should be ordered by EVT so that is easy.
+ */
+static task_slice_t bvt_do_schedule(s_time_t now)
+{
+ struct task_struct *prev = current, *next = NULL, *next_prime, *p;
+ struct list_head *tmp;
+ int cpu = prev->processor;
+ s32 r_time; /* time for new dom to run */
+ s32 ranfor; /* assume we never run longer than 2.1s! */
+ s32 mcus;
+ u32 next_evt, next_prime_evt, min_avt;
+ struct bvt_dom_info *prev_inf = DOM_INF(prev),
+ *p_inf = NULL,
+ *next_inf = NULL,
+ *next_prime_inf = NULL;
+ task_slice_t ret;
+
+ ASSERT(prev->sched_priv != NULL);
+ ASSERT(prev_inf != NULL);
+
+ if ( likely(!is_idle_task(prev)) )
+ {
+ ranfor = (s32)(now - prev->lastschd);
+ /* Calculate mcu and update avt. */
+ mcus = (ranfor + MCU - 1) / MCU;
+ prev_inf->avt += mcus * prev_inf->mcu_advance;
+
+ __calc_evt(prev_inf);
+
+ __del_from_runqueue(prev);
+
+ if ( likely(prev->state == TASK_RUNNING) )
+ __add_to_runqueue_tail(prev);
+ }
+
+ /* We should at least have the idle task */
+ ASSERT(!list_empty(&schedule_data[cpu].runqueue));
+
+ /*
+ * scan through the run queue and pick the task with the lowest evt
+ * *and* the task the second lowest evt.
+ * this code is O(n) but we expect n to be small.
+ */
+ next = schedule_data[cpu].idle;
+ next_prime = NULL;
+
+ next_evt = ~0U;
+ next_prime_evt = ~0U;
+ min_avt = ~0U;
+
+ list_for_each ( tmp, &schedule_data[cpu].runqueue )
+ {
+ p = list_entry(tmp, struct task_struct, run_list);
+ p_inf = DOM_INF(p);
+
+ if ( p_inf->evt < next_evt )
+ {
+ next_prime = next;
+ next_prime_evt = next_evt;
+ next = p;
+ next_evt = p_inf->evt;
+ }
+ else if ( next_prime_evt == ~0U )
+ {
+ next_prime_evt = p_inf->evt;
+ next_prime = p;
+ }
+ else if ( p_inf->evt < next_prime_evt )
+ {
+ next_prime_evt = p_inf->evt;
+ next_prime = p;
+ }
+
+ /* Determine system virtual time. */
+ if ( p_inf->avt < min_avt )
+ min_avt = p_inf->avt;
+ }
+
+ /* Update system virtual time. */
+ if ( min_avt != ~0U )
+ CPU_SVT(cpu) = min_avt;
+
+ /* check for virtual time overrun on this cpu */
+ if ( CPU_SVT(cpu) >= 0xf0000000 )
+ {
+ u_long t_flags;
+ write_lock_irqsave(&tasklist_lock, t_flags);
+ for_each_domain ( p )
+ {
+        if ( p->processor == cpu )
+        {
+            p_inf = DOM_INF(p);
+            p_inf->evt -= 0xe0000000;
+            p_inf->avt -= 0xe0000000;
+ }
+ }
+ write_unlock_irqrestore(&tasklist_lock, t_flags);
+ CPU_SVT(cpu) -= 0xe0000000;
+ }
+
+ /* work out time for next run through scheduler */
+ if ( is_idle_task(next) )
+ {
+ r_time = ctx_allow;
+ goto sched_done;
+ }
+
+ if ( (next_prime == NULL) || is_idle_task(next_prime) )
+ {
+ /* We have only one runnable task besides the idle task. */
+ r_time = 10 * ctx_allow; /* RN: random constant */
+ goto sched_done;
+ }
+
+ next_prime_inf = DOM_INF(next_prime);
+ next_inf = DOM_INF(next);
+
+ /*
+ * If we are here then we have two runnable tasks.
+ * Work out how long 'next' can run till its evt is greater than
+ * 'next_prime's evt. Take context switch allowance into account.
+ */
+ ASSERT(next_prime_inf->evt >= next_inf->evt);
+
+ r_time = ((next_prime_inf->evt - next_inf->evt)/next_inf->mcu_advance)
+ + ctx_allow;
+
+ ASSERT(r_time >= ctx_allow);
+
+ sched_done:
+ next->min_slice = ctx_allow;
+ ret.task = next;
+ ret.time = r_time;
+
+ return ret;
+}
+
+
+static void bvt_dump_runq_el(struct task_struct *p)
+{
+ struct bvt_dom_info *inf = DOM_INF(p);
+
+ printk("mcua=0x%04lX ev=0x%08X av=0x%08X ",
+ inf->mcu_advance, inf->evt, inf->avt);
+}
+
+static void bvt_dump_settings(void)
+{
+ printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns ", (u32)MCU, (s32)ctx_allow );
+}
+
+static void bvt_dump_cpu_state(int i)
+{
+ printk("svt=0x%08lX ", CPU_SVT(i));
+}
+
+
+/* Initialise the data structures. */
+int bvt_init_scheduler()
+{
+ int i;
+
+ for ( i = 0; i < NR_CPUS; i++ )
+ {
+ CPU_INF(i) = kmalloc(sizeof(struct bvt_cpu_info), GFP_KERNEL);
+
+ if ( CPU_INF(i) == NULL )
+ {
+ printk("Failed to allocate BVT scheduler private per-CPU memory!\n");
+ return -1;
+ }
+
+ CPU_SVT(i) = 0; /* XXX do I really need to do this? */
+ }
+
+ dom_info_cache = kmem_cache_create("BVT dom info",
+ sizeof(struct bvt_dom_info),
+ 0, 0, NULL, NULL);
+
+ if ( dom_info_cache == NULL )
+ {
+ printk("BVT: Failed to allocate domain info SLAB cache");
+ return -1;
+ }
+
+ return 0;
+}
+
+
+struct scheduler sched_bvt_def = {
+ .name = "Borrowed Virtual Time",
+ .opt_name = "bvt",
+ .sched_id = SCHED_BVT,
+
+ .init_scheduler = bvt_init_scheduler,
+ .alloc_task = bvt_alloc_task,
+ .add_task = bvt_add_task,
+ .free_task = bvt_free_task,
+ .wake_up = bvt_wake_up,
+ .do_block = bvt_do_block,
+ .do_schedule = bvt_do_schedule,
+ .control = bvt_ctl,
+ .adjdom = bvt_adjdom,
+ .dump_settings = bvt_dump_settings,
+ .dump_cpu_state = bvt_dump_cpu_state,
+ .dump_runq_el = bvt_dump_runq_el,
+};
+
diff --git a/xen/common/sched_rrobin.c b/xen/common/sched_rrobin.c
new file mode 100644
index 0000000000..ef4db60066
--- /dev/null
+++ b/xen/common/sched_rrobin.c
@@ -0,0 +1,56 @@
+/****************************************************************************
+ * Very stupid Round Robin Scheduler for Xen
+ *
+ * by Mark Williamson (C) 2004 Intel Research Cambridge
+ */
+
+#include <xeno/sched.h>
+#include <xeno/sched-if.h>
+#include <hypervisor-ifs/sched-ctl.h>
+#include <xeno/ac_timer.h>
+#include <xeno/time.h>
+
+static s_time_t rr_slice = MILLISECS(10);
+
+static task_slice_t rr_do_schedule(s_time_t now)
+{
+ struct task_struct *prev = current;
+ int cpu = current->processor;
+ task_slice_t ret;
+
+ __del_from_runqueue(prev);
+
+ if ( prev->state == TASK_RUNNING )
+ __add_to_runqueue_tail(prev);
+
+ ret.task = list_entry(schedule_data[cpu].runqueue.next,
+ struct task_struct, run_list);
+
+ ret.time = rr_slice;
+
+ return ret;
+}
+
+static int rr_ctl(struct sched_ctl_cmd *cmd)
+{
+ rr_slice = cmd->u.rrobin.slice;
+ return 0;
+}
+
+static void rr_dump_settings()
+{
+ printk("rr_slice = %llu ", rr_slice);
+}
+
+struct scheduler sched_rrobin_def = {
+ .name = "Stupid Round Robin Scheduler",
+ .opt_name = "rrobin",
+ .sched_id = SCHED_RROBIN,
+
+ .wake_up = __add_to_runqueue_head,
+ .do_schedule = rr_do_schedule,
+ .control = rr_ctl,
+ .dump_settings = rr_dump_settings,
+};
+
+
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 5487f15e5a..054123077c 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -2,14 +2,16 @@
****************************************************************************
* (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
* (C) 2002-2003 University of Cambridge
+ * (C) 2004 - Mark Williamson - Intel Research Cambridge
****************************************************************************
*
* File: common/schedule.c
* Author: Rolf Neugebauer & Keir Fraser
+ * Updated for generic API by Mark Williamson
*
- * Description: CPU scheduling
- * implements A Borrowed Virtual Time scheduler.
- * (see Duda & Cheriton SOSP'99)
+ * Description: Generic CPU scheduling code
+ * implements support functionality for the Xen scheduler API.
+ *
*/
#include <xeno/config.h>
@@ -23,6 +25,9 @@
#include <xeno/interrupt.h>
#include <xeno/timer.h>
#include <xeno/perfc.h>
+#include <xeno/sched-if.h>
+#include <hypervisor-ifs/sched-ctl.h>
+#include <xeno/trace.h>
/*#define WAKEUP_HISTO*/
/*#define BLOCKTIME_HISTO*/
@@ -33,23 +38,54 @@
#define BUCKETS 200
#endif
-#define MCU (s32)MICROSECS(100) /* Minimum unit */
-#define MCU_ADVANCE 10 /* default weight */
#define TIME_SLOP (s32)MICROSECS(50) /* allow time to slip a bit */
-static s32 ctx_allow = (s32)MILLISECS(5); /* context switch allowance */
-typedef struct schedule_data_st
-{
- struct list_head runqueue; /* runqueue */
- struct task_struct *curr; /* current task */
- struct task_struct *idle; /* idle task for this cpu */
- u32 svt; /* system virtual time. per CPU??? */
- struct ac_timer s_timer; /* scheduling timer */
-#ifdef BUCKETS
- u32 hist[BUCKETS]; /* for scheduler latency histogram */
-#endif
-} __cacheline_aligned schedule_data_t;
-static schedule_data_t schedule_data[NR_CPUS];
+/* XXX MAW pull trace-related #defines out of here and into an auto-generated
+ * header file later on! */
+#define TRC_SCHED_DOM_ADD 0x00010000
+#define TRC_SCHED_DOM_REM 0x00010001
+#define TRC_SCHED_WAKE 0x00010002
+#define TRC_SCHED_BLOCK 0x00010003
+#define TRC_SCHED_YIELD 0x00010004
+#define TRC_SCHED_SET_TIMER 0x00010005
+#define TRC_SCHED_CTL 0x00010006
+#define TRC_SCHED_ADJDOM 0x00010007
+#define TRC_SCHED_RESCHED 0x00010008
+#define TRC_SCHED_SWITCH 0x00010009
+#define TRC_SCHED_S_TIMER_FN 0x0001000A
+#define TRC_SCHED_T_TIMER_FN 0x0001000B
+#define TRC_SCHED_DOM_TIMER_FN 0x0001000C
+#define TRC_SCHED_FALLBACK_TIMER_FN 0x0001000D
+
+#define _HIGH32(_x) (_x >> 32)
+#define _LOW32(_x) ((u32)_x )
+
+/* Various timer handlers. */
+static void s_timer_fn(unsigned long unused);
+static void t_timer_fn(unsigned long unused);
+static void dom_timer_fn(unsigned long data);
+static void fallback_timer_fn(unsigned long unused);
+
+/* this is global for now so that private implementations can reach it */
+schedule_data_t schedule_data[NR_CPUS];
+
+/* XXX would be nice if the schedulers array could get populated
+ * automagically without having to hack the code in here */
+extern struct scheduler sched_bvt_def, sched_rrobin_def;
+static struct scheduler *schedulers[] = { &sched_bvt_def,
+ &sched_rrobin_def,
+ NULL};
+
+/* scheduler ops for the current scheduler */
+static struct scheduler ops;
+
+/* for scheduler functions that return void */
+#define SCHED_FN_VOID(fn, ...) do { if ( ops.fn ) ops.fn(__VA_ARGS__); } \
+ while (0)
+
+/* for scheduler functions that return a numeric value */
+#define SCHED_FN_RET(fn, ...) \
+ (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ ) : 0 )
spinlock_t schedule_lock[NR_CPUS] __cacheline_aligned;
@@ -62,110 +98,78 @@ static struct ac_timer t_timer[NR_CPUS];
*/
static struct ac_timer fallback_timer[NR_CPUS];
-/* Various timer handlers. */
-static void s_timer_fn(unsigned long unused);
-static void t_timer_fn(unsigned long unused);
-static void dom_timer_fn(unsigned long data);
-static void fallback_timer_fn(unsigned long unused);
-
-/*
- * Wrappers for run-queue management. Must be called with the schedule_lock
- * held.
- */
-static inline void __add_to_runqueue_head(struct task_struct * p)
-{
- list_add(&p->run_list, &schedule_data[p->processor].runqueue);
-}
+extern kmem_cache_t *task_struct_cachep;
-static inline void __add_to_runqueue_tail(struct task_struct * p)
+void free_task_struct(struct task_struct *p)
{
- list_add_tail(&p->run_list, &schedule_data[p->processor].runqueue);
+ SCHED_FN_VOID(free_task, p);
+ kmem_cache_free(task_struct_cachep, p);
}
-static inline void __del_from_runqueue(struct task_struct * p)
+/**
+ * alloc_task_struct - allocate a new task_struct and sched private structures
+ */
+struct task_struct *alloc_task_struct(void)
{
- list_del(&p->run_list);
- p->run_list.next = NULL;
-}
+ struct task_struct *p;
-static inline int __task_on_runqueue(struct task_struct *p)
-{
- return p->run_list.next != NULL;
-}
+ p=((struct task_struct *)kmem_cache_alloc(task_struct_cachep,GFP_KERNEL));
-#define next_domain(p) \
- list_entry((p)->run_list.next, struct task_struct, run_list)
+ if ( p == NULL )
+ return NULL;
-/*
- * Calculate the effective virtual time for a domain. Take into account
- * warping limits
- */
-static void __calc_evt(struct task_struct *p)
-{
- s_time_t now = NOW();
- if ( p->warpback )
- {
- if ( ((now - p->warped) < p->warpl) &&
- ((now - p->uwarped) > p->warpu) )
- {
- /* allowed to warp */
- p->evt = p->avt - p->warp;
- }
- else
- {
- /* warped for too long -> unwarp */
- p->evt = p->avt;
- p->uwarped = now;
- p->warpback = 0;
- }
- }
- else
+ memset(p, 0, sizeof(*p));
+
+ if ( SCHED_FN_RET(alloc_task, p) < 0)
{
- p->evt = p->avt;
+ kmem_cache_free(task_struct_cachep, p);
+ return NULL;
}
+
+ return p;
}
-
/*
* Add and remove a domain
*/
void sched_add_domain(struct task_struct *p)
{
p->state = TASK_STOPPED;
- p->mcu_advance = MCU_ADVANCE;
- if ( p->domain == IDLE_DOMAIN_ID )
- {
- p->avt = p->evt = ~0U;
- schedule_data[p->processor].idle = p;
- }
- else
+ if( p->domain != IDLE_DOMAIN_ID )
{
- /* Set avt end evt to system virtual time. */
- p->avt = schedule_data[p->processor].svt;
- p->evt = schedule_data[p->processor].svt;
- /* Set some default values here. */
- p->warpback = 0;
- p->warp = 0;
- p->warpl = 0;
- p->warpu = 0;
-
/* Initialise the per-domain timer. */
init_ac_timer(&p->timer);
p->timer.cpu = p->processor;
p->timer.data = (unsigned long)p;
p->timer.function = &dom_timer_fn;
-
}
+ else
+ {
+ schedule_data[p->processor].idle = p;
+ }
+
+ SCHED_FN_VOID(add_task, p);
+
+ TRACE_3D(TRC_SCHED_DOM_ADD, _HIGH32(p->domain), _LOW32(p->domain), p);
}
+/* XXX race condition here? we could both add and remove a domain at once, in
+ * theory. ick! */
+/* XXX is the task already removed from the runlist at this point? */
int sched_rem_domain(struct task_struct *p)
{
int x, y = p->state;
do {
if ( (x = y) == TASK_DYING ) return 0;
} while ( (y = cmpxchg(&p->state, x, TASK_DYING)) != x );
+
rem_ac_timer(&p->timer);
+
+ SCHED_FN_VOID(rem_task, p);
+
+ TRACE_3D(TRC_SCHED_DOM_REM, _HIGH32(p->domain), _LOW32(p->domain), p);
+
return 1;
}
@@ -174,6 +178,11 @@ void init_idle_task(void)
{
unsigned long flags;
struct task_struct *p = current;
+
+ if ( SCHED_FN_RET (alloc_task, p) < 0)
+ panic("Failed to allocate scheduler private data for idle task");
+ SCHED_FN_VOID(add_task, p);
+
spin_lock_irqsave(&schedule_lock[p->processor], flags);
p->has_cpu = 1;
p->state = TASK_RUNNING;
@@ -182,31 +191,25 @@ void init_idle_task(void)
spin_unlock_irqrestore(&schedule_lock[p->processor], flags);
}
-
void __wake_up(struct task_struct *p)
{
+ TRACE_3D(TRC_SCHED_WAKE, _HIGH32(p->domain), _LOW32(p->domain), p);
+
ASSERT(p->state != TASK_DYING);
if ( unlikely(__task_on_runqueue(p)) )
return;
p->state = TASK_RUNNING;
- __add_to_runqueue_head(p);
-
- /* set the BVT parameters */
- if (p->avt < schedule_data[p->processor].svt)
- p->avt = schedule_data[p->processor].svt;
- /* deal with warping here */
- p->warpback = 1;
- p->warped = NOW();
- __calc_evt(p);
+ SCHED_FN_VOID(wake_up, p);
#ifdef WAKEUP_HISTO
p->wokenup = NOW();
#endif
}
+
void wake_up(struct task_struct *p)
{
unsigned long flags;
@@ -220,9 +223,10 @@ void wake_up(struct task_struct *p)
*/
static long do_block(void)
{
+ ASSERT(current->domain != IDLE_DOMAIN_ID);
set_bit(EVENTS_MASTER_ENABLE_BIT, &current->shared_info->events_mask);
current->state = TASK_INTERRUPTIBLE;
- current->warpback = 0;
+ TRACE_2D(TRC_SCHED_BLOCK, current->domain, current);
__enter_scheduler();
return 0;
}
@@ -232,6 +236,7 @@ static long do_block(void)
*/
static long do_yield(void)
{
+ TRACE_2D(TRC_SCHED_YIELD, current->domain, current);
__enter_scheduler();
return 0;
}
@@ -295,37 +300,56 @@ long do_set_timer_op(unsigned long timeout_hi, unsigned long timeout_lo)
add_ac_timer(&p->timer);
}
+ TRACE_5D(TRC_SCHED_SET_TIMER, _HIGH32(p->domain), _LOW32(p->domain),
+ p, timeout_hi, timeout_lo);
+
return 0;
}
-/* Control the scheduler. */
-long sched_bvtctl(unsigned long c_allow)
+/**
+ * sched_ctl - dispatch a scheduler control operation
+ * @cmd: the command passed in the dom0 op
+ *
+ * Given a generic scheduler control operation, call the control function for
+ * the scheduler in use, passing the appropriate control information from the
+ * union supplied.
+ */
+long sched_ctl(struct sched_ctl_cmd *cmd)
{
- ctx_allow = c_allow;
- return 0;
+ TRACE_0D(TRC_SCHED_CTL);
+
+ if ( cmd->if_ver != SCHED_CTL_IF_VER )
+ return -EACCES;
+
+ if ( cmd->sched_id != ops.sched_id )
+ return -EINVAL;
+
+ return SCHED_FN_RET(control, cmd);
}
+
/* Adjust scheduling parameter for a given domain. */
-long sched_adjdom(domid_t dom, unsigned long mcu_adv, unsigned long warp,
- unsigned long warpl, unsigned long warpu)
+long sched_adjdom(struct sched_adjdom_cmd *cmd)
{
- struct task_struct *p;
+ struct task_struct *p;
+
+ if ( cmd->if_ver != SCHED_CTL_IF_VER )
+ return -EACCES;
- /* Sanity -- this can avoid divide-by-zero. */
- if ( mcu_adv == 0 )
+ if ( cmd->sched_id != ops.sched_id )
return -EINVAL;
- p = find_domain_by_id(dom);
- if ( p == NULL )
+ p = find_domain_by_id(cmd->domain);
+
+ if( p == NULL )
return -ESRCH;
- spin_lock_irq(&schedule_lock[p->processor]);
- p->mcu_advance = mcu_adv;
- spin_unlock_irq(&schedule_lock[p->processor]);
+ TRACE_2D(TRC_SCHED_ADJDOM, _HIGH32(p->domain), _LOW32(p->domain));
- put_task_struct(p);
+ SCHED_FN_VOID(adjdom, p, cmd);
+ put_task_struct(p);
return 0;
}
@@ -339,17 +363,19 @@ long sched_adjdom(domid_t dom, unsigned long mcu_adv, unsigned long warp,
*/
unsigned long __reschedule(struct task_struct *p)
{
- int cpu = p->processor;
+ int cpu = p->processor;
struct task_struct *curr;
s_time_t now, min_time;
+ TRACE_3D(TRC_SCHED_RESCHED, _HIGH32(p->domain), _LOW32(p->domain), p);
+
if ( unlikely(p->has_cpu || !__task_on_runqueue(p)) )
return 0;
now = NOW();
curr = schedule_data[cpu].curr;
/* domain should run at least for ctx_allow */
- min_time = curr->lastschd + ctx_allow;
+ min_time = curr->lastschd + curr->min_slice;
if ( is_idle_task(curr) || (min_time <= now) )
{
@@ -362,161 +388,67 @@ unsigned long __reschedule(struct task_struct *p)
if ( schedule_data[cpu].s_timer.expires > min_time + TIME_SLOP )
mod_ac_timer(&schedule_data[cpu].s_timer, min_time);
- return 0;
+ return SCHED_FN_RET(reschedule, p);
}
-
void reschedule(struct task_struct *p)
{
unsigned long flags, cpu_mask;
+
spin_lock_irqsave(&schedule_lock[p->processor], flags);
cpu_mask = __reschedule(p);
+
spin_unlock_irqrestore(&schedule_lock[p->processor], flags);
hyp_event_notify(cpu_mask);
}
-
/*
* The main function
- * - deschedule the current domain.
- * - pick a new domain.
- * i.e., the domain with lowest EVT.
- * The runqueue should be ordered by EVT so that is easy.
+ * - deschedule the current domain (scheduler independent).
+ * - pick a new domain (scheduler dependent).
*/
asmlinkage void __enter_scheduler(void)
{
- struct task_struct *prev = current, *next = NULL, *next_prime, *p;
- struct list_head *tmp;
+ struct task_struct *prev = current, *next = NULL;
int cpu = prev->processor;
s_time_t now;
+ task_slice_t next_slice;
s32 r_time; /* time for new dom to run */
- s32 ranfor; /* assume we never run longer than 2.1s! */
- s32 mcus;
- u32 next_evt, next_prime_evt, min_avt;
perfc_incrc(sched_run);
+ clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events);
+
spin_lock_irq(&schedule_lock[cpu]);
now = NOW();
rem_ac_timer(&schedule_data[cpu].s_timer);
-
+
ASSERT(!in_interrupt());
ASSERT(__task_on_runqueue(prev));
ASSERT(prev->state != TASK_UNINTERRUPTIBLE);
+ ASSERT(prev != NULL);
- if ( likely(!is_idle_task(prev)) )
+ if ( prev->state == TASK_INTERRUPTIBLE )
{
- ranfor = (s32)(now - prev->lastschd);
- prev->cpu_time += ranfor;
-
- /* Calculate mcu and update avt. */
- mcus = (ranfor + MCU - 1) / MCU;
- prev->avt += mcus * prev->mcu_advance;
-
- __calc_evt(prev);
-
- __del_from_runqueue(prev);
-
- if ( likely(prev->state == TASK_RUNNING) ||
- unlikely((prev->state == TASK_INTERRUPTIBLE) &&
- signal_pending(prev)) )
- {
+ /* this check is needed to avoid a race condition */
+ if ( signal_pending(prev) )
prev->state = TASK_RUNNING;
- __add_to_runqueue_tail(prev);
- }
+ else
+ SCHED_FN_VOID(do_block, prev);
}
- clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events);
-
- /* We should at least have the idle task */
- ASSERT(!list_empty(&schedule_data[cpu].runqueue));
-
- /*
- * scan through the run queue and pick the task with the lowest evt
- * *and* the task the second lowest evt.
- * this code is O(n) but we expect n to be small.
- */
- next = schedule_data[cpu].idle;
- next_prime = NULL;
-
- next_evt = ~0U;
- next_prime_evt = ~0U;
- min_avt = ~0U;
-
- list_for_each ( tmp, &schedule_data[cpu].runqueue )
- {
- p = list_entry(tmp, struct task_struct, run_list);
- if ( p->evt < next_evt )
- {
- next_prime = next;
- next_prime_evt = next_evt;
- next = p;
- next_evt = p->evt;
- }
- else if ( next_prime_evt == ~0U )
- {
- next_prime_evt = p->evt;
- next_prime = p;
- }
- else if ( p->evt < next_prime_evt )
- {
- next_prime_evt = p->evt;
- next_prime = p;
- }
-
- /* Determine system virtual time. */
- if ( p->avt < min_avt )
- min_avt = p->avt;
- }
+ /* get policy-specific decision on scheduling... */
+ next_slice = ops.do_schedule(now);
- /* Update system virtual time. */
- if ( min_avt != ~0U )
- schedule_data[cpu].svt = min_avt;
+ r_time = next_slice.time;
+ next = next_slice.task;
- /* check for virtual time overrun on this cpu */
- if ( schedule_data[cpu].svt >= 0xf0000000 )
- {
- u_long t_flags;
- write_lock_irqsave(&tasklist_lock, t_flags);
- for_each_domain ( p )
- {
- if ( p->processor == cpu )
- {
- p->evt -= 0xe0000000;
- p->avt -= 0xe0000000;
- }
- }
- write_unlock_irqrestore(&tasklist_lock, t_flags);
- schedule_data[cpu].svt -= 0xe0000000;
- }
-
- /* work out time for next run through scheduler */
- if ( is_idle_task(next) )
- {
- r_time = ctx_allow;
- goto sched_done;
- }
-
- if ( (next_prime == NULL) || is_idle_task(next_prime) )
- {
- /* We have only one runnable task besides the idle task. */
- r_time = 10 * ctx_allow; /* RN: random constant */
- goto sched_done;
- }
-
- /*
- * If we are here then we have two runnable tasks.
- * Work out how long 'next' can run till its evt is greater than
- * 'next_prime's evt. Take context switch allowance into account.
- */
- ASSERT(next_prime->evt >= next->evt);
-
- r_time = ((next_prime->evt - next->evt)/next->mcu_advance) + ctx_allow;
+ if ( likely(!is_idle_task(prev)) )
+ prev->cpu_time += (now - prev->lastschd);
- sched_done:
- ASSERT(r_time >= ctx_allow);
+ /* now, switch to the new task... */
prev->has_cpu = 0;
next->has_cpu = 1;
@@ -537,7 +469,7 @@ asmlinkage void __enter_scheduler(void)
if ( unlikely(prev == next) )
return;
-
+
perfc_incrc(sched_ctx);
#if defined(WAKEUP_HISTO)
@@ -558,6 +490,10 @@ asmlinkage void __enter_scheduler(void)
}
#endif
+ TRACE_2D(TRC_SCHED_SWITCH, next->domain, next);
+
+ ASSERT(next->processor == current->processor);
+
switch_to(prev, next);
if ( unlikely(prev->state == TASK_DYING) )
@@ -591,6 +527,8 @@ int idle_cpu(int cpu)
/* The scheduler timer: force a run through the scheduler*/
static void s_timer_fn(unsigned long unused)
{
+ TRACE_0D(TRC_SCHED_S_TIMER_FN);
+
set_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events);
perfc_incrc(sched_irq);
}
@@ -600,6 +538,8 @@ static void t_timer_fn(unsigned long unused)
{
struct task_struct *p = current;
+ TRACE_0D(TRC_SCHED_T_TIMER_FN);
+
if ( !is_idle_task(p) )
set_bit(_EVENT_TIMER, &p->shared_info->events);
@@ -613,6 +553,8 @@ static void dom_timer_fn(unsigned long data)
unsigned long cpu_mask = 0;
struct task_struct *p = (struct task_struct *)data;
+ TRACE_0D(TRC_SCHED_DOM_TIMER_FN);
+
cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
guest_event_notify(cpu_mask);
}
@@ -623,6 +565,8 @@ static void fallback_timer_fn(unsigned long unused)
{
struct task_struct *p = current;
+ TRACE_0D(TRC_SCHED_FALLBACK_TIMER_FN);
+
if ( !is_idle_task(p) )
update_dom_time(p->shared_info);
@@ -660,6 +604,29 @@ void __init scheduler_init(void)
}
schedule_data[0].idle = &idle0_task;
+
+ extern char opt_sched[];
+
+ for ( i = 0; schedulers[i] != NULL; i++ )
+ {
+ ops = *schedulers[i]; /* fetch operations structure */
+
+ if(strcmp(ops.opt_name, opt_sched) == 0)
+ break;
+ }
+
+ if ( schedulers[i] == NULL )
+ printk("Could not find scheduler: %s\n", opt_sched);
+
+ printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
+
+ if ( ops.do_schedule == NULL)
+ panic("Chosen scheduler has NULL do_schedule!");
+
+ if ( SCHED_FN_RET(init_scheduler) < 0 )
+ panic("Initialising scheduler failed!");
+
+ SCHED_FN_VOID(add_task, &idle0_task);
}
/*
@@ -698,12 +665,9 @@ static void dump_rqueue(struct list_head *queue, char *name)
(unsigned long) queue->next, (unsigned long) queue->prev);
list_for_each (list, queue) {
p = list_entry(list, struct task_struct, run_list);
- printk("%3d: %llu has=%c mcua=0x%04lX"
- " ev=0x%08X av=0x%08X c=0x%X%08X\n",
- loop++, p->domain,
- p->has_cpu ? 'T':'F',
- p->mcu_advance, p->evt, p->avt,
- (u32)(p->cpu_time>>32), (u32)p->cpu_time);
+ printk("%3d: %llu has=%c ", loop++, p->domain, p->has_cpu ? 'T':'F');
+ SCHED_FN_VOID(dump_runq_el, p);
+ printk("c=0x%X%08X\n", (u32)(p->cpu_time>>32), (u32)p->cpu_time);
printk(" l: %lx n: %lx p: %lx\n",
(unsigned long)list, (unsigned long)list->next,
(unsigned long)list->prev);
@@ -717,11 +681,13 @@ void dump_runq(u_char key, void *dev_id, struct pt_regs *regs)
s_time_t now = NOW();
int i;
- printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns NOW=0x%08X%08X\n",
- (u32)MCU, (u32)ctx_allow, (u32)(now>>32), (u32)now);
+ printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
+ SCHED_FN_VOID(dump_settings);
+ printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);
for (i = 0; i < smp_num_cpus; i++) {
spin_lock_irqsave(&schedule_lock[i], flags);
- printk("CPU[%02d] svt=0x%08X ", i, (s32)schedule_data[i].svt);
+ printk("CPU[%02d] ", i);
+ SCHED_FN_VOID(dump_cpu_state,i);
dump_rqueue(&schedule_data[i].runqueue, "rq");
spin_unlock_irqrestore(&schedule_lock[i], flags);
}
diff --git a/xen/include/hypervisor-ifs/dom0_ops.h b/xen/include/hypervisor-ifs/dom0_ops.h
index 8e84f38336..d06a9f641a 100644
--- a/xen/include/hypervisor-ifs/dom0_ops.h
+++ b/xen/include/hypervisor-ifs/dom0_ops.h
@@ -11,13 +11,14 @@
#define __DOM0_OPS_H__
#include "hypervisor-if.h"
+#include "sched-ctl.h"
/*
* Make sure you increment the interface version whenever you modify this file!
* This makes sure that old versions of dom0 tools will stop working in a
* well-defined way (rather than crashing the machine, for instance).
*/
-#define DOM0_INTERFACE_VERSION 0xAAAA0008
+#define DOM0_INTERFACE_VERSION 0xAAAA0009
#define MAX_CMD_LEN 256
#define MAX_DOMAIN_NAME 16
@@ -74,23 +75,13 @@ typedef struct dom0_builddomain_st
full_execution_context_t ctxt;
} dom0_builddomain_t;
-#define DOM0_BVTCTL 6
-typedef struct dom0_bvtctl_st
-{
- /* IN variables. */
- unsigned long ctx_allow; /* context switch allowance */
-} dom0_bvtctl_t;
+#define DOM0_SCHEDCTL 6
+ /* struct sched_ctl_cmd is from sched-ctl.h */
+typedef struct sched_ctl_cmd dom0_schedctl_t;
#define DOM0_ADJUSTDOM 7
-typedef struct dom0_adjustdom_st
-{
- /* IN variables. */
- domid_t domain; /* domain id */
- unsigned long mcu_adv; /* mcu advance: inverse of weight */
- unsigned long warp; /* time warp */
- unsigned long warpl; /* warp limit */
- unsigned long warpu; /* unwarp time requirement */
-} dom0_adjustdom_t;
+/* struct sched_adjdom_cmd is from sched-ctl.h */
+typedef struct sched_adjdom_cmd dom0_adjustdom_t;
#define DOM0_GETDOMAININFO 12
typedef struct dom0_getdomaininfo_st
@@ -234,7 +225,7 @@ typedef struct dom0_op_st
dom0_stopdomain_t stopdomain;
dom0_destroydomain_t destroydomain;
dom0_getmemlist_t getmemlist;
- dom0_bvtctl_t bvtctl;
+ dom0_schedctl_t schedctl;
dom0_adjustdom_t adjustdom;
dom0_builddomain_t builddomain;
dom0_getdomaininfo_t getdomaininfo;
diff --git a/xen/include/hypervisor-ifs/sched-ctl.h b/xen/include/hypervisor-ifs/sched-ctl.h
new file mode 100644
index 0000000000..bb0449908a
--- /dev/null
+++ b/xen/include/hypervisor-ifs/sched-ctl.h
@@ -0,0 +1,68 @@
+/**
+ * Generic scheduler control interface.
+ *
+ * Mark Williamson, (C) 2004 Intel Research Cambridge
+ */
+
+#ifndef _SCHED_CTL_H_
+#define _SCHED_CTL_H_
+
+/**
+ * When this file is changed, increment the version number. This ensures that
+ * tools will refuse to work (rather than causing a crash) when they're
+ * out-of-sync with the Xen version number.
+ */
+#define SCHED_CTL_IF_VER 0x0001
+
+/* scheduler types */
+#define SCHED_BVT 0
+#define SCHED_ATROPOS 1
+#define SCHED_RROBIN 2
+
+/* generic scheduler control command - union of all scheduler control
+ * command structures */
+struct sched_ctl_cmd
+{
+ unsigned int if_ver;
+ unsigned int sched_id;
+
+ union
+ {
+ struct bvt_ctl
+ {
+ /* IN variables. */
+ unsigned long ctx_allow; /* context switch allowance */
+ } bvt;
+
+ struct rrobin_ctl
+ {
+ /* IN variables */
+ u64 slice; /* round robin time slice */
+ } rrobin;
+ } u;
+};
+
+struct sched_adjdom_cmd
+{
+ unsigned int if_ver;
+ unsigned int sched_id;
+ domid_t domain;
+
+ union
+ {
+ struct bvt_adjdom
+ {
+ unsigned long mcu_adv; /* mcu advance: inverse of weight */
+ unsigned long warp; /* time warp */
+ unsigned long warpl; /* warp limit */
+ unsigned long warpu; /* unwarp time requirement */
+ } bvt;
+
+ struct atropos_adjdom
+ {
+ int xtratime;
+ } atropos;
+ } u;
+};
+
+#endif /* _SCHED_CTL_H_ */
diff --git a/xen/include/xeno/sched-if.h b/xen/include/xeno/sched-if.h
new file mode 100644
index 0000000000..683e73d4f6
--- /dev/null
+++ b/xen/include/xeno/sched-if.h
@@ -0,0 +1,90 @@
+#include <asm/types.h>
+
+/*
+ * Additional declarations for the generic scheduler interface. This should
+ * only be included by files that implement conforming schedulers.
+ *
+ * Portions by Mark Williamson are (C) 2004 Intel Research Cambridge
+ */
+
+#define BUCKETS 10
+
+typedef struct schedule_data_st
+{
+ struct list_head runqueue; /* runqueue */
+ struct task_struct *curr; /* current task */
+ struct task_struct *idle; /* idle task for this cpu */
+ void * sched_priv;
+ struct ac_timer s_timer; /* scheduling timer */
+#ifdef BUCKETS
+ u32 hist[BUCKETS]; /* for scheduler latency histogram */
+#endif
+} __cacheline_aligned schedule_data_t;
+
+
+typedef struct task_slice_st
+{
+ struct task_struct *task;
+ s_time_t time;
+} task_slice_t;
+
+struct scheduler
+{
+ char *name; /* full name for this scheduler */
+ char *opt_name; /* option name for this scheduler */
+ unsigned int sched_id; /* ID for this scheduler */
+
+ int (*init_scheduler) ();
+ int (*alloc_task) (struct task_struct *);
+ void (*add_task) (struct task_struct *);
+ void (*free_task) (struct task_struct *);
+ void (*rem_task) (struct task_struct *);
+ void (*wake_up) (struct task_struct *);
+ /* XXX why does do_block need to return anything at all? */
+ long (*do_block) (struct task_struct *);
+ task_slice_t (*do_schedule) (s_time_t);
+ int (*control) (struct sched_ctl_cmd *);
+ int (*adjdom) (struct task_struct *,
+ struct sched_adjdom_cmd *);
+ s32 (*reschedule) (struct task_struct *);
+ void (*dump_settings) (void);
+ void (*dump_cpu_state) (int);
+ void (*dump_runq_el) (struct task_struct *);
+};
+
+/* per CPU scheduler information */
+extern schedule_data_t schedule_data[];
+
+/*
+ * Wrappers for run-queue management. Must be called with the schedule_lock
+ * held.
+ */
+static inline void __add_to_runqueue_head(struct task_struct * p)
+{
+ list_add(&p->run_list, &schedule_data[p->processor].runqueue);
+}
+
+static inline void __add_to_runqueue_tail(struct task_struct * p)
+{
+ list_add_tail(&p->run_list, &schedule_data[p->processor].runqueue);
+}
+
+static inline void __del_from_runqueue(struct task_struct * p)
+{
+ list_del(&p->run_list);
+ p->run_list.next = NULL;
+}
+
+static inline int __task_on_runqueue(struct task_struct *p)
+{
+ return p->run_list.next != NULL;
+}
+
+#define next_domain(p) \
+ list_entry((p)->run_list.next, struct task_struct, run_list)
+
+
+static inline int __runqueue_empty(int cpu)
+{
+ return list_empty(&schedule_data[cpu].runqueue);
+}
diff --git a/xen/include/xeno/sched.h b/xen/include/xeno/sched.h
index 5f728565ca..dea80d0833 100644
--- a/xen/include/xeno/sched.h
+++ b/xen/include/xeno/sched.h
@@ -114,16 +114,9 @@ struct task_struct
s_time_t wokenup; /* time domain got woken up */
struct ac_timer timer; /* one-shot timer for timeout values */
- /* BVT scheduler specific. */
- unsigned long mcu_advance; /* inverse of weight */
- u32 avt; /* actual virtual time */
- u32 evt; /* effective virtual time */
- int warpback; /* warp? */
- long warp; /* virtual time warp */
- long warpl; /* warp limit */
- long warpu; /* unwarp time requirement */
- s_time_t warped; /* time it ran warped last time */
- s_time_t uwarped; /* time it ran unwarped last time */
+ s_time_t min_slice; /* minimum time before reschedule */
+
+ void *sched_priv; /* scheduler-specific data */
/* Network I/O */
net_vif_t *net_vif_list[MAX_DOMAIN_VIFS];
@@ -177,6 +170,7 @@ struct task_struct
#define TASK_UNINTERRUPTIBLE 2
#define TASK_STOPPED 4
#define TASK_DYING 8
+#define TASK_SCHED_PRIV 16
#include <asm/uaccess.h> /* for KERNEL_DS */
@@ -186,8 +180,6 @@ struct task_struct
domain: IDLE_DOMAIN_ID, \
state: TASK_RUNNING, \
has_cpu: 0, \
- evt: 0xffffffff, \
- avt: 0xffffffff, \
mm: IDLE0_MM, \
addr_limit: KERNEL_DS, \
thread: INIT_THREAD, \
@@ -202,9 +194,9 @@ extern struct task_struct *idle_task[NR_CPUS];
#include <xeno/slab.h>
-extern kmem_cache_t *task_struct_cachep;
-#define alloc_task_struct() \
- ((struct task_struct *)kmem_cache_alloc(task_struct_cachep,GFP_KERNEL))
+void free_task_struct(struct task_struct *p);
+struct task_struct *alloc_task_struct();
+
#define put_task_struct(_p) \
if ( atomic_dec_and_test(&(_p)->refcnt) ) release_task(_p)
#define get_task_struct(_p) \
@@ -251,15 +243,14 @@ extern spinlock_t schedule_lock[NR_CPUS] __cacheline_aligned;
void scheduler_init(void);
void schedulers_start(void);
void sched_add_domain(struct task_struct *p);
-int sched_rem_domain(struct task_struct *p);
-long sched_bvtctl(unsigned long ctx_allow);
-long sched_adjdom(domid_t dom, unsigned long mcu_adv, unsigned long warp,
- unsigned long warpl, unsigned long warpu);
+int sched_rem_domain(struct task_struct *p);
+long sched_ctl(struct sched_ctl_cmd *);
+long sched_adjdom(struct sched_adjdom_cmd *);
void init_idle_task(void);
void __wake_up(struct task_struct *p);
void wake_up(struct task_struct *p);
-unsigned long __reschedule(struct task_struct *p);
void reschedule(struct task_struct *p);
+unsigned long __reschedule(struct task_struct *p);
/* NB. Limited entry in Xen. Not for arbitrary use! */
asmlinkage void __enter_scheduler(void);
@@ -302,4 +293,4 @@ extern struct task_struct *task_list;
extern void update_process_times(int user);
-#endif
+#endif /*_LINUX_SCHED_H */