author:    mwilli2@equilibrium.research.intel-research.net    2004-03-17 18:31:06 +0000
committer: mwilli2@equilibrium.research.intel-research.net    2004-03-17 18:31:06 +0000
commit     bee5b0bb130f42dabd8cbdcd035d8f737e725dbc (patch)
tree       99ac0cc05ceea17ead1d618190f88dfa33ea7f86
parent     8306baac6f817aea60eb6e7acfac96cbb007ed5a (diff)
download   xen-bee5b0bb130f42dabd8cbdcd035d8f737e725dbc.tar.gz
           xen-bee5b0bb130f42dabd8cbdcd035d8f737e725dbc.tar.bz2
           xen-bee5b0bb130f42dabd8cbdcd035d8f737e725dbc.zip
bitkeeper revision 1.808 (4058996anVCLQRr3o_Adf9GqJybYSg)
Various updates related to the new generic scheduler API. The BVT scheduler has been ported to this API and a simple Round Robin scheduler has been added. There's a new generic control interface for setting scheduling parameters from userspace. Use the sched=xxx option at boot time to choose the scheduler. Default is BVT. The possibilities are "bvt" and "rrobin".
-rw-r--r--  .rootkeys                                   6
-rw-r--r--  docs/interface.tex                        303
-rw-r--r--  tools/xc/lib/xc.h                           7
-rw-r--r--  tools/xc/lib/xc_atropos.c                  38
-rw-r--r--  tools/xc/lib/xc_bvtsched.c                 19
-rw-r--r--  tools/xc/lib/xc_private.h                   1
-rw-r--r--  tools/xc/lib/xc_rrobin.c                   20
-rw-r--r--  tools/xc/py/Xc.c                           71
-rw-r--r--  xen/common/dom0_ops.c                      16
-rw-r--r--  xen/common/domain.c                         3
-rw-r--r--  xen/common/kernel.c                         3
-rw-r--r--  xen/common/keyhandler.c                    10
-rw-r--r--  xen/common/sched_bvt.c                    427
-rw-r--r--  xen/common/sched_rrobin.c                  56
-rw-r--r--  xen/common/schedule.c                     450
-rw-r--r--  xen/include/hypervisor-ifs/dom0_ops.h      25
-rw-r--r--  xen/include/hypervisor-ifs/sched-ctl.h     68
-rw-r--r--  xen/include/xeno/sched-if.h                90
-rw-r--r--  xen/include/xeno/sched.h                   33
19 files changed, 1339 insertions, 307 deletions
diff --git a/.rootkeys b/.rootkeys
index b000ea2a90..d302f37329 100644
--- a/.rootkeys
+++ b/.rootkeys
@@ -71,6 +71,7 @@
3fbba6dbDfYvJSsw9500b4SZyUhxjQ tools/xc/lib/Makefile
3fbba6dc1uU7U3IFeF6A-XEOYF2MkQ tools/xc/lib/rpm.spec
3fbba6dcrNxtygEcgJYAJJ1gCQqfsA tools/xc/lib/xc.h
+40589968oCfoUlXd460CjVAkBE8IBA tools/xc/lib/xc_atropos.c
3fbba6dbEVkVMX0JuDFzap9jeaucGA tools/xc/lib/xc_bvtsched.c
3fbba6dbasJQV-MVElDC0DGSHMiL5w tools/xc/lib/xc_domain.c
40278d99BLsfUv3qxv0I8C1sClZ0ow tools/xc/lib/xc_elf.h
@@ -83,6 +84,7 @@
4051bce6CHAsYh8P5t2OHDtRWOP9og tools/xc/lib/xc_physdev.c
3fbba6dctWRWlFJkYb6hdix2X4WMuw tools/xc/lib/xc_private.c
3fbba6dcbVrG2hPzEzwdeV_UC8kydQ tools/xc/lib/xc_private.h
+40589968UQFnJeOMn8UIFLbXBuwXjw tools/xc/lib/xc_rrobin.c
3fbba6dcoGq9hQlksrBUfC2P5F6sGg tools/xc/lib/xc_vbd.c
3fbba6dc38q-ioRlwSR_quw4G3qUeQ tools/xc/lib/xc_vif.c
3fbd0a3dTwnDcfdw0-v46dPbX98zDw tools/xc/py/Makefile
@@ -169,6 +171,8 @@
4051bcecFeq4DE70p4zGO5setf47CA xen/common/physdev.c
4006e659i9j-doVxY7DKOGU4XVin1Q xen/common/rbtree.c
3ddb79bdHqdQpATqC0rmUZNbsb6L6A xen/common/resource.c
+40589968dD2D1aejwSOvrROg7fOvGQ xen/common/sched_bvt.c
+40589968be_t_n0-w6ggceW7h-sx0w xen/common/sched_rrobin.c
3e397e6619PgAfBbw2XFbXkewvUWgw xen/common/schedule.c
3ddb79bdB9RNMnkQnUyZ5C9hhMSQQw xen/common/slab.c
3ddb79bd0gVQYmL2zvuJnldvD0AGxQ xen/common/softirq.c
@@ -497,6 +501,7 @@
3ead095dE_VF-QA88rl_5cWYRWtRVQ xen/include/hypervisor-ifs/kbd.h
3ddb79c2oRPrzClk3zbTkRHlpumzKA xen/include/hypervisor-ifs/network.h
4051db79512nOCGweabrFWO2M2h5ng xen/include/hypervisor-ifs/physdev.h
+40589968wmhPmV5-ENbBYmMjnedgKw xen/include/hypervisor-ifs/sched-ctl.h
404f3d2eR2Owk-ZcGOx9ULGHg3nrww xen/include/hypervisor-ifs/trace.h
3f0d22cbroqp_BkoDPwkfRJhaw1LiQ xen/include/hypervisor-ifs/vbd.h
3ddb79c4qbCoOFHrv9sCGshbWzBVlQ xen/include/scsi/scsi.h
@@ -562,6 +567,7 @@
3ddb79c04nQVR3EYM5L4zxDV_MCo1g xen/include/xeno/prefetch.h
4006e65fWMwLqcocgik6wbF0Eeh0Og xen/include/xeno/rbtree.h
3e4540ccU1sgCx8seIMGlahmMfv7yQ xen/include/xeno/reboot.h
+40589969nPq3DMzv24RDb5LXE9brHw xen/include/xeno/sched-if.h
3ddb79c0LzqqS0LhAQ50ekgj4oGl7Q xen/include/xeno/sched.h
403a06a7H0hpHcKpAiDe5BPnaXWTlA xen/include/xeno/serial.h
3ddb79c0VDeD-Oft5eNfMneTU3D1dQ xen/include/xeno/skbuff.h
diff --git a/docs/interface.tex b/docs/interface.tex
index 2736a0412d..84003de1b6 100644
--- a/docs/interface.tex
+++ b/docs/interface.tex
@@ -353,7 +353,7 @@ create ``virtual disks'' on demand.
\subsection{Virtual Disk Management}
The VD management code consists of a set of python libraries. It can therefore
be accessed by custom scripts as well as the convenience scripts provided. The
-VD database is a SQLite database in /var/db/xen\_vdisk.sqlite.
+VD database is a SQLite database in /var/db/xen\_vdisks.sqlite.
The VD scripts and general VD usage are documented in the VBD-HOWTO.txt.
@@ -379,6 +379,307 @@ giving the page back to the hypervisor, or to use them for storing page tables.
and providing control interfaces for managing scheduling, networking, and
blocks.
+\chapter{CPU Scheduler}
+
+Xen offers a uniform API for CPU schedulers. It is possible to choose
+from a number of schedulers at boot and it should be easy to add more.
+
+\paragraph*{Note: SMP host support}
+Xen has always supported SMP host systems. Domains are statically assigned to
+CPUs, either at creation time or by manually pinning them to a particular CPU.
+The current schedulers then run locally on each CPU to decide which of the
+assigned domains should be run there.
+
+\section{Standard Schedulers}
+
+The BVT and Round Robin schedulers are part of the standard Xen
+distribution. A port of the Atropos scheduler from the Nemesis
+operating system is almost complete and will be added shortly.
+
+\subsection{Borrowed Virtual Time (BVT)}
+
+This was the original Xen scheduler. BVT is designed for general-purpose
+environments but also provides support for latency-sensitive threads. It
+provides long-term weighted sharing but allows tasks a limited ability to
+``warp back'' in virtual time so that they are dispatched earlier.
+
+BVT can be activated by specifying {\tt sched=bvt} as a boot argument to Xen.
+
+\subsection{Round Robin}
+
+The round robin scheduler is a very simple demonstration of the basic parts
+of the scheduler API.
+
+Round robin can be activated by specifying {\tt sched=rrobin} as a boot
+argument to Xen.
+
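+For illustration, a GRUB entry selecting the round robin scheduler might look
+as follows (a sketch only: the kernel image names, memory size and other
+arguments are assumptions and will vary between installations):
+
+\begin{verbatim}
+title Xen (round robin scheduler)
+    kernel /boot/xen.gz dom0_mem=131072 sched=rrobin
+    module /boot/xenolinux.gz root=/dev/sda1 console=tty0
+\end{verbatim}
+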
+\section{Scheduling API}
+
+The scheduling API is used by both the schedulers described above and should
+also be used by any new schedulers. It provides a generic interface and also
+implements much of the ``boilerplate'' code.
+
+\paragraph*{Note:} The scheduler API is still under active development, so
+there may be some further changes to it, although these are expected to be small.
+
+Schedulers conforming to this API are described by the following
+structure:
+
+\begin{verbatim}
+struct scheduler
+{
+ char *name; /* full name for this scheduler */
+ char *opt_name; /* option name for this scheduler */
+ unsigned int sched_id; /* ID for this scheduler */
+
+ int (*init_scheduler) ();
+ int (*alloc_task) (struct task_struct *);
+ void (*add_task) (struct task_struct *);
+ void (*free_task) (struct task_struct *);
+ void (*rem_task) (struct task_struct *);
+ void (*wake_up) (struct task_struct *);
+ long (*do_block) (struct task_struct *);
+ task_slice_t (*do_schedule) (s_time_t);
+ int (*control) (struct sched_ctl_cmd *);
+ int (*adjdom) (struct task_struct *,
+ struct sched_adjdom_cmd *);
+ s32 (*reschedule) (struct task_struct *);
+ void (*dump_settings) (void);
+ void (*dump_cpu_state) (int);
+ void (*dump_runq_el) (struct task_struct *);
+};
+\end{verbatim}
+
+The only method that {\em must} be implemented is
+{\tt do\_schedule()}. Note, however, that if the {\tt wake\_up()} method is
+not implemented then woken tasks will never be placed on the runqueue!
+
+The fields of the above structure are described in more detail below.
+
+\subsubsection{name}
+
+The name field is an arbitrary descriptive ASCII string.
+
+\subsubsection{opt\_name}
+
+This field is the value of the {\tt sched=} boot-time option that will select
+this scheduler.
+
+\subsubsection{sched\_id}
+
+This is an integer that uniquely identifies this scheduler. There should be a
+macro corresponding to this scheduler ID in {\tt <hypervisor-ifs/sched-ctl.h>}.
+
+\subsubsection{init\_scheduler}
+
+\paragraph*{Purpose}
+
+This is a function for performing any scheduler-specific initialisation. For
+instance, it might allocate memory for per-CPU scheduler data and initialise it
+appropriately.
+
+\paragraph*{Call environment}
+
+This function is called after the initialisation performed by the generic
+layer. The function is called exactly once, for the scheduler that has been
+selected.
+
+\paragraph*{Return values}
+
+This should return negative on failure --- failure to initialise the scheduler
+will cause an immediate panic.
+
+\subsubsection{alloc\_task}
+
+\paragraph*{Purpose}
+This is called when a {\tt task\_struct} is allocated by the generic scheduler
+layer. A particular scheduler implementation may use this method to allocate
+per-task data for this task. It may use the {\tt sched\_priv} pointer in the
+{\tt task\_struct} to point to this data.
+
+\paragraph*{Call environment}
+The generic layer guarantees that the {\tt sched\_priv} field will
+remain intact from the time this method is called until the task is
+deallocated (so long as the scheduler implementation does not change
+it!).
+
+\paragraph*{Return values}
+Negative on failure.
+
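+As an illustrative sketch (modelled on the BVT scheduler in this revision,
+with hypothetical names), an {\tt alloc\_task} implementation might simply
+allocate the private structure from a SLAB cache and attach it to the task:
+
+\begin{verbatim}
+static int my_alloc_task(struct task_struct *p)
+{
+    /* allocate per-task private data and hang it off sched_priv */
+    p->sched_priv = kmem_cache_alloc(dom_info_cache, GFP_KERNEL);
+
+    /* a negative return value reports failure to the generic layer */
+    return (p->sched_priv == NULL) ? -1 : 0;
+}
+\end{verbatim}
+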
+\subsubsection{add\_task}
+
+\paragraph*{Purpose}
+
+Called when a task is initially added by the generic layer.
+
+\paragraph*{Call environment}
+
+The fields in the {\tt task\_struct} are now filled out and available for use.
+Schedulers should implement appropriate initialisation of any per-task private
+information in this method.
+
+\subsubsection{free\_task}
+
+\paragraph*{Purpose}
+
+Schedulers should free the space used by any associated private data
+structures.
+
+\paragraph*{Call environment}
+
+This is called when a {\tt task\_struct} is about to be deallocated.
+The generic layer will have done generic task removal operations and
+(if implemented) called the scheduler's {\tt rem\_task} method before
+this method is called.
+
+\subsubsection{rem\_task}
+
+\paragraph*{Purpose}
+
+This is called when a task is being removed from scheduling.
+
+\subsubsection{wake\_up}
+
+\paragraph*{Purpose}
+
+Called when a task is woken up, this method should put the task on the runqueue
+(or do the scheduler-specific equivalent action).
+
+\paragraph*{Call environment}
+
+The generic layer guarantees that the task is already in state
+{\tt TASK\_RUNNING}.
+
+\subsubsection{do\_block}
+
+\paragraph*{Purpose}
+
+This function is called when a task is blocked. It should not remove the
+task from the runqueue.
+
+\paragraph*{Call environment}
+
+On entry to this method, the {\tt EVENTS\_MASTER\_ENABLE\_BIT} is already set
+and the task state has been changed to {\tt TASK\_INTERRUPTIBLE}.
+
+\subsubsection{do\_schedule}
+
+This method must be implemented.
+
+\paragraph*{Purpose}
+
+The method is called each time a new task must be chosen for scheduling on the
+current CPU. The current time is passed as the single argument (the current
+task can be found using the {\tt current} variable).
+
+This method should select the next task to run on this CPU and set its
+minimum time to run, as well as returning the data described below.
+
+This method should also take the appropriate action if the previous
+task has blocked, e.g. removing it from the runqueue.
+
+\paragraph*{Call environment}
+
+The other fields in the {\tt task\_struct} are updated by the generic layer,
+which also performs all Xen-specific tasks and performs the actual task switch
+(unless the previous task has been chosen again).
+
+This method is called with the {\tt schedule\_lock} held for the current CPU
+and with interrupts disabled.
+
+\paragraph*{Return values}
+
+Must return a {\tt struct task\_slice} describing which task to run next and
+the maximum length of time for which it should run.
+
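+As a minimal sketch (based on the round robin scheduler in this revision,
+with hypothetical names), a {\tt do\_schedule} implementation might requeue
+the previously-running task and return the head of the runqueue together with
+a fixed slice:
+
+\begin{verbatim}
+static task_slice_t my_do_schedule(s_time_t now)
+{
+    struct task_struct *prev = current;
+    int cpu = prev->processor;
+    task_slice_t ret;
+
+    __del_from_runqueue(prev);
+
+    /* still runnable? put it at the back of the queue */
+    if ( prev->state == TASK_RUNNING )
+        __add_to_runqueue_tail(prev);
+
+    ret.task = list_entry(schedule_data[cpu].runqueue.next,
+                          struct task_struct, run_list);
+    ret.time = MILLISECS(10); /* arbitrary fixed slice */
+
+    return ret;
+}
+\end{verbatim}
+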
+\subsubsection{control}
+
+\paragraph*{Purpose}
+
+This method is called for global scheduler control operations. It takes a
+pointer to a {\tt struct sched\_ctl\_cmd}, from which it should select the
+appropriate command data.
+
+\paragraph*{Call environment}
+
+The generic layer guarantees that when this method is called, the caller was
+using the same control interface version and that the caller selected the
+correct scheduler ID, hence the scheduler's implementation does not need to
+sanity-check these parts of the call.
+
+\paragraph*{Return values}
+
+This function should return the value to be passed back to user space, hence it
+should either be 0 or an appropriate errno value.
+
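+For example (mirroring the round robin scheduler in this revision, with
+hypothetical names), the {\tt control} method of a scheduler with a single
+global parameter can be very short:
+
+\begin{verbatim}
+static s_time_t my_slice = MILLISECS(10);
+
+static int my_ctl(struct sched_ctl_cmd *cmd)
+{
+    my_slice = cmd->u.rrobin.slice; /* select data from the union */
+    return 0;                       /* passed back to user space */
+}
+\end{verbatim}
+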
+\subsubsection{adjdom}
+
+\paragraph*{Purpose}
+
+This method is called to adjust the scheduling parameters of a particular
+domain.
+
+\paragraph*{Call environment}
+
+The generic layer guarantees that the caller has specified the correct
+control interface version and scheduler ID and that the supplied {\tt
+task\_struct} will not be deallocated during the call (hence it is not
+necessary to {\tt get\_task\_struct}).
+
+\paragraph*{Return values}
+
+This function should return the value to be passed back to user space, hence it
+should either be 0 or an appropriate errno value.
+
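+As a condensed sketch of the BVT scheduler's {\tt adjdom} method from this
+revision (the name is hypothetical), the parameters are taken from the
+appropriate union member and the private data is updated under the schedule
+lock:
+
+\begin{verbatim}
+static int my_adjdom(struct task_struct *p, struct sched_adjdom_cmd *cmd)
+{
+    struct bvt_adjdom *params = &cmd->u.bvt;
+
+    spin_lock_irq(&schedule_lock[p->processor]);
+    ((struct bvt_dom_info *)p->sched_priv)->mcu_advance = params->mcu_adv;
+    spin_unlock_irq(&schedule_lock[p->processor]);
+
+    return 0;
+}
+\end{verbatim}
+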
+\subsubsection{reschedule}
+
+\paragraph*{Purpose}
+
+This method is called to determine if a reschedule is required as a result of a
+particular task.
+
+\paragraph*{Call environment}
+The generic layer will itself cause a reschedule if the current domain is the
+idle task or has already exceeded its minimum time slice. The generic layer
+guarantees that the task passed is not currently running but is on the
+runqueue.
+
+\paragraph*{Return values}
+
+Should return a mask of CPUs to cause a reschedule on.
+
+\subsubsection{dump\_settings}
+
+\paragraph*{Purpose}
+
+If implemented, this should dump any private global settings for this
+scheduler to the console.
+
+\paragraph*{Call environment}
+
+This function is called with interrupts enabled.
+
+\subsubsection{dump\_cpu\_state}
+
+\paragraph*{Purpose}
+
+This method should dump any private settings for the specified CPU.
+
+\paragraph*{Call environment}
+
+This function is called with interrupts disabled and the {\tt schedule\_lock}
+for the specified CPU held.
+
+\subsubsection{dump\_runq\_el}
+
+\paragraph*{Purpose}
+
+This method should dump any private settings for the specified task.
+
+\paragraph*{Call environment}
+
+This function is called with interrupts disabled and the {\tt schedule\_lock}
+for the task's CPU held.
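+
+Finally, a trivial scheduler conforming to this API can be declared by filling
+in only the methods it needs; unimplemented methods are left {\tt NULL} and
+skipped by the generic layer. This sketch mirrors the round robin declaration
+shipped in this revision (a real scheduler would register its own ID):
+
+\begin{verbatim}
+struct scheduler sched_example_def = {
+    .name        = "Example Scheduler",
+    .opt_name    = "example",        /* selected with sched=example */
+    .sched_id    = SCHED_RROBIN,     /* placeholder: define a new ID */
+
+    .wake_up     = __add_to_runqueue_head,
+    .do_schedule = my_do_schedule,   /* as sketched above */
+};
+\end{verbatim}
+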
\chapter{Debugging}
diff --git a/tools/xc/lib/xc.h b/tools/xc/lib/xc.h
index fd1494dc13..aba2906842 100644
--- a/tools/xc/lib/xc.h
+++ b/tools/xc/lib/xc.h
@@ -81,6 +81,13 @@ int xc_bvtsched_domain_set(int xc_handle,
unsigned long warpl,
unsigned long warpu);
+int xc_atropos_domain_set(int xc_handle,
+ u64 domid,
+ int xtratime);
+
+int xc_rrobin_global_set(int xc_handle,
+ u64 slice);
+
typedef struct {
unsigned long credit_bytes;
unsigned long credit_usec;
diff --git a/tools/xc/lib/xc_atropos.c b/tools/xc/lib/xc_atropos.c
new file mode 100644
index 0000000000..b9ee719b0f
--- /dev/null
+++ b/tools/xc/lib/xc_atropos.c
@@ -0,0 +1,38 @@
+/******************************************************************************
+ * xc_atropos.c
+ *
+ * API for manipulating parameters of the Atropos scheduler.
+ *
+ * by Mark Williamson, Copyright (c) 2004 Intel Research Cambridge.
+ */
+
+#include "xc_private.h"
+
+int xc_atropos_global_set(int xc_handle,
+ unsigned long ctx_allow)
+{
+ dom0_op_t op;
+ op.cmd = DOM0_SCHEDCTL;
+ op.u.schedctl.if_ver = SCHED_CTL_IF_VER;
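+    /* XXX sched-ctl.h has no Atropos control member yet, so this reuses BVT */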
+ op.u.schedctl.sched_id = SCHED_BVT;
+
+ op.u.schedctl.u.bvt.ctx_allow = ctx_allow;
+ return do_dom0_op(xc_handle, &op);
+}
+
+int xc_atropos_domain_set(int xc_handle,
+ u64 domid, int xtratime)
+{
+ dom0_op_t op;
+ op.cmd = DOM0_ADJUSTDOM;
+
+ op.u.adjustdom.domain = (domid_t)domid;
+ op.u.adjustdom.if_ver = SCHED_CTL_IF_VER;
+ op.u.adjustdom.sched_id = SCHED_ATROPOS;
+
+ op.u.adjustdom.u.atropos.xtratime = xtratime;
+
+ printf("Doing dom0 op!\n");
+
+ return do_dom0_op(xc_handle, &op);
+}
diff --git a/tools/xc/lib/xc_bvtsched.c b/tools/xc/lib/xc_bvtsched.c
index 57554cd2e6..e38f9cf09e 100644
--- a/tools/xc/lib/xc_bvtsched.c
+++ b/tools/xc/lib/xc_bvtsched.c
@@ -12,8 +12,10 @@ int xc_bvtsched_global_set(int xc_handle,
unsigned long ctx_allow)
{
dom0_op_t op;
- op.cmd = DOM0_BVTCTL;
- op.u.bvtctl.ctx_allow = ctx_allow;
+ op.cmd = DOM0_SCHEDCTL;
+ op.u.schedctl.sched_id = SCHED_BVT;
+ op.u.schedctl.if_ver = SCHED_CTL_IF_VER;
+ op.u.schedctl.u.bvt.ctx_allow = ctx_allow;
return do_dom0_op(xc_handle, &op);
}
@@ -25,11 +27,16 @@ int xc_bvtsched_domain_set(int xc_handle,
unsigned long warpu)
{
dom0_op_t op;
+ struct bvt_adjdom *adjptr = &op.u.adjustdom.u.bvt;
+
op.cmd = DOM0_ADJUSTDOM;
+ op.u.adjustdom.sched_id = SCHED_BVT;
+ op.u.adjustdom.if_ver = SCHED_CTL_IF_VER;
op.u.adjustdom.domain = (domid_t)domid;
- op.u.adjustdom.mcu_adv = mcuadv;
- op.u.adjustdom.warp = warp;
- op.u.adjustdom.warpl = warpl;
- op.u.adjustdom.warpu = warpu;
+
+ adjptr->mcu_adv = mcuadv;
+ adjptr->warp = warp;
+ adjptr->warpl = warpl;
+ adjptr->warpu = warpu;
return do_dom0_op(xc_handle, &op);
}
diff --git a/tools/xc/lib/xc_private.h b/tools/xc/lib/xc_private.h
index d5ce8947ea..859d3a7740 100644
--- a/tools/xc/lib/xc_private.h
+++ b/tools/xc/lib/xc_private.h
@@ -23,6 +23,7 @@
#include <dom0_ops.h>
#include <vbd.h>
#include <event_channel.h>
+#include <sched-ctl.h>
#define _PAGE_PRESENT 0x001
#define _PAGE_RW 0x002
diff --git a/tools/xc/lib/xc_rrobin.c b/tools/xc/lib/xc_rrobin.c
new file mode 100644
index 0000000000..4d986cee83
--- /dev/null
+++ b/tools/xc/lib/xc_rrobin.c
@@ -0,0 +1,20 @@
+/******************************************************************************
+ * xc_rrobin.c
+ *
+ * API for manipulating parameters of the Round Robin scheduler
+ *
+ * by Mark Williamson, Copyright (c) 2004 Intel Research Cambridge.
+ */
+
+#include "xc_private.h"
+
+int xc_rrobin_global_set(int xc_handle, u64 slice)
+{
+ dom0_op_t op;
+ op.cmd = DOM0_SCHEDCTL;
+ op.u.schedctl.if_ver = SCHED_CTL_IF_VER;
+ op.u.schedctl.sched_id = SCHED_RROBIN;
+
+ op.u.schedctl.u.rrobin.slice = slice;
+ return do_dom0_op(xc_handle, &op);
+}
diff --git a/tools/xc/py/Xc.c b/tools/xc/py/Xc.c
index 8ed16ee505..cef2a046ef 100644
--- a/tools/xc/py/Xc.c
+++ b/tools/xc/py/Xc.c
@@ -290,10 +290,10 @@ static PyObject *pyxc_bvtsched_domain_set(PyObject *self,
u64 dom;
unsigned long mcuadv, warp, warpl, warpu;
- static char *kwd_list[] = { "dom", "mcuadv", "warp", "warpl",
+ static char *kwd_list[] = { "dom", "mcuadv", "warp", "warpl",
"warpu", NULL };
- if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Lllll", kwd_list,
+ if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Lllll", kwd_list,
&dom, &mcuadv, &warp, &warpl, &warpu) )
return NULL;
@@ -862,6 +862,49 @@ static PyObject *pyxc_physinfo(PyObject *self,
"cpu_khz", info.cpu_khz);
}
+static PyObject *pyxc_atropos_domain_set(PyObject *self,
+ PyObject *args,
+ PyObject *kwds)
+{
+ XcObject *xc = (XcObject *)self;
+ PyObject *ret_obj;
+ int xtratime;
+ u64 domid;
+
+ static char *kwd_list[] = { "dom", "xtratime", NULL };
+
+ if( !PyArg_ParseTupleAndKeywords(args, kwds, "Li", kwd_list, &domid,
+ &xtratime) )
+ return NULL;
+
+ if ( xc_atropos_domain_set(xc->xc_handle, domid, xtratime) != 0 )
+ return PyErr_SetFromErrno(xc_error);
+
+ Py_INCREF(zero);
+ return zero;
+}
+
+static PyObject *pyxc_rrobin_global_set(PyObject *self,
+ PyObject *args,
+ PyObject *kwds)
+{
+ XcObject *xc = (XcObject *)self;
+ PyObject *ret_obj;
+ u64 slice;
+
+ static char *kwd_list[] = { "slice", NULL };
+
+ if( !PyArg_ParseTupleAndKeywords(args, kwds, "L", kwd_list, &slice) )
+ return NULL;
+
+ if ( xc_rrobin_global_set(xc->xc_handle, slice) != 0 )
+ return PyErr_SetFromErrno(xc_error);
+
+ Py_INCREF(zero);
+ return zero;
+}
+
+
static PyMethodDef pyxc_methods[] = {
{ "domain_create",
(PyCFunction)pyxc_domain_create,
@@ -955,15 +998,15 @@ static PyMethodDef pyxc_methods[] = {
" cmdline [str, n/a]: Kernel parameters, if any.\n\n"
"Returns: [int] 0 on success; -1 on error.\n" },
- { "bvtsched_global_set",
- (PyCFunction)pyxc_bvtsched_global_set,
+ { "bvtsched_global_set",
+ (PyCFunction)pyxc_bvtsched_global_set,
METH_VARARGS | METH_KEYWORDS, "\n"
"Set global tuning parameters for Borrowed Virtual Time scheduler.\n"
" ctx_allow [int]: Minimal guaranteed quantum (I think!).\n\n"
"Returns: [int] 0 on success; -1 on error.\n" },
- { "bvtsched_domain_set",
- (PyCFunction)pyxc_bvtsched_domain_set,
+ { "bvtsched_domain_set",
+ (PyCFunction)pyxc_bvtsched_domain_set,
METH_VARARGS | METH_KEYWORDS, "\n"
"Set per-domain tuning parameters for Borrowed Virtual Time scheduler.\n"
" dom [long]: Identifier of domain to be tuned.\n"
@@ -973,6 +1016,22 @@ static PyMethodDef pyxc_methods[] = {
" warpu [int]: Internal BVT parameter.\n\n"
"Returns: [int] 0 on success; -1 on error.\n" },
+ { "atropos_domain_set",
+ (PyCFunction)pyxc_atropos_domain_set,
+ METH_VARARGS | METH_KEYWORDS, "\n"
+ "Set the extra time flag for a domain when running with Atropos.\n"
+ " dom [long]: domain to set\n"
+ " xtratime [int]: boolean\n"
+ "Returns: [int] 0 on success; -1 on error.\n" },
+
+ { "rrobin_global_set",
+ (PyCFunction)pyxc_rrobin_global_set,
+      METH_VARARGS | METH_KEYWORDS, "\n"
+ "Set Round Robin scheduler slice.\n"
+ " slice [long]: Round Robin scheduler slice\n"
+ "Returns: [int] 0 on success, throws an exception on failure\n"
+ },
+
{ "vif_scheduler_set",
(PyCFunction)pyxc_vif_scheduler_set,
METH_VARARGS | METH_KEYWORDS, "\n"
diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c
index b39ead491c..1d69f35bf3 100644
--- a/xen/common/dom0_ops.c
+++ b/xen/common/dom0_ops.c
@@ -18,6 +18,7 @@
#include <asm/pdb.h>
#include <xeno/trace.h>
#include <xeno/console.h>
+#include <hypervisor-ifs/sched-ctl.h>
extern unsigned int alloc_new_dom_mem(struct task_struct *, unsigned int);
@@ -196,22 +197,15 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
}
break;
- case DOM0_BVTCTL:
+ case DOM0_SCHEDCTL:
{
- unsigned long ctx_allow = op->u.bvtctl.ctx_allow;
- ret = sched_bvtctl(ctx_allow);
+ ret = sched_ctl(&op->u.schedctl);
}
break;
case DOM0_ADJUSTDOM:
{
- domid_t dom = op->u.adjustdom.domain;
- unsigned long mcu_adv = op->u.adjustdom.mcu_adv;
- unsigned long warp = op->u.adjustdom.warp;
- unsigned long warpl = op->u.adjustdom.warpl;
- unsigned long warpu = op->u.adjustdom.warpu;
-
- ret = sched_adjdom(dom, mcu_adv, warp, warpl, warpu);
+ ret = sched_adjdom(&op->u.adjustdom);
}
break;
@@ -281,7 +275,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
if ( (p->state == TASK_STOPPED) || (p->state == TASK_DYING) )
op->u.getdomaininfo.state = DOMSTATE_STOPPED;
op->u.getdomaininfo.hyp_events = p->hyp_events;
- op->u.getdomaininfo.mcu_advance = p->mcu_advance;
+// op->u.getdomaininfo.mcu_advance = p->mcu_advance;
op->u.getdomaininfo.tot_pages = p->tot_pages;
op->u.getdomaininfo.cpu_time = p->cpu_time;
op->u.getdomaininfo.shared_info_frame =
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 8921ee246d..e61f02a26a 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -43,7 +43,6 @@ struct task_struct *do_createdomain(domid_t dom_id, unsigned int cpu)
if ( (p = alloc_task_struct()) == NULL )
return NULL;
- memset(p, 0, sizeof(*p));
atomic_set(&p->refcnt, 1);
@@ -496,7 +495,7 @@ void release_task(struct task_struct *p)
UNSHARE_PFN(virt_to_page(p->shared_info));
free_all_dom_mem(p);
- kmem_cache_free(task_struct_cachep, p);
+ free_task_struct(p);
}
diff --git a/xen/common/kernel.c b/xen/common/kernel.c
index f99f3fac32..5f2f27728f 100644
--- a/xen/common/kernel.c
+++ b/xen/common/kernel.c
@@ -71,6 +71,8 @@ int opt_watchdog=0;
unsigned char opt_pdb[10] = "none";
/* opt_tbuf_size: trace buffer size (in pages) */
unsigned int opt_tbuf_size = 1;
+/* opt_sched: scheduler - default to Borrowed Virtual Time */
+char opt_sched[10] = "bvt";
static struct {
unsigned char *name;
@@ -91,6 +93,7 @@ static struct {
{ "watchdog", OPT_BOOL, &opt_watchdog },
{ "pdb", OPT_STR, &opt_pdb },
{ "tbuf_size", OPT_UINT, &opt_tbuf_size },
+ { "sched", OPT_STR, &opt_sched },
{ NULL, 0, NULL }
};
diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c
index 32786fdfd1..75f1e38c56 100644
--- a/xen/common/keyhandler.c
+++ b/xen/common/keyhandler.c
@@ -86,7 +86,15 @@ static char *task_states[] =
NULL,
NULL,
NULL,
- "Dying ",
+ "Dying ",
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ "Sched priv"
};
void do_task_queues(u_char key, void *dev_id, struct pt_regs *regs)
diff --git a/xen/common/sched_bvt.c b/xen/common/sched_bvt.c
new file mode 100644
index 0000000000..f473e3f760
--- /dev/null
+++ b/xen/common/sched_bvt.c
@@ -0,0 +1,427 @@
+/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*-
+ ****************************************************************************
+ * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
+ * (C) 2002-2003 University of Cambridge
+ * (C) 2004 - Mark Williamson - Intel Research Cambridge
+ ****************************************************************************
+ *
+ * File: common/schedule.c
+ * Author: Rolf Neugebauer & Keir Fraser
+ * Updated for generic API by Mark Williamson
+ *
+ * Description: CPU scheduling
+ * implements A Borrowed Virtual Time scheduler.
+ * (see Duda & Cheriton SOSP'99)
+ */
+
+#include <xeno/config.h>
+#include <xeno/init.h>
+#include <xeno/lib.h>
+#include <xeno/sched.h>
+#include <xeno/delay.h>
+#include <xeno/event.h>
+#include <xeno/time.h>
+#include <xeno/ac_timer.h>
+#include <xeno/interrupt.h>
+#include <xeno/timer.h>
+#include <xeno/perfc.h>
+#include <xeno/sched-if.h>
+#include <xeno/slab.h>
+
+/* all per-domain BVT-specific scheduling info is stored here */
+struct bvt_dom_info
+{
+ unsigned long mcu_advance; /* inverse of weight */
+ u32 avt; /* actual virtual time */
+ u32 evt; /* effective virtual time */
+ int warpback; /* warp? */
+ long warp; /* virtual time warp */
+ long warpl; /* warp limit */
+ long warpu; /* unwarp time requirement */
+ s_time_t warped; /* time it ran warped last time */
+ s_time_t uwarped; /* time it ran unwarped last time */
+};
+
+struct bvt_cpu_info
+{
+ unsigned long svt; /* XXX check this is unsigned long! */
+};
+
+
+#define DOM_INF(p) ((struct bvt_dom_info *)(p)->sched_priv)
+#define CPU_INF(cpu) ((struct bvt_cpu_info *)(schedule_data[cpu]).sched_priv)
+#define CPU_SVT(cpu) (CPU_INF(cpu)->svt)
+
+#define MCU (s32)MICROSECS(100) /* Minimum unit */
+#define MCU_ADVANCE 10 /* default weight */
+#define TIME_SLOP (s32)MICROSECS(50) /* allow time to slip a bit */
+static s32 ctx_allow = (s32)MILLISECS(5); /* context switch allowance */
+
+/* SLAB cache for struct bvt_dom_info objects */
+static kmem_cache_t *dom_info_cache;
+
+/*
+ * Calculate the effective virtual time for a domain. Take into account
+ * warping limits
+ */
+static void __calc_evt(struct bvt_dom_info *inf)
+{
+ s_time_t now = NOW();
+
+ if ( inf->warpback )
+ {
+ if ( ((now - inf->warped) < inf->warpl) &&
+ ((now - inf->uwarped) > inf->warpu) )
+ {
+ /* allowed to warp */
+ inf->evt = inf->avt - inf->warp;
+ }
+ else
+ {
+ /* warped for too long -> unwarp */
+ inf->evt = inf->avt;
+ inf->uwarped = now;
+ inf->warpback = 0;
+ }
+ }
+ else
+ {
+ inf->evt = inf->avt;
+ }
+}
+
+/**
+ * bvt_alloc_task - allocate BVT private structures for a task
+ * @p: task to allocate private structures for
+ *
+ * Returns non-zero on failure.
+ */
+int bvt_alloc_task(struct task_struct *p)
+{
+ DOM_INF(p)
+ = (struct bvt_dom_info *)kmem_cache_alloc(dom_info_cache,GFP_KERNEL);
+
+ if ( DOM_INF(p) == NULL )
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Add and remove a domain
+ */
+void bvt_add_task(struct task_struct *p)
+{
+ struct bvt_dom_info *inf = DOM_INF(p);
+
+ ASSERT(inf != NULL);
+ ASSERT(p != NULL);
+
+ inf->mcu_advance = MCU_ADVANCE;
+
+ if ( p->domain == IDLE_DOMAIN_ID )
+ {
+ inf->avt = inf->evt = ~0U;
+ }
+ else
+ {
+ /* Set avt and evt to system virtual time. */
+ inf->avt = CPU_SVT(p->processor);
+ inf->evt = CPU_SVT(p->processor);
+ /* Set some default values here. */
+ inf->warpback = 0;
+ inf->warp = 0;
+ inf->warpl = 0;
+ inf->warpu = 0;
+ }
+
+ return;
+}
+
+/**
+ * bvt_free_task - free BVT private structures for a task
+ * @p: task
+ */
+void bvt_free_task(struct task_struct *p)
+{
+ ASSERT( p->sched_priv != NULL );
+ kmem_cache_free( dom_info_cache, p->sched_priv );
+}
+
+
+void bvt_wake_up(struct task_struct *p)
+{
+ struct bvt_dom_info *inf = DOM_INF(p);
+
+ ASSERT(inf != NULL);
+
+
+ /* set the BVT parameters */
+ if (inf->avt < CPU_SVT(p->processor))
+ inf->avt = CPU_SVT(p->processor);
+
+ /* deal with warping here */
+ inf->warpback = 1;
+ inf->warped = NOW();
+ __calc_evt(inf);
+ __add_to_runqueue_head(p);
+}
+
+/*
+ * Block the currently-executing domain until a pertinent event occurs.
+ */
+static long bvt_do_block(struct task_struct *p)
+{
+ DOM_INF(p)->warpback = 0;
+ return 0;
+}
+
+/* Control the scheduler. */
+int bvt_ctl(struct sched_ctl_cmd *cmd)
+{
+ struct bvt_ctl *params = &cmd->u.bvt;
+
+ ctx_allow = params->ctx_allow;
+
+ return 0;
+}
+
+/* Adjust scheduling parameter for a given domain. */
+int bvt_adjdom(struct task_struct *p,
+ struct sched_adjdom_cmd *cmd)
+{
+ struct bvt_adjdom *params = &cmd->u.bvt;
+ unsigned long mcu_adv = params->mcu_adv,
+ warp = params->warp,
+ warpl = params->warpl,
+ warpu = params->warpu;
+
+ struct bvt_dom_info *inf = DOM_INF(p);
+
+ /* Sanity -- this can avoid divide-by-zero. */
+ if ( mcu_adv == 0 )
+ return -EINVAL;
+
+ spin_lock_irq(&schedule_lock[p->processor]);
+ inf->mcu_advance = mcu_adv;
+ inf->warp = warp;
+ inf->warpl = warpl;
+ inf->warpu = warpu;
+ spin_unlock_irq(&schedule_lock[p->processor]);
+
+ return 0;
+}
+
+
+/*
+ * The main function
+ * - deschedule the current domain.
+ * - pick a new domain.
+ * i.e., the domain with lowest EVT.
+ * The runqueue should be ordered by EVT so that is easy.
+ */
+static task_slice_t bvt_do_schedule(s_time_t now)
+{
+ struct task_struct *prev = current, *next = NULL, *next_prime, *p;
+ struct list_head *tmp;
+ int cpu = prev->processor;
+ s32 r_time; /* time for new dom to run */
+ s32 ranfor; /* assume we never run longer than 2.1s! */
+ s32 mcus;
+ u32 next_evt, next_prime_evt, min_avt;
+ struct bvt_dom_info *prev_inf = DOM_INF(prev),
+ *p_inf = NULL,
+ *next_inf = NULL,
+ *next_prime_inf = NULL;
+ task_slice_t ret;
+
+ ASSERT(prev->sched_priv != NULL);
+ ASSERT(prev_inf != NULL);
+
+ if ( likely(!is_idle_task(prev)) )
+ {
+ ranfor = (s32)(now - prev->lastschd);
+ /* Calculate mcu and update avt. */
+ mcus = (ranfor + MCU - 1) / MCU;
+ prev_inf->avt += mcus * prev_inf->mcu_advance;
+
+ __calc_evt(prev_inf);
+
+ __del_from_runqueue(prev);
+
+ if ( likely(prev->state == TASK_RUNNING) )
+ __add_to_runqueue_tail(prev);
+ }
+
+ /* We should at least have the idle task */
+ ASSERT(!list_empty(&schedule_data[cpu].runqueue));
+
+ /*
+ * scan through the run queue and pick the task with the lowest evt
+ * *and* the task the second lowest evt.
+ * this code is O(n) but we expect n to be small.
+ */
+ next = schedule_data[cpu].idle;
+ next_prime = NULL;
+
+ next_evt = ~0U;
+ next_prime_evt = ~0U;
+ min_avt = ~0U;
+
+ list_for_each ( tmp, &schedule_data[cpu].runqueue )
+ {
+ p = list_entry(tmp, struct task_struct, run_list);
+ p_inf = DOM_INF(p);
+
+ if ( p_inf->evt < next_evt )
+ {
+ next_prime = next;
+ next_prime_evt = next_evt;
+ next = p;
+ next_evt = p_inf->evt;
+ }
+ else if ( next_prime_evt == ~0U )
+ {
+ next_prime_evt = p_inf->evt;
+ next_prime = p;
+ }
+ else if ( p_inf->evt < next_prime_evt )
+ {
+ next_prime_evt = p_inf->evt;
+ next_prime = p;
+ }
+
+ /* Determine system virtual time. */
+ if ( p_inf->avt < min_avt )
+ min_avt = p_inf->avt;
+ }
+
+ /* Update system virtual time. */
+ if ( min_avt != ~0U )
+ CPU_SVT(cpu) = min_avt;
+
+ /* check for virtual time overrun on this cpu */
+ if ( CPU_SVT(cpu) >= 0xf0000000 )
+ {
+ u_long t_flags;
+ write_lock_irqsave(&tasklist_lock, t_flags);
+ for_each_domain ( p )
+ {
+        if ( p->processor == cpu )
+        {
+            p_inf = DOM_INF(p);
+            p_inf->evt -= 0xe0000000;
+            p_inf->avt -= 0xe0000000;
+ }
+ }
+ write_unlock_irqrestore(&tasklist_lock, t_flags);
+ CPU_SVT(cpu) -= 0xe0000000;
+ }
+
+ /* work out time for next run through scheduler */
+ if ( is_idle_task(next) )
+ {
+ r_time = ctx_allow;
+ goto sched_done;
+ }
+
+ if ( (next_prime == NULL) || is_idle_task(next_prime) )
+ {
+ /* We have only one runnable task besides the idle task. */
+ r_time = 10 * ctx_allow; /* RN: random constant */
+ goto sched_done;
+ }
+
+ next_prime_inf = DOM_INF(next_prime);
+ next_inf = DOM_INF(next);
+
+ /*
+ * If we are here then we have two runnable tasks.
+ * Work out how long 'next' can run till its evt is greater than
+ * 'next_prime's evt. Take context switch allowance into account.
+ */
+ ASSERT(next_prime_inf->evt >= next_inf->evt);
+
+ r_time = ((next_prime_inf->evt - next_inf->evt)/next_inf->mcu_advance)
+ + ctx_allow;
+
+ ASSERT(r_time >= ctx_allow);
+
+ sched_done:
+ next->min_slice = ctx_allow;
+ ret.task = next;
+ ret.time = r_time;
+
+ return ret;
+}
+
+
+static void bvt_dump_runq_el(struct task_struct *p)
+{
+ struct bvt_dom_info *inf = DOM_INF(p);
+
+ printk("mcua=0x%04lX ev=0x%08X av=0x%08X ",
+ inf->mcu_advance, inf->evt, inf->avt);
+}
+
+static void bvt_dump_settings(void)
+{
+ printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns ", (u32)MCU, (s32)ctx_allow );
+}
+
+static void bvt_dump_cpu_state(int i)
+{
+ printk("svt=0x%08lX ", CPU_SVT(i));
+}
+
+
+/* Initialise the data structures. */
+int bvt_init_scheduler()
+{
+ int i;
+
+ for ( i = 0; i < NR_CPUS; i++ )
+ {
+ CPU_INF(i) = kmalloc(sizeof(struct bvt_cpu_info), GFP_KERNEL);
+
+ if ( CPU_INF(i) == NULL )
+ {
+ printk("Failed to allocate BVT scheduler private per-CPU memory!\n");
+ return -1;
+ }
+
+ CPU_SVT(i) = 0; /* XXX do I really need to do this? */
+ }
+
+ dom_info_cache = kmem_cache_create("BVT dom info",
+ sizeof(struct bvt_dom_info),
+ 0, 0, NULL, NULL);
+
+ if ( dom_info_cache == NULL )
+ {
+ printk("BVT: Failed to allocate domain info SLAB cache");
+ return -1;
+ }
+
+ return 0;
+}
+
+
+struct scheduler sched_bvt_def = {
+ .name = "Borrowed Virtual Time",
+ .opt_name = "bvt",
+ .sched_id = SCHED_BVT,
+
+ .init_scheduler = bvt_init_scheduler,
+ .alloc_task = bvt_alloc_task,
+ .add_task = bvt_add_task,
+ .free_task = bvt_free_task,
+ .wake_up = bvt_wake_up,
+ .do_block = bvt_do_block,
+ .do_schedule = bvt_do_schedule,
+ .control = bvt_ctl,
+ .adjdom = bvt_adjdom,
+ .dump_settings = bvt_dump_settings,
+ .dump_cpu_state = bvt_dump_cpu_state,
+ .dump_runq_el = bvt_dump_runq_el,
+};
+
diff --git a/xen/common/sched_rrobin.c b/xen/common/sched_rrobin.c
new file mode 100644
index 0000000000..ef4db60066
--- /dev/null
+++ b/xen/common/sched_rrobin.c
@@ -0,0 +1,56 @@
+/****************************************************************************
+ * Very stupid Round Robin Scheduler for Xen
+ *
+ * by Mark Williamson (C) 2004 Intel Research Cambridge
+ */
+
+#include <xeno/sched.h>
+#include <xeno/sched-if.h>
+#include <hypervisor-ifs/sched-ctl.h>
+#include <xeno/ac_timer.h>
+#include <xeno/time.h>
+
+static s_time_t rr_slice = MILLISECS(10);
+
+static task_slice_t rr_do_schedule(s_time_t now)
+{
+ struct task_struct *prev = current;
+ int cpu = current->processor;
+ task_slice_t ret;
+
+ __del_from_runqueue(prev);
+
+ if ( prev->state == TASK_RUNNING )
+ __add_to_runqueue_tail(prev);
+
+ ret.task = list_entry(schedule_data[cpu].runqueue.next,
+ struct task_struct, run_list);
+
+ ret.time = rr_slice;
+
+ return ret;
+}
+
+static int rr_ctl(struct sched_ctl_cmd *cmd)
+{
+ rr_slice = cmd->u.rrobin.slice;
+ return 0;
+}
+
+static void rr_dump_settings()
+{
+ printk("rr_slice = %llu ", rr_slice);
+}
+
+struct scheduler sched_rrobin_def = {
+ .name = "Stupid Round Robin Scheduler",
+ .opt_name = "rrobin",
+ .sched_id = SCHED_RROBIN,
+
+ .wake_up = __add_to_runqueue_head,
+ .do_schedule = rr_do_schedule,
+ .control = rr_ctl,
+ .dump_settings = rr_dump_settings,
+};
+
+
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 5487f15e5a..054123077c 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -2,14 +2,16 @@
****************************************************************************
* (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
* (C) 2002-2003 University of Cambridge
+ * (C) 2004 - Mark Williamson - Intel Research Cambridge
****************************************************************************
*
* File: common/schedule.c
* Author: Rolf Neugebauer & Keir Fraser
+ * Updated for generic API by Mark Williamson
*
- * Description: CPU scheduling
- * implements A Borrowed Virtual Time scheduler.
- * (see Duda & Cheriton SOSP'99)
+ * Description: Generic CPU scheduling code
+ * implements support functionality for the Xen scheduler API.
+ *
*/
#include <xeno/config.h>
@@ -23,6 +25,9 @@
#include <xeno/interrupt.h>
#include <xeno/timer.h>
#include <xeno/perfc.h>
+#include <xeno/sched-if.h>
+#include <hypervisor-ifs/sched-ctl.h>
+#include <xeno/trace.h>
/*#define WAKEUP_HISTO*/
/*#define BLOCKTIME_HISTO*/
@@ -33,23 +38,54 @@
#define BUCKETS 200
#endif
-#define MCU (s32)MICROSECS(100) /* Minimum unit */
-#define MCU_ADVANCE 10 /* default weight */
#define TIME_SLOP (s32)MICROSECS(50) /* allow time to slip a bit */
-static s32 ctx_allow = (s32)MILLISECS(5); /* context switch allowance */
-typedef struct schedule_data_st
-{
- struct list_head runqueue; /* runqueue */
- struct task_struct *curr; /* current task */
- struct task_struct *idle; /* idle task for this cpu */
- u32 svt; /* system virtual time. per CPU??? */
- struct ac_timer s_timer; /* scheduling timer */
-#ifdef BUCKETS
- u32 hist[BUCKETS]; /* for scheduler latency histogram */
-#endif
-} __cacheline_aligned schedule_data_t;
-static schedule_data_t schedule_data[NR_CPUS];
+/* XXX MAW pull trace-related #defines out of here and into an auto-generated
+ * header file later on! */
+#define TRC_SCHED_DOM_ADD 0x00010000
+#define TRC_SCHED_DOM_REM 0x00010001
+#define TRC_SCHED_WAKE 0x00010002
+#define TRC_SCHED_BLOCK 0x00010003
+#define TRC_SCHED_YIELD 0x00010004
+#define TRC_SCHED_SET_TIMER 0x00010005
+#define TRC_SCHED_CTL 0x00010006
+#define TRC_SCHED_ADJDOM 0x00010007
+#define TRC_SCHED_RESCHED 0x00010008
+#define TRC_SCHED_SWITCH 0x00010009
+#define TRC_SCHED_S_TIMER_FN 0x0001000A
+#define TRC_SCHED_T_TIMER_FN 0x0001000B
+#define TRC_SCHED_DOM_TIMER_FN 0x0001000C
+#define TRC_SCHED_FALLBACK_TIMER_FN 0x0001000D
+
+#define _HIGH32(_x) (_x >> 32)
+#define _LOW32(_x) ((u32)_x )
+
+/* Various timer handlers. */
+static void s_timer_fn(unsigned long unused);
+static void t_timer_fn(unsigned long unused);
+static void dom_timer_fn(unsigned long data);
+static void fallback_timer_fn(unsigned long unused);
+
+/* this is global for now so that private implementations can reach it */
+schedule_data_t schedule_data[NR_CPUS];
+
+/* XXX would be nice if the schedulers array could get populated
+ * automagically without having to hack the code in here */
+extern struct scheduler sched_bvt_def, sched_rrobin_def;
+static struct scheduler *schedulers[] = { &sched_bvt_def,
+ &sched_rrobin_def,
+ NULL};
+
+/* scheduler ops for the current scheduler */
+static struct scheduler ops;
+
+/* for scheduler functions that return void */
+#define SCHED_FN_VOID(fn, ...) do { if ( ops.fn ) ops.fn(__VA_ARGS__); } \
+ while (0)
+
+/* for scheduler functions that return a numeric value */
+#define SCHED_FN_RET(fn, ...) \
+ (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ ) : 0 )
spinlock_t schedule_lock[NR_CPUS] __cacheline_aligned;
@@ -62,110 +98,78 @@ static struct ac_timer t_timer[NR_CPUS];
*/
static struct ac_timer fallback_timer[NR_CPUS];
-/* Various timer handlers. */
-static void s_timer_fn(unsigned long unused);
-static void t_timer_fn(unsigned long unused);
-static void dom_timer_fn(unsigned long data);
-static void fallback_timer_fn(unsigned long unused);
-
-/*
- * Wrappers for run-queue management. Must be called with the schedule_lock
- * held.
- */
-static inline void __add_to_runqueue_head(struct task_struct * p)
-{
- list_add(&p->run_list, &schedule_data[p->processor].runqueue);
-}
+extern kmem_cache_t *task_struct_cachep;
-static inline void __add_to_runqueue_tail(struct task_struct * p)
+void free_task_struct(struct task_struct *p)
{
- list_add_tail(&p->run_list, &schedule_data[p->processor].runqueue);
+ SCHED_FN_VOID(free_task, p);
+ kmem_cache_free(task_struct_cachep, p);
}
-static inline void __del_from_runqueue(struct task_struct * p)
+/**
+ * alloc_task_struct - allocate a new task_struct and sched private structures
+ */
+struct task_struct *alloc_task_struct(void)
{
- list_del(&p->run_list);
- p->run_list.next = NULL;
-}
+ struct task_struct *p;
-static inline int __task_on_runqueue(struct task_struct *p)
-{
- return p->run_list.next != NULL;
-}
+ p=((struct task_struct *)kmem_cache_alloc(task_struct_cachep,GFP_KERNEL));
-#define next_domain(p) \
- list_entry((p)->run_list.next, struct task_struct, run_list)
+ if ( p == NULL )
+ return NULL;
-/*
- * Calculate the effective virtual time for a domain. Take into account
- * warping limits
- */
-static void __calc_evt(struct task_struct *p)
-{
- s_time_t now = NOW();
- if ( p->warpback )
- {
- if ( ((now - p->warped) < p->warpl) &&
- ((now - p->uwarped) > p->warpu) )
- {
- /* allowed to warp */
- p->evt = p->avt - p->warp;
- }
- else
- {
- /* warped for too long -> unwarp */
- p->evt = p->avt;
- p->uwarped = now;
- p->warpback = 0;
- }
- }
- else
+ memset(p, 0, sizeof(*p));
+
+ if ( SCHED_FN_RET(alloc_task, p) < 0)
{
- p->evt = p->avt;
+ kmem_cache_free(task_struct_cachep, p);
+ return NULL;
}
+
+ return p;
}
-
/*
* Add and remove a domain
*/
void sched_add_domain(struct task_struct *p)
{
p->state = TASK_STOPPED;
- p->mcu_advance = MCU_ADVANCE;
- if ( p->domain == IDLE_DOMAIN_ID )
- {
- p->avt = p->evt = ~0U;
- schedule_data[p->processor].idle = p;
- }
- else
+ if( p->domain != IDLE_DOMAIN_ID )
{
- /* Set avt end evt to system virtual time. */
- p->avt = schedule_data[p->processor].svt;
- p->evt = schedule_data[p->processor].svt;
- /* Set some default values here. */
- p->warpback = 0;
- p->warp = 0;
- p->warpl = 0;
- p->warpu = 0;
-
/* Initialise the per-domain timer. */
init_ac_timer(&p->timer);
p->timer.cpu = p->processor;
p->timer.data = (unsigned long)p;
p->timer.function = &dom_timer_fn;
-
}
+ else
+ {
+ schedule_data[p->processor].idle = p;
+ }
+
+ SCHED_FN_VOID(add_task, p);
+
+ TRACE_3D(TRC_SCHED_DOM_ADD, _HIGH32(p->domain), _LOW32(p->domain), p);
}
+/* XXX race condition here? we could both add and remove a domain at once, in
+ * theory. ick! */
+/* XXX is the task already removed from the runlist at this point? */
int sched_rem_domain(struct task_struct *p)
{
int x, y = p->state;
do {
if ( (x = y) == TASK_DYING ) return 0;
} while ( (y = cmpxchg(&p->state, x, TASK_DYING)) != x );
+
rem_ac_timer(&p->timer);
+
+ SCHED_FN_VOID(rem_task, p);
+
+ TRACE_3D(TRC_SCHED_DOM_REM, _HIGH32(p->domain), _LOW32(p->domain), p);
+
return 1;
}
@@ -174,6 +178,11 @@ void init_idle_task(void)
{
unsigned long flags;
struct task_struct *p = current;
+
+ if ( SCHED_FN_RET (alloc_task, p) < 0)
+ panic("Failed to allocate scheduler private data for idle task");
+ SCHED_FN_VOID(add_task, p);
+
spin_lock_irqsave(&schedule_lock[p->processor], flags);
p->has_cpu = 1;
p->state = TASK_RUNNING;
@@ -182,31 +191,25 @@ void init_idle_task(void)
spin_unlock_irqrestore(&schedule_lock[p->processor], flags);
}
-
void __wake_up(struct task_struct *p)
{
+ TRACE_3D(TRC_SCHED_WAKE, _HIGH32(p->domain), _LOW32(p->domain), p);
+
ASSERT(p->state != TASK_DYING);
if ( unlikely(__task_on_runqueue(p)) )
return;
p->state = TASK_RUNNING;
- __add_to_runqueue_head(p);
-
- /* set the BVT parameters */
- if (p->avt < schedule_data[p->processor].svt)
- p->avt = schedule_data[p->processor].svt;
- /* deal with warping here */
- p->warpback = 1;
- p->warped = NOW();
- __calc_evt(p);
+ SCHED_FN_VOID(wake_up, p);
#ifdef WAKEUP_HISTO
p->wokenup = NOW();
#endif
}
+
void wake_up(struct task_struct *p)
{
unsigned long flags;
@@ -220,9 +223,10 @@ void wake_up(struct task_struct *p)
*/
static long do_block(void)
{
+ ASSERT(current->domain != IDLE_DOMAIN_ID);
set_bit(EVENTS_MASTER_ENABLE_BIT, &current->shared_info->events_mask);
current->state = TASK_INTERRUPTIBLE;
- current->warpback = 0;
+ TRACE_2D(TRC_SCHED_BLOCK, current->domain, current);
__enter_scheduler();
return 0;
}
@@ -232,6 +236,7 @@ static long do_block(void)
*/
static long do_yield(void)
{
+ TRACE_2D(TRC_SCHED_YIELD, current->domain, current);
__enter_scheduler();
return 0;
}
@@ -295,37 +300,56 @@ long do_set_timer_op(unsigned long timeout_hi, unsigned long timeout_lo)
add_ac_timer(&p->timer);
}
+ TRACE_5D(TRC_SCHED_SET_TIMER, _HIGH32(p->domain), _LOW32(p->domain),
+ p, timeout_hi, timeout_lo);
+
return 0;
}
-/* Control the scheduler. */
-long sched_bvtctl(unsigned long c_allow)
+/**
+ * sched_ctl - dispatch a scheduler control operation
+ * @cmd: the command passed in the dom0 op
+ *
+ * Given a generic scheduler control operation, call the control function for
+ * the scheduler in use, passing the appropriate control information from the
+ * union supplied.
+ */
+long sched_ctl(struct sched_ctl_cmd *cmd)
{
- ctx_allow = c_allow;
- return 0;
+ TRACE_0D(TRC_SCHED_CTL);
+
+ if ( cmd->if_ver != SCHED_CTL_IF_VER )
+ return -EACCES;
+
+ if ( cmd->sched_id != ops.sched_id )
+ return -EINVAL;
+
+ return SCHED_FN_RET(control, cmd);
}
+
/* Adjust scheduling parameter for a given domain. */
-long sched_adjdom(domid_t dom, unsigned long mcu_adv, unsigned long warp,
- unsigned long warpl, unsigned long warpu)
+long sched_adjdom(struct sched_adjdom_cmd *cmd)
{
- struct task_struct *p;
+ struct task_struct *p;
+
+ if ( cmd->if_ver != SCHED_CTL_IF_VER )
+ return -EACCES;
- /* Sanity -- this can avoid divide-by-zero. */
- if ( mcu_adv == 0 )
+ if ( cmd->sched_id != ops.sched_id )
return -EINVAL;
- p = find_domain_by_id(dom);
- if ( p == NULL )
+ p = find_domain_by_id(cmd->domain);
+
+ if( p == NULL )
return -ESRCH;
- spin_lock_irq(&schedule_lock[p->processor]);
- p->mcu_advance = mcu_adv;
- spin_unlock_irq(&schedule_lock[p->processor]);
+ TRACE_2D(TRC_SCHED_ADJDOM, _HIGH32(p->domain), _LOW32(p->domain));
- put_task_struct(p);
+ SCHED_FN_VOID(adjdom, p, cmd);
+ put_task_struct(p);
return 0;
}
@@ -339,17 +363,19 @@ long sched_adjdom(domid_t dom, unsigned long mcu_adv, unsigned long warp,
*/
unsigned long __reschedule(struct task_struct *p)
{
- int cpu = p->processor;
+ int cpu = p->processor;
struct task_struct *curr;
s_time_t now, min_time;
+ TRACE_3D(TRC_SCHED_RESCHED, _HIGH32(p->domain), _LOW32(p->domain), p);
+
if ( unlikely(p->has_cpu || !__task_on_runqueue(p)) )
return 0;
now = NOW();
curr = schedule_data[cpu].curr;
/* domain should run at least for ctx_allow */
- min_time = curr->lastschd + ctx_allow;
+ min_time = curr->lastschd + curr->min_slice;
if ( is_idle_task(curr) || (min_time <= now) )
{
@@ -362,161 +388,67 @@ unsigned long __reschedule(struct task_struct *p)
if ( schedule_data[cpu].s_timer.expires > min_time + TIME_SLOP )
mod_ac_timer(&schedule_data[cpu].s_timer, min_time);
- return 0;
+ return SCHED_FN_RET(reschedule, p);
}
-
void reschedule(struct task_struct *p)
{
unsigned long flags, cpu_mask;
+
spin_lock_irqsave(&schedule_lock[p->processor], flags);
cpu_mask = __reschedule(p);
+
spin_unlock_irqrestore(&schedule_lock[p->processor], flags);
hyp_event_notify(cpu_mask);
}
-
/*
* The main function
- * - deschedule the current domain.
- * - pick a new domain.
- * i.e., the domain with lowest EVT.
- * The runqueue should be ordered by EVT so that is easy.
+ * - deschedule the current domain (scheduler independent).
+ * - pick a new domain (scheduler dependent).
*/
asmlinkage void __enter_scheduler(void)
{
- struct task_struct *prev = current, *next = NULL, *next_prime, *p;
- struct list_head *tmp;
+ struct task_struct *prev = current, *next = NULL;
int cpu = prev->processor;
s_time_t now;
+ task_slice_t next_slice;
s32 r_time; /* time for new dom to run */
- s32 ranfor; /* assume we never run longer than 2.1s! */
- s32 mcus;
- u32 next_evt, next_prime_evt, min_avt;
perfc_incrc(sched_run);
+ clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events);
+
spin_lock_irq(&schedule_lock[cpu]);
now = NOW();
rem_ac_timer(&schedule_data[cpu].s_timer);
-
+
ASSERT(!in_interrupt());
ASSERT(__task_on_runqueue(prev));
ASSERT(prev->state != TASK_UNINTERRUPTIBLE);
+ ASSERT(prev != NULL);
- if ( likely(!is_idle_task(prev)) )
+ if ( prev->state == TASK_INTERRUPTIBLE )
{
- ranfor = (s32)(now - prev->lastschd);
- prev->cpu_time += ranfor;
-
- /* Calculate mcu and update avt. */
- mcus = (ranfor + MCU - 1) / MCU;
- prev->avt += mcus * prev->mcu_advance;
-
- __calc_evt(prev);
-
- __del_from_runqueue(prev);
-
- if ( likely(prev->state == TASK_RUNNING) ||
- unlikely((prev->state == TASK_INTERRUPTIBLE) &&
- signal_pending(prev)) )
- {
+ /* this check is needed to avoid a race condition */
+ if ( signal_pending(prev) )
prev->state = TASK_RUNNING;
- __add_to_runqueue_tail(prev);
- }
+ else
+ SCHED_FN_VOID(do_block, prev);
}
- clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events);
-
- /* We should at least have the idle task */
- ASSERT(!list_empty(&schedule_data[cpu].runqueue));
-
- /*
- * scan through the run queue and pick the task with the lowest evt
- * *and* the task the second lowest evt.
- * this code is O(n) but we expect n to be small.
- */
- next = schedule_data[cpu].idle;
- next_prime = NULL;
-
- next_evt = ~0U;
- next_prime_evt = ~0U;
- min_avt = ~0U;
-
- list_for_each ( tmp, &schedule_data[cpu].runqueue )
- {
- p = list_entry(tmp, struct task_struct, run_list);
- if ( p->evt < next_evt )
- {
- next_prime = next;
- next_prime_evt = next_evt;
- next = p;
- next_evt = p->evt;
- }
- else if ( next_prime_evt == ~0U )
- {
- next_prime_evt = p->evt;
- next_prime = p;
- }
- else if ( p->evt < next_prime_evt )
- {
- next_prime_evt = p->evt;
- next_prime = p;
- }
-
- /* Determine system virtual time. */
- if ( p->avt < min_avt )
- min_avt = p->avt;
- }
+ /* get policy-specific decision on scheduling... */
+ next_slice = ops.do_schedule(now);
- /* Update system virtual time. */
- if ( min_avt != ~0U )
- schedule_data[cpu].svt = min_avt;
+ r_time = next_slice.time;
+ next = next_slice.task;
- /* check for virtual time overrun on this cpu */
- if ( schedule_data[cpu].svt >= 0xf0000000 )
- {
- u_long t_flags;
- write_lock_irqsave(&tasklist_lock, t_flags);
- for_each_domain ( p )
- {
- if ( p->processor == cpu )
- {
- p->evt -= 0xe0000000;
- p->avt -= 0xe0000000;
- }
- }
- write_unlock_irqrestore(&tasklist_lock, t_flags);
- schedule_data[cpu].svt -= 0xe0000000;
- }
-
- /* work out time for next run through scheduler */
- if ( is_idle_task(next) )
- {
- r_time = ctx_allow;
- goto sched_done;
- }
-
- if ( (next_prime == NULL) || is_idle_task(next_prime) )
- {
- /* We have only one runnable task besides the idle task. */
- r_time = 10 * ctx_allow; /* RN: random constant */
- goto sched_done;
- }
-
- /*
- * If we are here then we have two runnable tasks.
- * Work out how long 'next' can run till its evt is greater than
- * 'next_prime's evt. Take context switch allowance into account.
- */
- ASSERT(next_prime->evt >= next->evt);
-
- r_time = ((next_prime->evt - next->evt)/next->mcu_advance) + ctx_allow;
+ if ( likely(!is_idle_task(prev)) )
+ prev->cpu_time += (now - prev->lastschd);
- sched_done:
- ASSERT(r_time >= ctx_allow);
+ /* now, switch to the new task... */
prev->has_cpu = 0;
next->has_cpu = 1;
@@ -537,7 +469,7 @@ asmlinkage void __enter_scheduler(void)
if ( unlikely(prev == next) )
return;
-
+
perfc_incrc(sched_ctx);
#if defined(WAKEUP_HISTO)
@@ -558,6 +490,10 @@ asmlinkage void __enter_scheduler(void)
}
#endif
+ TRACE_2D(TRC_SCHED_SWITCH, next->domain, next);
+
+ ASSERT(next->processor == current->processor);
+
switch_to(prev, next);
if ( unlikely(prev->state == TASK_DYING) )
@@ -591,6 +527,8 @@ int idle_cpu(int cpu)
/* The scheduler timer: force a run through the scheduler*/
static void s_timer_fn(unsigned long unused)
{
+ TRACE_0D(TRC_SCHED_S_TIMER_FN);
+
set_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events);
perfc_incrc(sched_irq);
}
@@ -600,6 +538,8 @@ static void t_timer_fn(unsigned long unused)
{
struct task_struct *p = current;
+ TRACE_0D(TRC_SCHED_T_TIMER_FN);
+
if ( !is_idle_task(p) )
set_bit(_EVENT_TIMER, &p->shared_info->events);
@@ -613,6 +553,8 @@ static void dom_timer_fn(unsigned long data)
unsigned long cpu_mask = 0;
struct task_struct *p = (struct task_struct *)data;
+ TRACE_0D(TRC_SCHED_DOM_TIMER_FN);
+
cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
guest_event_notify(cpu_mask);
}
@@ -623,6 +565,8 @@ static void fallback_timer_fn(unsigned long unused)
{
struct task_struct *p = current;
+ TRACE_0D(TRC_SCHED_FALLBACK_TIMER_FN);
+
if ( !is_idle_task(p) )
update_dom_time(p->shared_info);
@@ -660,6 +604,29 @@ void __init scheduler_init(void)
}
schedule_data[0].idle = &idle0_task;
+
+ extern char opt_sched[];
+
+ for ( i = 0; schedulers[i] != NULL; i++ )
+ {
+ ops = *schedulers[i]; /* fetch operations structure */
+
+ if(strcmp(ops.opt_name, opt_sched) == 0)
+ break;
+ }
+
+ if ( schedulers[i] == NULL )
+ printk("Could not find scheduler: %s\n", opt_sched);
+
+ printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
+
+ if ( ops.do_schedule == NULL)
+ panic("Chosen scheduler has NULL do_schedule!");
+
+ if ( SCHED_FN_RET(init_scheduler) < 0 )
+ panic("Initialising scheduler failed!");
+
+ SCHED_FN_VOID(add_task, &idle0_task);
}
/*
@@ -698,12 +665,9 @@ static void dump_rqueue(struct list_head *queue, char *name)
(unsigned long) queue->next, (unsigned long) queue->prev);
list_for_each (list, queue) {
p = list_entry(list, struct task_struct, run_list);
- printk("%3d: %llu has=%c mcua=0x%04lX"
- " ev=0x%08X av=0x%08X c=0x%X%08X\n",
- loop++, p->domain,
- p->has_cpu ? 'T':'F',
- p->mcu_advance, p->evt, p->avt,
- (u32)(p->cpu_time>>32), (u32)p->cpu_time);
+ printk("%3d: %llu has=%c ", loop++, p->domain, p->has_cpu ? 'T':'F');
+ SCHED_FN_VOID(dump_runq_el, p);
+ printk("c=0x%X%08X\n", (u32)(p->cpu_time>>32), (u32)p->cpu_time);
printk(" l: %lx n: %lx p: %lx\n",
(unsigned long)list, (unsigned long)list->next,
(unsigned long)list->prev);
@@ -717,11 +681,13 @@ void dump_runq(u_char key, void *dev_id, struct pt_regs *regs)
s_time_t now = NOW();
int i;
- printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns NOW=0x%08X%08X\n",
- (u32)MCU, (u32)ctx_allow, (u32)(now>>32), (u32)now);
+ printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
+ SCHED_FN_VOID(dump_settings);
+ printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now);
for (i = 0; i < smp_num_cpus; i++) {
spin_lock_irqsave(&schedule_lock[i], flags);
- printk("CPU[%02d] svt=0x%08X ", i, (s32)schedule_data[i].svt);
+ printk("CPU[%02d] ", i);
+ SCHED_FN_VOID(dump_cpu_state,i);
dump_rqueue(&schedule_data[i].runqueue, "rq");
spin_unlock_irqrestore(&schedule_lock[i], flags);
}
diff --git a/xen/include/hypervisor-ifs/dom0_ops.h b/xen/include/hypervisor-ifs/dom0_ops.h
index 8e84f38336..d06a9f641a 100644
--- a/xen/include/hypervisor-ifs/dom0_ops.h
+++ b/xen/include/hypervisor-ifs/dom0_ops.h
@@ -11,13 +11,14 @@
#define __DOM0_OPS_H__
#include "hypervisor-if.h"
+#include "sched-ctl.h"
/*
* Make sure you increment the interface version whenever you modify this file!
* This makes sure that old versions of dom0 tools will stop working in a
* well-defined way (rather than crashing the machine, for instance).
*/
-#define DOM0_INTERFACE_VERSION 0xAAAA0008
+#define DOM0_INTERFACE_VERSION 0xAAAA0009
#define MAX_CMD_LEN 256
#define MAX_DOMAIN_NAME 16
@@ -74,23 +75,13 @@ typedef struct dom0_builddomain_st
full_execution_context_t ctxt;
} dom0_builddomain_t;
-#define DOM0_BVTCTL 6
-typedef struct dom0_bvtctl_st
-{
- /* IN variables. */
- unsigned long ctx_allow; /* context switch allowance */
-} dom0_bvtctl_t;
+#define DOM0_SCHEDCTL 6
+ /* struct sched_ctl_cmd is from sched-ctl.h */
+typedef struct sched_ctl_cmd dom0_schedctl_t;
#define DOM0_ADJUSTDOM 7
-typedef struct dom0_adjustdom_st
-{
- /* IN variables. */
- domid_t domain; /* domain id */
- unsigned long mcu_adv; /* mcu advance: inverse of weight */
- unsigned long warp; /* time warp */
- unsigned long warpl; /* warp limit */
- unsigned long warpu; /* unwarp time requirement */
-} dom0_adjustdom_t;
+/* struct sched_adjdom_cmd is from sched-ctl.h */
+typedef struct sched_adjdom_cmd dom0_adjustdom_t;
#define DOM0_GETDOMAININFO 12
typedef struct dom0_getdomaininfo_st
@@ -234,7 +225,7 @@ typedef struct dom0_op_st
dom0_stopdomain_t stopdomain;
dom0_destroydomain_t destroydomain;
dom0_getmemlist_t getmemlist;
- dom0_bvtctl_t bvtctl;
+ dom0_schedctl_t schedctl;
dom0_adjustdom_t adjustdom;
dom0_builddomain_t builddomain;
dom0_getdomaininfo_t getdomaininfo;
diff --git a/xen/include/hypervisor-ifs/sched-ctl.h b/xen/include/hypervisor-ifs/sched-ctl.h
new file mode 100644
index 0000000000..bb0449908a
--- /dev/null
+++ b/xen/include/hypervisor-ifs/sched-ctl.h
@@ -0,0 +1,68 @@
+/**
+ * Generic scheduler control interface.
+ *
+ * Mark Williamson, (C) 2004 Intel Research Cambridge
+ */
+
+#ifndef _SCHED_CTL_H_
+#define _SCHED_CTL_H_
+
+/**
+ * When this file is changed, increment the version number. This ensures that
+ * tools will refuse to work (rather than causing a crash) when they're
+ * out-of-sync with the Xen version number.
+ */
+#define SCHED_CTL_IF_VER 0x0001
+
+/* scheduler types */
+#define SCHED_BVT 0
+#define SCHED_ATROPOS 1
+#define SCHED_RROBIN 2
+
+/* generic scheduler control command - union of all scheduler control
+ * command structures */
+struct sched_ctl_cmd
+{
+ unsigned int if_ver;
+ unsigned int sched_id;
+
+ union
+ {
+ struct bvt_ctl
+ {
+ /* IN variables. */
+ unsigned long ctx_allow; /* context switch allowance */
+ } bvt;
+
+ struct rrobin_ctl
+ {
+ /* IN variables */
+ u64 slice; /* round robin time slice */
+ } rrobin;
+ } u;
+};
+
+struct sched_adjdom_cmd
+{
+ unsigned int if_ver;
+ unsigned int sched_id;
+ domid_t domain;
+
+ union
+ {
+ struct bvt_adjdom
+ {
+ unsigned long mcu_adv; /* mcu advance: inverse of weight */
+ unsigned long warp; /* time warp */
+ unsigned long warpl; /* warp limit */
+ unsigned long warpu; /* unwarp time requirement */
+ } bvt;
+
+ struct atropos_adjdom
+ {
+ int xtratime;
+ } atropos;
+ } u;
+};
+
+#endif /* _SCHED_CTL_H_ */
diff --git a/xen/include/xeno/sched-if.h b/xen/include/xeno/sched-if.h
new file mode 100644
index 0000000000..683e73d4f6
--- /dev/null
+++ b/xen/include/xeno/sched-if.h
@@ -0,0 +1,90 @@
+#include <asm/types.h>
+
+/*
+ * Additional declarations for the generic scheduler interface. This should
+ * only be included by files that implement conforming schedulers.
+ *
+ * Portions by Mark Williamson are (C) 2004 Intel Research Cambridge
+ */
+
+#define BUCKETS 10
+
+typedef struct schedule_data_st
+{
+ struct list_head runqueue; /* runqueue */
+ struct task_struct *curr; /* current task */
+ struct task_struct *idle; /* idle task for this cpu */
+ void * sched_priv;
+ struct ac_timer s_timer; /* scheduling timer */
+#ifdef BUCKETS
+ u32 hist[BUCKETS]; /* for scheduler latency histogram */
+#endif
+} __cacheline_aligned schedule_data_t;
+
+
+typedef struct task_slice_st
+{
+ struct task_struct *task;
+ s_time_t time;
+} task_slice_t;
+
+struct scheduler
+{
+ char *name; /* full name for this scheduler */
+ char *opt_name; /* option name for this scheduler */
+ unsigned int sched_id; /* ID for this scheduler */
+
+ int (*init_scheduler) ();
+ int (*alloc_task) (struct task_struct *);
+ void (*add_task) (struct task_struct *);
+ void (*free_task) (struct task_struct *);
+ void (*rem_task) (struct task_struct *);
+ void (*wake_up) (struct task_struct *);
+ /* XXX why does do_block need to return anything at all? */
+ long (*do_block) (struct task_struct *);
+ task_slice_t (*do_schedule) (s_time_t);
+ int (*control) (struct sched_ctl_cmd *);
+ int (*adjdom) (struct task_struct *,
+ struct sched_adjdom_cmd *);
+ s32 (*reschedule) (struct task_struct *);
+ void (*dump_settings) (void);
+ void (*dump_cpu_state) (int);
+ void (*dump_runq_el) (struct task_struct *);
+};
+
+/* per CPU scheduler information */
+extern schedule_data_t schedule_data[];
+
+/*
+ * Wrappers for run-queue management. Must be called with the schedule_lock
+ * held.
+ */
+static inline void __add_to_runqueue_head(struct task_struct * p)
+{
+ list_add(&p->run_list, &schedule_data[p->processor].runqueue);
+}
+
+static inline void __add_to_runqueue_tail(struct task_struct * p)
+{
+ list_add_tail(&p->run_list, &schedule_data[p->processor].runqueue);
+}
+
+static inline void __del_from_runqueue(struct task_struct * p)
+{
+ list_del(&p->run_list);
+ p->run_list.next = NULL;
+}
+
+static inline int __task_on_runqueue(struct task_struct *p)
+{
+ return p->run_list.next != NULL;
+}
+
+#define next_domain(p) \
+ list_entry((p)->run_list.next, struct task_struct, run_list)
+
+
+static inline int __runqueue_empty(int cpu)
+{
+ return list_empty(&schedule_data[cpu].runqueue);
+}
diff --git a/xen/include/xeno/sched.h b/xen/include/xeno/sched.h
index 5f728565ca..dea80d0833 100644
--- a/xen/include/xeno/sched.h
+++ b/xen/include/xeno/sched.h
@@ -114,16 +114,9 @@ struct task_struct
s_time_t wokenup; /* time domain got woken up */
struct ac_timer timer; /* one-shot timer for timeout values */
- /* BVT scheduler specific. */
- unsigned long mcu_advance; /* inverse of weight */
- u32 avt; /* actual virtual time */
- u32 evt; /* effective virtual time */
- int warpback; /* warp? */
- long warp; /* virtual time warp */
- long warpl; /* warp limit */
- long warpu; /* unwarp time requirement */
- s_time_t warped; /* time it ran warped last time */
- s_time_t uwarped; /* time it ran unwarped last time */
+ s_time_t min_slice; /* minimum time before reschedule */
+
+ void *sched_priv; /* scheduler-specific data */
/* Network I/O */
net_vif_t *net_vif_list[MAX_DOMAIN_VIFS];
@@ -177,6 +170,7 @@ struct task_struct
#define TASK_UNINTERRUPTIBLE 2
#define TASK_STOPPED 4
#define TASK_DYING 8
+#define TASK_SCHED_PRIV 16
#include <asm/uaccess.h> /* for KERNEL_DS */
@@ -186,8 +180,6 @@ struct task_struct
domain: IDLE_DOMAIN_ID, \
state: TASK_RUNNING, \
has_cpu: 0, \
- evt: 0xffffffff, \
- avt: 0xffffffff, \
mm: IDLE0_MM, \
addr_limit: KERNEL_DS, \
thread: INIT_THREAD, \
@@ -202,9 +194,9 @@ extern struct task_struct *idle_task[NR_CPUS];
#include <xeno/slab.h>
-extern kmem_cache_t *task_struct_cachep;
-#define alloc_task_struct() \
- ((struct task_struct *)kmem_cache_alloc(task_struct_cachep,GFP_KERNEL))
+void free_task_struct(struct task_struct *p);
+struct task_struct *alloc_task_struct();
+
#define put_task_struct(_p) \
if ( atomic_dec_and_test(&(_p)->refcnt) ) release_task(_p)
#define get_task_struct(_p) \
@@ -251,15 +243,14 @@ extern spinlock_t schedule_lock[NR_CPUS] __cacheline_aligned;
void scheduler_init(void);
void schedulers_start(void);
void sched_add_domain(struct task_struct *p);
-int sched_rem_domain(struct task_struct *p);
-long sched_bvtctl(unsigned long ctx_allow);
-long sched_adjdom(domid_t dom, unsigned long mcu_adv, unsigned long warp,
- unsigned long warpl, unsigned long warpu);
+int sched_rem_domain(struct task_struct *p);
+long sched_ctl(struct sched_ctl_cmd *);
+long sched_adjdom(struct sched_adjdom_cmd *);
void init_idle_task(void);
void __wake_up(struct task_struct *p);
void wake_up(struct task_struct *p);
-unsigned long __reschedule(struct task_struct *p);
void reschedule(struct task_struct *p);
+unsigned long __reschedule(struct task_struct *p);
/* NB. Limited entry in Xen. Not for arbitrary use! */
asmlinkage void __enter_scheduler(void);
@@ -302,4 +293,4 @@ extern struct task_struct *task_list;
extern void update_process_times(int user);
-#endif
+#endif /*_LINUX_SCHED_H */