author:    mwilli2@equilibrium.research.intel-research.net <mwilli2@equilibrium.research.intel-research.net>  2004-04-19 14:52:17 +0000
committer: mwilli2@equilibrium.research.intel-research.net <mwilli2@equilibrium.research.intel-research.net>  2004-04-19 14:52:17 +0000
commit:    49073372e47d8ed329bfe7b7542880ddbd8c2c59
tree:      f0bd00b43801d2d44cbbe897a24dfe964562aaca
parent:    6294a3f8ca8be94d5dbfa2102289d7c9acc12c00
bitkeeper revision 1.825.9.1 (4083e7a1nl5KCSLH_RLGt0IXz-sw8g)
Various scheduler updates. Main points: modify Atropos unblocking behaviour, add further documentation, add support for Atropos and Round-robin to xc_dom_control.py
-rw-r--r--  .rootkeys                                 1
-rw-r--r--  docs/Sched-HOWTO.txt                    135
-rw-r--r--  docs/Xen-HOWTO.txt                        5
-rw-r--r--  docs/interface.tex                       79
-rw-r--r--  tools/examples/README                    22
-rwxr-xr-x  tools/examples/xc_dom_control.py         18
-rw-r--r--  tools/xc/lib/xc_atropos.c                 8
-rw-r--r--  tools/xc/py/Xc.c                          6
-rw-r--r--  xen/common/sched_atropos.c              174
-rw-r--r--  xen/include/hypervisor-ifs/dom0_ops.h     2
-rw-r--r--  xen/include/hypervisor-ifs/sched_ctl.h    4
11 files changed, 326 insertions, 128 deletions
diff --git a/.rootkeys b/.rootkeys
index 57ac0c7f50..4c17a634ea 100644
--- a/.rootkeys
+++ b/.rootkeys
@@ -8,6 +8,7 @@
3f69d8abYB1vMyD_QVDvzxy5Zscf1A TODO
405ef604hIZH5pGi2uwlrlSvUMrutw docs/Console-HOWTO.txt
3f9e7d53iC47UnlfORp9iC1vai6kWw docs/Makefile
+4083e798FbE1MIsQaIYvjnx1uvFhBg docs/Sched-HOWTO.txt
40083bb4LVQzRqA3ABz0__pPhGNwtA docs/VBD-HOWTO.txt
4021053fmeFrEyPHcT8JFiDpLNgtHQ docs/Xen-HOWTO.txt
3f9e7d60PWZJeVh5xdnk0nLUdxlqEA docs/eps/xenlogo.eps
diff --git a/docs/Sched-HOWTO.txt b/docs/Sched-HOWTO.txt
new file mode 100644
index 0000000000..42d6238f17
--- /dev/null
+++ b/docs/Sched-HOWTO.txt
@@ -0,0 +1,135 @@
+Xen Scheduler HOWTO
+===================
+
+by Mark Williamson
+(c) 2004 Intel Research Cambridge
+
+
+Introduction
+------------
+
+Xen offers a choice of CPU schedulers. All available schedulers are
+included in Xen at compile time and the administrator may select a
+particular scheduler using a boot-time parameter to Xen. It is
+expected that administrators will choose the scheduler most
+appropriate to their application and configure the machine to boot
+with that scheduler.
+
+Note: the default scheduler is the Borrowed Virtual Time (BVT)
+scheduler which was also used in previous releases of Xen. No
+configuration changes are required to keep using this scheduler.
+
+This file provides a brief description of the CPU schedulers available
+in Xen, what they are useful for and the parameters that are used to
+configure them. This information is necessarily fairly technical at
+the moment. The recommended way to fully understand the scheduling
+algorithms is to read the relevant research papers.
+
+The interface to the schedulers is basically "raw" at the moment,
+without sanity checking - administrators should be careful when
+setting the parameters since it is possible for a mistake to hang
+domains, or the entire system (in particular, double check parameters
+for sanity and make sure that DOM0 will get enough CPU time to remain
+usable). Note that xc_dom_control.py takes time values in
+nanoseconds.
+
+Future tools will implement friendlier control interfaces.
+
+
+Borrowed Virtual Time (BVT)
+---------------------------
+
+All releases of Xen have featured the BVT scheduler, which is used to
+provide proportional fair shares of the CPU based on weights assigned
+to domains. BVT is "work conserving" - the CPU will never be left
+idle if there are runnable tasks.
+
+BVT uses "virtual time" to make decisions on which domain should be
+scheduled on the processor. Each time a scheduling decision is
+required, BVT evaluates the "Effective Virtual Time" of all domains
+and then schedules the domain with the least EVT. Domains are allowed
+to "borrow" virtual time by "time warping", which reduces their EVT by
+a certain amount, so that they may be scheduled sooner. In order to
+maintain long term fairness, there are limits on when a domain can
+time warp and for how long. [ For more details read the SOSP'99 paper
+by Duda and Cheriton ]
+
+In the Xen implementation, domains time warp when they unblock, so
+that domain wakeup latencies are reduced.
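+
+[ Roughly (a sketch based on the Duda and Cheriton paper, not on the
+  Xen source): each domain has an Actual Virtual Time (AVT) that
+  advances by its mcuadv for each MCU it runs, and its EVT is
+  EVT = AVT - (warp, while the domain is warped; 0 otherwise).
+  Always dispatching the least-EVT domain gives weighted fair sharing,
+  with warping providing a temporary dispatch advantage. ]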
+
+The BVT algorithm uses the following per-domain parameters (set using
+xc_dom_control.py cpu_bvtset; an example invocation follows the list):
+
+* mcuadv - the MCU (Minimum Charging Unit) advance determines the
+           proportional share of the CPU that a domain receives.  It
+           is inversely proportional to a domain's sharing weight.
+* warp - the amount of "virtual time" the domain is allowed to warp
+ backwards
+* warpl - the warp limit is the maximum time a domain can run warped for
+* warpu - the unwarp requirement is the minimum time a domain must
+ run unwarped for before it can warp again
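+
+  An illustrative invocation (argument order: dom, mcuadv, warp,
+  warpl, warpu; the values below are made-up examples, not
+  recommendations) giving domain 1 an mcuadv of 10 with the warping
+  parameters zeroed:
+
+      xc_dom_control.py cpu_bvtset 1 10 0 0 0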
+
+BVT also has the following global parameter (set using
+xc_dom_control.py cpu_bvtslice; see the example below):
+
+* ctx_allow - the context switch allowance is similar to the "quantum"
+ in traditional schedulers. It is the minimum time that
+                a scheduled domain will be allowed to run before being
+                pre-empted. This prevents thrashing of the CPU.
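+
+  For example, to set a 5ms context switch allowance (the value is in
+  nanoseconds and is illustrative only):
+
+      xc_dom_control.py cpu_bvtslice 5000000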
+
+BVT is selected by passing the 'sched=bvt' argument to Xen at
+boot-time; it is the default scheduler if no 'sched' argument is
+supplied.
+
+Atropos
+-------
+
+Atropos is a scheduler originally developed for the Nemesis multimedia
+operating system. Atropos can be used to reserve absolute shares of
+the CPU. It also includes some features to improve the efficiency of
+domains that block for I/O and to allow spare CPU time to be shared
+out.
+
+The Atropos algorithm has the following parameters for each domain
+(set using xc_dom_control.py cpu_atropos_set; examples follow below):
+
+ * slice - The length of time per period that a domain is guaranteed.
+ * period - The period over which a domain is guaranteed to receive
+ its slice of CPU time.
+ * latency - The latency hint is used to control how soon after
+ waking up a domain should be scheduled.
+ * xtratime - This is a true (1) / false (0) flag that specifies whether
+ a domain should be allowed a share of the system slack time.
+
+Every domain has an associated period and slice. The domain should
+receive 'slice' nanoseconds every 'period' nanoseconds. This allows
+the administrator to configure both the absolute share of the CPU a
+domain receives and the frequency with which it is scheduled. When
+domains unblock, their period is reduced to the value of the latency
+hint (the slice is scaled accordingly so that they still get the same
+proportion of the CPU). For each subsequent period, the slice and
+period times are doubled until they reach their original values.
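+
+As a worked example of this scaling: a domain with period 100ms,
+slice 10ms and latency hint 25ms that unblocks after its deadline has
+passed is given a 25ms period and a 2.5ms slice; on each subsequent
+period these are doubled (50ms / 5ms) until the original 100ms / 10ms
+values are restored.
+
+An illustrative invocation (argument order: dom, period, slice,
+latency, xtratime, with times in nanoseconds; the values are examples
+only), guaranteeing domain 1 10ms of CPU in every 100ms:
+
+    xc_dom_control.py cpu_atropos_set 1 100000000 10000000 25000000 1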
+
+Atropos is selected by adding 'sched=atropos' to Xen's boot-time
+arguments.
+
+Note: don't overcommit the CPU when using Atropos (i.e. don't reserve
+more CPU than is available - the utilisation should be kept to
+slightly less than 100% in order to ensure predictable behaviour).
+
+Round-Robin
+-----------
+
+The Round-Robin scheduler is provided as a simple example of Xen's
+internal scheduler API. For production systems, one of the other
+schedulers should be used, since they are more flexible and more
+efficient.
+
+The Round-robin scheduler has one global parameter (set using
+xc_dom_control.py cpu_rrobin_slice; an example follows the list):
+
+ * rr_slice - The time for which each domain runs before the next
+ scheduling decision is made.
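+
+   For example, to run each domain for 10ms at a time (the value is in
+   nanoseconds and is illustrative only):
+
+       xc_dom_control.py cpu_rrobin_slice 10000000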
+
+The Round-Robin scheduler can be selected by adding 'sched=rrobin' to
+Xen's boot-time arguments.
diff --git a/docs/Xen-HOWTO.txt b/docs/Xen-HOWTO.txt
index 3665a2ce79..af88c12a62 100644
--- a/docs/Xen-HOWTO.txt
+++ b/docs/Xen-HOWTO.txt
@@ -231,6 +231,11 @@ The following is a list of command line arguments to pass to Xen:
enabled in debug builds. Most users can ignore
this feature completely.
+ sched=xxx Select the CPU scheduler Xen should use. The current
+ possibilities are 'bvt', 'atropos' and 'rrobin'. The
+ default is 'bvt'. For more information see
+ Sched-HOWTO.txt.
+
Boot into Domain 0
==============================
diff --git a/docs/interface.tex b/docs/interface.tex
index 2df158fff8..46da27626a 100644
--- a/docs/interface.tex
+++ b/docs/interface.tex
@@ -392,26 +392,14 @@ assigned domains should be run there.
\section{Standard Schedulers}
-These BVT and Round Robin schedulers are part of the normal Xen
-distribution. A port of the Atropos scheduler from the Nemesis
-operating system is almost complete and will be added shortly.
+The BVT, Atropos and Round Robin schedulers are part of the normal
+Xen distribution.  BVT provides proportional fair shares of the CPU to
+the running domains. Atropos can be used to reserve absolute shares
+of the CPU for each domain. Round-robin is provided as an example of
+Xen's internal scheduler API.
-\subsection{Borrowed Virtual Time (BVT)}
-
-This was the original Xen scheduler. BVT is designed for general-purpose
-environments but also provides support for latency-sensitive threads. It
-provides long-term weighted sharing but allows tasks a limited ability to
-``warp back'' in virtual time so that they are dispatched earlier.
-
-BVT can be activated by specifying {\tt sched=bvt} as a boot argument to Xen.
-
-\subsection{Round Robin}
-
-The round robin scheduler is a very simple example of some of the basic parts
-of the scheduler API.
-
-Round robin can be activated by specifying {\tt sched=rrobin} as a boot
-argument to Xen.
+More information on the characteristics and use of these schedulers is
+available in {\tt Sched-HOWTO.txt}.
\section{Scheduling API}
@@ -419,9 +407,6 @@ The scheduling API is used by both the schedulers described above and should
also be used by any new schedulers. It provides a generic interface and also
implements much of the ``boilerplate'' code.
-\paragraph*{Note:} the scheduler API is currently undergoing active development,
-so there may be some changes to this API, although they are expected to be small.
-
Schedulers conforming to this API are described by the following
structure:
@@ -438,7 +423,7 @@ struct scheduler
void (*free_task) (struct task_struct *);
void (*rem_task) (struct task_struct *);
void (*wake_up) (struct task_struct *);
- long (*do_block) (struct task_struct *);
+ void (*do_block) (struct task_struct *);
task_slice_t (*do_schedule) (s_time_t);
int (*control) (struct sched_ctl_cmd *);
int (*adjdom) (struct task_struct *,
@@ -458,7 +443,7 @@ The fields of the above structure are described in more detail below.
\subsubsection{name}
-The name field is an arbitrary descriptive ASCII string.
+The name field should point to a descriptive ASCII string.
\subsubsection{opt\_name}
@@ -486,22 +471,22 @@ selected.
\paragraph*{Return values}
-This should return negative on failure --- failure to initialise the scheduler
-will cause an immediate panic.
+This should return negative on failure --- this will cause an
+immediate panic and the system will fail to boot.
\subsubsection{alloc\_task}
\paragraph*{Purpose}
-This is called when a {\tt task\_struct} is allocated by the generic scheduler
-layer. A particular scheduler implementation may use this method to allocate
-per-task data for this task. It may use the {\tt sched\_priv} pointer in the
-{\tt task\_struct} to point to this data.
+Called when a {\tt task\_struct} is allocated by the generic scheduler
+layer. A particular scheduler implementation may use this method to
+allocate per-task data for this task. It may use the {\tt
+sched\_priv} pointer in the {\tt task\_struct} to point to this data.
\paragraph*{Call environment}
The generic layer guarantees that the {\tt sched\_priv} field will
remain intact from the time this method is called until the task is
deallocated (so long as the scheduler implementation does not change
-it!).
+it explicitly!).
\paragraph*{Return values}
Negative on failure.
@@ -536,7 +521,8 @@ this method is called.
\paragraph*{Purpose}
-This is called when a task is being removed from scheduling.
+This is called when a task is being removed from scheduling (but is
+not yet being freed).
\subsubsection{wake\_up}
@@ -547,8 +533,7 @@ Called when a task is woken up, this method should put the task on the runqueue
\paragraph*{Call environment}
-The generic layer guarantees that the task is already in state
-RUNNING.
+The task is already set to state RUNNING.
\subsubsection{do\_block}
@@ -560,7 +545,9 @@ not remove the task from the runqueue.
\paragraph*{Call environment}
The EVENTS\_MASTER\_ENABLE\_BIT is already set and the task state changed to
-TASK\_INTERRUPTIBLE on entry to this method.
+TASK\_INTERRUPTIBLE on entry to this method. A call to the
+{\tt do\_schedule} method will be made after this method returns, in
+order to select the next task to run.
\subsubsection{do\_schedule}
@@ -570,7 +557,7 @@ This method must be implemented.
The method is called each time a new task must be chosen for scheduling on the
current CPU. The current time is passed as the single argument (the current
-task can be found using the {\tt current} variable).
+task can be found using the {\tt current} macro).
This method should select the next task to run on this CPU and set its minimum
time to run as well as returning the data described below.
@@ -585,7 +572,7 @@ which also performs all Xen-specific tasks and performs the actual task switch
(unless the previous task has been chosen again).
This method is called with the {\tt schedule\_lock} held for the current CPU
-and with interrupts disabled.
+and local interrupts disabled.
\paragraph*{Return values}
@@ -597,15 +584,16 @@ for (at maximum).
\paragraph*{Purpose}
This method is called for global scheduler control operations. It takes a
-pointer to a {\tt struct sched\_ctl\_cmd}, from which it should select the
-appropriate command data.
+pointer to a {\tt struct sched\_ctl\_cmd}, which it should either
+source data from or populate with data, depending on the value of the
+{\tt direction} field.
\paragraph*{Call environment}
-The generic layer guarantees that when this method is called, the caller was
-using the same control interface version and that the caller selected the
-correct scheduler ID, hence the scheduler's implementation does not need to
-sanity-check these parts of the call.
+The generic layer guarantees that when this method is called, the
+caller selected the correct scheduler ID, hence the scheduler's
+implementation does not need to sanity-check this part of the call.
\paragraph*{Return values}
@@ -617,7 +605,9 @@ should either be 0 or an appropriate errno value.
\paragraph*{Purpose}
This method is called to adjust the scheduling parameters of a particular
-domain.
+domain, or to query their current values. The function should check
+the {\tt direction} field of the {\tt sched\_adjdom\_cmd} it receives in
+order to determine which of these operations is being performed.
\paragraph*{Call environment}
@@ -681,6 +671,7 @@ This method should dump any private settings for the specified task.
This function is called with interrupts disabled and the {\tt schedule\_lock}
for the task's CPU held.
+
\chapter{Debugging}
Xen provides tools for debugging both Xen and guest OSes. Currently, the
diff --git a/tools/examples/README b/tools/examples/README
index 565b5c5c76..117f1091ef 100644
--- a/tools/examples/README
+++ b/tools/examples/README
@@ -16,27 +16,31 @@ send it (preferably with a little summary to go in this file) to
xc_dom_control.py
- general tool for controlling running domains
Usage: xc_dom_control.py [command] <params>
+
stop [dom] -- pause a domain
start [dom] -- un-pause a domain
- shutdown [dom] -- request a domain to shutdown (can specify 'all')
+ shutdown [dom] [[-w]] -- request a domain to shutdown (can specify 'all')
(optionally wait for complete shutdown)
destroy [dom] -- immediately terminate a domain
pincpu [dom] [cpu] -- pin a domain to the specified CPU
suspend [dom] [file] -- write domain's memory to a file and terminate
(resume by re-running xc_dom_create with -L option)
- restore [file] -- resume a domain from a file
+ unwatch [dom] -- kill the auto-restart daemon for a domain
list -- print info about all domains
listvbds -- print info about all virtual block devs
cpu_bvtset [dom] [mcuadv] [warp] [warpl] [warpu]
- -- set scheduling parameters for domain
- cpu_bvtslice [slice] -- default scheduler slice
+ -- set BVT scheduling parameters for domain
+ cpu_bvtslice [slice] -- set default BVT scheduler slice
+ cpu_atropos_set [dom] [period] [slice] [latency] [xtratime]
+ -- set Atropos scheduling parameters for domain
+ cpu_rrobin_slice [slice] -- set Round Robin scheduler slice
vif_stats [dom] [vif] -- get stats for a given network vif
vif_addip [dom] [vif] [ip] -- add an IP address to a given vif
vif_setsched [dom] [vif] [bytes] [usecs] -- rate limit vif bandwidth
vif_getsched [dom] [vif] -- print vif's scheduling parameters
vbd_add [dom] [uname] [dev] [mode] -- make disk/partition uname available to
domain as dev e.g. 'vbd_add 2 phy:sda3 hda1 w'
- vbd_remove [dom] [dev] -- remove disk or partition attached as 'dev'
+ vbd_remove [dom] [dev] -- remove disk or partition attached as 'dev'
xc_dom_create.py
@@ -89,7 +93,7 @@ Args to override the kernel command line, which is concatenated from these:
-xc_vd_tool
+xc_vd_tool.py
- tool for manipulating virtual disks
Usage: xc_vd_tool command <params>
@@ -126,3 +130,9 @@ This is a Sys-V init script for RedHat systems.
On a RedHat system it should be possible to issue commands to this
script using the "service" command and to configure if / when it is
run automatically, using the "chkconfig" command.
+
+xend
+This is a Sys-V init script for RedHat systems, which can be used to
+start the Xen Daemon (xend) at boot time.
+
+ - Usage: xend {start|stop|status|restart|reload}
diff --git a/tools/examples/xc_dom_control.py b/tools/examples/xc_dom_control.py
index 0ca2c97ac2..19a2e16046 100755
--- a/tools/examples/xc_dom_control.py
+++ b/tools/examples/xc_dom_control.py
@@ -21,8 +21,11 @@ Usage: %s [command] <params>
list -- print info about all domains
listvbds -- print info about all virtual block devs
cpu_bvtset [dom] [mcuadv] [warp] [warpl] [warpu]
- -- set scheduling parameters for domain
- cpu_bvtslice [slice] -- default scheduler slice
+ -- set BVT scheduling parameters for domain
+ cpu_bvtslice [slice] -- set default BVT scheduler slice
+ cpu_atropos_set [dom] [period] [slice] [latency] [xtratime]
+ -- set Atropos scheduling parameters for domain
+ cpu_rrobin_slice [slice] -- set Round Robin scheduler slice
vif_stats [dom] [vif] -- get stats for a given network vif
vif_addip [dom] [vif] [ip] -- add an IP address to a given vif
vif_setsched [dom] [vif] [bytes] [usecs] -- rate limit vif bandwidth
@@ -263,6 +266,17 @@ elif cmd == 'vbd_remove':
print "Failed"
sys.exit(1)
+elif cmd == 'cpu_atropos_set': # args: dom period slice latency xtratime
+ if len(sys.argv) < 6:
+ usage()
+ sys.exit(1)
+
+ (period, slice, latency, xtratime) = map(lambda x: int(x), sys.argv[3:7])
+
+ rc = xc.atropos_domain_set(dom, period, slice, latency, xtratime)
+
+elif cmd == 'cpu_rrobin_slice':
+ rc = xc.rrobin_global_set(slice=int(sys.argv[2]))
else:
usage()
diff --git a/tools/xc/lib/xc_atropos.c b/tools/xc/lib/xc_atropos.c
index 3b4535d96d..279324854c 100644
--- a/tools/xc/lib/xc_atropos.c
+++ b/tools/xc/lib/xc_atropos.c
@@ -20,8 +20,8 @@ int xc_atropos_domain_set(int xc_handle,
op.u.adjustdom.sched_id = SCHED_ATROPOS;
op.u.adjustdom.direction = SCHED_INFO_PUT;
- p->period = period;
- p->slice = slice;
+ p->nat_period = period;
+ p->nat_slice = slice;
p->latency = latency;
p->xtratime = xtratime;
@@ -42,8 +42,8 @@ int xc_atropos_domain_get(int xc_handle, u64 domid, u64 *period,
ret = do_dom0_op(xc_handle, &op);
- *period = p->period;
- *slice = p->slice;
+ *period = p->nat_period;
+ *slice = p->nat_slice;
*latency = p->latency;
*xtratime = p->xtratime;
diff --git a/tools/xc/py/Xc.c b/tools/xc/py/Xc.c
index 3cbbe7efa6..d2be92d9cc 100644
--- a/tools/xc/py/Xc.c
+++ b/tools/xc/py/Xc.c
@@ -999,7 +999,7 @@ static PyObject *pyxc_rrobin_global_get(PyObject *self,
if ( xc_rrobin_global_get(xc->xc_handle, &slice) != 0 )
return PyErr_SetFromErrno(xc_error);
- return Py_BuildValue("s:L", "slice", slice);
+ return Py_BuildValue("{s:L}", "slice", slice);
}
@@ -1130,7 +1130,7 @@ static PyMethodDef pyxc_methods[] = {
" domain [long]: Domain ID.\n"
" mcuadv [long]: MCU Advance.\n"
" warp [long]: Warp.\n"
- " warpu [long]:\n"
+ " warpu [long]: Unwarp requirement.\n"
" warpl [long]: Warp limit,\n"
},
@@ -1363,7 +1363,7 @@ static PyObject *PyXc_new(PyObject *self, PyObject *args)
if ( (xc->xc_handle = xc_interface_open()) == -1 )
{
PyObject_Del((PyObject *)xc);
- return NULL;
+ return PyErr_SetFromErrno(xc_error);
}
return (PyObject *)xc;
diff --git a/xen/common/sched_atropos.c b/xen/common/sched_atropos.c
index 1a5fd792aa..8247aa4642 100644
--- a/xen/common/sched_atropos.c
+++ b/xen/common/sched_atropos.c
@@ -27,7 +27,7 @@
#define Activation_Reason_Preempted 2
#define Activation_Reason_Extra 3
-/* The following will be used for atropos-specific per-domain data fields */
+/* Atropos-specific per-domain data */
struct at_dom_info
{
/* MAW Xen additions */
@@ -37,18 +37,20 @@ struct at_dom_info
/* (what remains of) the original fields */
- s_time_t deadline; /* Next deadline */
- s_time_t prevddln; /* Previous deadline */
+ s_time_t deadline; /* Next deadline */
+ s_time_t prevddln; /* Previous deadline */
- s_time_t remain; /* Time remaining this period */
- s_time_t period; /* Period of time allocation */
- s_time_t slice; /* Length of allocation */
- s_time_t latency; /* Unblocking latency */
-
- int xtratime; /* Prepared to accept extra? */
+ s_time_t remain; /* Time remaining this period */
+ s_time_t period; /* Current period of time allocation */
+ s_time_t nat_period; /* Natural period */
+ s_time_t slice; /* Current length of allocation */
+ s_time_t nat_slice; /* Natural length of allocation */
+ s_time_t latency; /* Unblocking latency */
+
+ int xtratime; /* Prepared to accept extra time? */
};
-
+/* Atropos-specific per-CPU data */
struct at_cpu_info
{
struct list_head waitq; /* wait queue*/
@@ -62,9 +64,11 @@ struct at_cpu_info
#define BESTEFFORT_QUANTUM MILLISECS(5)
+
/* SLAB cache for struct at_dom_info objects */
static kmem_cache_t *dom_info_cache;
+
/** calculate the length of a linked list */
static int q_len(struct list_head *q)
{
@@ -163,17 +167,15 @@ static void at_add_task(struct task_struct *p)
DOM_INFO(p)->owner = p;
p->lastschd = now;
- if(is_idle_task(p))
- DOM_INFO(p)->slice = MILLISECS(5);
-
- /* DOM 0's scheduling parameters must be set here in order for it to boot
- * the system! */
+ /* DOM 0's parameters must be set here for it to boot the system! */
if(p->domain == 0)
{
DOM_INFO(p)->remain = MILLISECS(15);
- DOM_INFO(p)->period = MILLISECS(20);
- DOM_INFO(p)->slice = MILLISECS(15);
- DOM_INFO(p)->latency = MILLISECS(10);
+ DOM_INFO(p)->nat_period =
+ DOM_INFO(p)->period = MILLISECS(20);
+ DOM_INFO(p)->nat_slice =
+ DOM_INFO(p)->slice = MILLISECS(15);
+ DOM_INFO(p)->latency = MILLISECS(5);
DOM_INFO(p)->xtratime = 1;
DOM_INFO(p)->deadline = now;
DOM_INFO(p)->prevddln = now;
@@ -181,11 +183,13 @@ static void at_add_task(struct task_struct *p)
else /* other domains run basically best effort unless otherwise set */
{
DOM_INFO(p)->remain = 0;
- DOM_INFO(p)->period = MILLISECS(10000);
- DOM_INFO(p)->slice = MILLISECS(10);
- DOM_INFO(p)->latency = MILLISECS(10000);
+ DOM_INFO(p)->nat_period =
+ DOM_INFO(p)->period = SECONDS(10);
+ DOM_INFO(p)->nat_slice =
+ DOM_INFO(p)->slice = MILLISECS(10);
+ DOM_INFO(p)->latency = SECONDS(10);
DOM_INFO(p)->xtratime = 1;
- DOM_INFO(p)->deadline = now + MILLISECS(10000);
+ DOM_INFO(p)->deadline = now + SECONDS(10);
DOM_INFO(p)->prevddln = 0;
}
@@ -222,10 +226,19 @@ static void dequeue(struct task_struct *sdom)
* This function deals with updating the sdom for a domain
* which has just been unblocked.
*
- * ASSERT: On entry, the sdom has already been removed from the block
- * queue (it can be done more efficiently if we know that it
- * is on the head of the queue) but its deadline field has not been
- * restored yet.
+ * Xen's Atropos treats unblocking slightly differently to Nemesis:
+ *
+ * - "Short blocking" domains (i.e. that unblock before their deadline has
+ * expired) are treated the same as in nemesis (put on the wait queue and
+ * given preferential treatment in selecting domains for extra time).
+ *
+ * - "Long blocking" domains do not simply have their period truncated to their
+ * unblocking latency as before but also have their slice recomputed to be the
+ * same fraction of their new period. Each time the domain is scheduled, the
+ * period and slice are doubled until they reach their original ("natural")
+ * values, as set by the user (and stored in nat_period and nat_slice). The
+ * idea is to give better response times to unblocking whilst preserving QoS
+ * guarantees to other domains.
*/
static void unblock(struct task_struct *sdom)
{
@@ -235,18 +248,27 @@ static void unblock(struct task_struct *sdom)
dequeue(sdom);
/* We distinguish two cases... short and long blocks */
- if ( inf->deadline < time ) {
+
+ if ( inf->deadline < time )
+ {
+ /* Long blocking case */
+
/* The sdom has passed its deadline since it was blocked.
Give it its new deadline based on the latency value. */
- inf->prevddln = time;
+ inf->prevddln = time;
+
+ /* Scale the scheduling parameters as requested by the latency hint. */
inf->deadline = time + inf->latency;
- inf->remain = inf->slice;
- if(inf->remain > 0)
- sdom->state = TASK_RUNNING;
- else
- sdom->state = ATROPOS_TASK_WAIT;
-
- } else {
+ inf->slice = inf->nat_slice / ( inf->nat_period / inf->latency );
+ inf->period = inf->latency;
+ inf->remain = inf->slice;
+
+ sdom->state = TASK_RUNNING;
+ }
+ else
+ {
+ /* Short blocking case */
+
/* We leave REMAIN intact, but put this domain on the WAIT
queue marked as recently unblocked. It will be given
priority over other domains on the wait queue until while
@@ -284,9 +306,8 @@ task_slice_t ksched_scheduler(s_time_t time)
/* If we were spinning in the idle loop, there is no current
* domain to deschedule. */
- if (is_idle_task(cur_sdom)) {
+ if (is_idle_task(cur_sdom))
goto deschedule_done;
- }
/*****************************
*
@@ -304,7 +325,8 @@ task_slice_t ksched_scheduler(s_time_t time)
dequeue(cur_sdom);
if ((cur_sdom->state == TASK_RUNNING) ||
- (cur_sdom->state == ATROPOS_TASK_UNBLOCKED)) {
+ (cur_sdom->state == ATROPOS_TASK_UNBLOCKED))
+ {
/* In this block, we are doing accounting for an sdom which has
been running in contracted time. Note that this could now happen
@@ -314,10 +336,11 @@ task_slice_t ksched_scheduler(s_time_t time)
cur_info->remain -= ranfor;
/* If guaranteed time has run out... */
- if ( cur_info->remain <= 0 ) {
+ if ( cur_info->remain <= 0 )
+ {
/* Move domain to correct position in WAIT queue */
/* XXX sdom_unblocked doesn't need this since it is
- already in the correct place. */
+ already in the correct place. */
cur_sdom->state = ATROPOS_TASK_WAIT;
}
}
@@ -347,6 +370,20 @@ task_slice_t ksched_scheduler(s_time_t time)
dequeue(sdom);
+ if ( inf->period != inf->nat_period )
+ {
+ /* This domain has had its parameters adjusted as a result of
+ * unblocking and they need to be adjusted before requeuing it */
+ inf->slice *= 2;
+ inf->period *= 2;
+
+ if ( inf->period > inf->nat_period )
+ {
+ inf->period = inf->nat_period;
+ inf->slice = inf->nat_slice;
+ }
+ }
+
/* Domain begins a new period and receives a slice of CPU
* If this domain has been blocking then throw away the
* rest of it's remain - it can't be trusted */
@@ -354,8 +391,10 @@ task_slice_t ksched_scheduler(s_time_t time)
inf->remain = inf->slice;
else
inf->remain += inf->slice;
+
inf->prevddln = inf->deadline;
inf->deadline += inf->period;
+
if(inf->remain > 0)
sdom->state = TASK_RUNNING;
else
@@ -387,8 +426,8 @@ task_slice_t ksched_scheduler(s_time_t time)
/* MAW - the idle domain is always on the run queue. We run from the
* runqueue if it's NOT the idle domain or if there's nothing on the wait
* queue */
- if (cur_sdom->domain == IDLE_DOMAIN_ID && !list_empty(WAITQ(cpu))) {
-
+ if (cur_sdom->domain == IDLE_DOMAIN_ID && !list_empty(WAITQ(cpu)))
+ {
struct list_head *item;
/* Try running a domain on the WAIT queue - this part of the
@@ -422,24 +461,23 @@ task_slice_t ksched_scheduler(s_time_t time)
flag set. The NEXT_OPTM field is used to cheaply achieve
an approximation of round-robin order */
list_for_each(item, WAITQ(cpu))
- {
- struct at_dom_info *inf =
- list_entry(item, struct at_dom_info, waitq);
-
- sdom = inf->owner;
-
- if (inf->xtratime && i >= waitq_rrobin) {
- cur_sdom = sdom;
- cur_info = inf;
- newtime = time + BESTEFFORT_QUANTUM;
- reason = Activation_Reason_Extra;
- waitq_rrobin = i + 1; /* set this value ready for next */
- goto found;
- }
-
- i++;
+ {
+ struct at_dom_info *inf =
+ list_entry(item, struct at_dom_info, waitq);
+
+ sdom = inf->owner;
+
+ if (inf->xtratime && i >= waitq_rrobin) {
+ cur_sdom = sdom;
+ cur_info = inf;
+ newtime = time + BESTEFFORT_QUANTUM;
+ reason = Activation_Reason_Extra;
+ waitq_rrobin = i + 1; /* set this value ready for next */
+ goto found;
}
-
+
+ i++;
+ }
}
found:
@@ -519,15 +557,21 @@ static int at_adjdom(struct task_struct *p, struct sched_adjdom_cmd *cmd)
{
if ( cmd->direction == SCHED_INFO_PUT )
{
- DOM_INFO(p)->period = cmd->u.atropos.period;
- DOM_INFO(p)->slice = cmd->u.atropos.slice;
+ /* sanity checking! */
+ if( cmd->u.atropos.latency > cmd->u.atropos.nat_period
+ || cmd->u.atropos.latency == 0
+ || cmd->u.atropos.nat_slice > cmd->u.atropos.nat_period )
+ return -EINVAL;
+
+ DOM_INFO(p)->nat_period = cmd->u.atropos.nat_period;
+ DOM_INFO(p)->nat_slice = cmd->u.atropos.nat_slice;
DOM_INFO(p)->latency = cmd->u.atropos.latency;
DOM_INFO(p)->xtratime = !!cmd->u.atropos.xtratime;
}
else if ( cmd->direction == SCHED_INFO_GET )
{
- cmd->u.atropos.period = DOM_INFO(p)->period;
- cmd->u.atropos.slice = DOM_INFO(p)->slice;
+ cmd->u.atropos.nat_period = DOM_INFO(p)->nat_period;
+ cmd->u.atropos.nat_slice = DOM_INFO(p)->nat_slice;
cmd->u.atropos.latency = DOM_INFO(p)->latency;
cmd->u.atropos.xtratime = DOM_INFO(p)->xtratime;
}
@@ -544,9 +588,6 @@ static int at_alloc_task(struct task_struct *p)
if( (DOM_INFO(p) = kmem_cache_alloc(dom_info_cache, GFP_KERNEL)) == NULL )
return -1;
- if(p->domain == IDLE_DOMAIN_ID)
- printk("ALLOC IDLE ON CPU %d\n", p->processor);
-
memset(DOM_INFO(p), 0, sizeof(struct at_dom_info));
return 0;
@@ -559,6 +600,7 @@ static void at_free_task(struct task_struct *p)
kmem_cache_free( dom_info_cache, DOM_INFO(p) );
}
+
/* print decoded domain private state value (if known) */
static int at_prn_state(int state)
{
diff --git a/xen/include/hypervisor-ifs/dom0_ops.h b/xen/include/hypervisor-ifs/dom0_ops.h
index 251f4853a4..905149f1d9 100644
--- a/xen/include/hypervisor-ifs/dom0_ops.h
+++ b/xen/include/hypervisor-ifs/dom0_ops.h
@@ -18,7 +18,7 @@
* This makes sure that old versions of dom0 tools will stop working in a
* well-defined way (rather than crashing the machine, for instance).
*/
-#define DOM0_INTERFACE_VERSION 0xAAAA000B
+#define DOM0_INTERFACE_VERSION 0xAAAA000C
#define MAX_CMD_LEN 256
#define MAX_DOMAIN_NAME 16
diff --git a/xen/include/hypervisor-ifs/sched_ctl.h b/xen/include/hypervisor-ifs/sched_ctl.h
index a2e57c2b7c..cdf682963a 100644
--- a/xen/include/hypervisor-ifs/sched_ctl.h
+++ b/xen/include/hypervisor-ifs/sched_ctl.h
@@ -60,8 +60,8 @@ struct sched_adjdom_cmd
struct atropos_adjdom
{
- u64 period;
- u64 slice;
+ u64 nat_period;
+ u64 nat_slice;
u64 latency;
int xtratime;
} atropos;