diff options
author | mwilli2@equilibrium.research.intel-research.net <mwilli2@equilibrium.research.intel-research.net> | 2004-04-19 14:52:17 +0000 |
---|---|---|
committer | mwilli2@equilibrium.research.intel-research.net <mwilli2@equilibrium.research.intel-research.net> | 2004-04-19 14:52:17 +0000 |
commit | 49073372e47d8ed329bfe7b7542880ddbd8c2c59 (patch) | |
tree | f0bd00b43801d2d44cbbe897a24dfe964562aaca | |
parent | 6294a3f8ca8be94d5dbfa2102289d7c9acc12c00 (diff) | |
download | xen-49073372e47d8ed329bfe7b7542880ddbd8c2c59.tar.gz xen-49073372e47d8ed329bfe7b7542880ddbd8c2c59.tar.bz2 xen-49073372e47d8ed329bfe7b7542880ddbd8c2c59.zip |
bitkeeper revision 1.825.9.1 (4083e7a1nl5KCSLH_RLGt0IXz-sw8g)
Various scheduler updates. Main points: modify Atropos unblocking behaviour,
add further documentation, add support for Atropos and Round-robin to
xc_dom_control.py
-rw-r--r-- | .rootkeys | 1 | ||||
-rw-r--r-- | docs/Sched-HOWTO.txt | 135 | ||||
-rw-r--r-- | docs/Xen-HOWTO.txt | 5 | ||||
-rw-r--r-- | docs/interface.tex | 79 | ||||
-rw-r--r-- | tools/examples/README | 22 | ||||
-rwxr-xr-x | tools/examples/xc_dom_control.py | 18 | ||||
-rw-r--r-- | tools/xc/lib/xc_atropos.c | 8 | ||||
-rw-r--r-- | tools/xc/py/Xc.c | 6 | ||||
-rw-r--r-- | xen/common/sched_atropos.c | 174 | ||||
-rw-r--r-- | xen/include/hypervisor-ifs/dom0_ops.h | 2 | ||||
-rw-r--r-- | xen/include/hypervisor-ifs/sched_ctl.h | 4 |
11 files changed, 326 insertions, 128 deletions
@@ -8,6 +8,7 @@ 3f69d8abYB1vMyD_QVDvzxy5Zscf1A TODO 405ef604hIZH5pGi2uwlrlSvUMrutw docs/Console-HOWTO.txt 3f9e7d53iC47UnlfORp9iC1vai6kWw docs/Makefile +4083e798FbE1MIsQaIYvjnx1uvFhBg docs/Sched-HOWTO.txt 40083bb4LVQzRqA3ABz0__pPhGNwtA docs/VBD-HOWTO.txt 4021053fmeFrEyPHcT8JFiDpLNgtHQ docs/Xen-HOWTO.txt 3f9e7d60PWZJeVh5xdnk0nLUdxlqEA docs/eps/xenlogo.eps diff --git a/docs/Sched-HOWTO.txt b/docs/Sched-HOWTO.txt new file mode 100644 index 0000000000..42d6238f17 --- /dev/null +++ b/docs/Sched-HOWTO.txt @@ -0,0 +1,135 @@ +Xen Scheduler HOWTO +=================== + +by Mark Williamson +(c) 2004 Intel Research Cambridge + + +Introduction +------------ + +Xen offers a choice of CPU schedulers. All available schedulers are +included in Xen at compile time and the administrator may select a +particular scheduler using a boot-time parameter to Xen. It is +expected that administrators will choose the scheduler most +appropriate to their application and configure the machine to boot +with that scheduler. + +Note: the default scheduler is the Borrowed Virtual Time (BVT) +scheduler which was also used in previous releases of Xen. No +configuration changes are required to keep using this scheduler. + +This file provides a brief description of the CPU schedulers available +in Xen, what they are useful for and the parameters that are used to +configure them. This information is necessarily fairly technical at +the moment. The recommended way to fully understand the scheduling +algorithms is to read the relevant research papers. + +The interface to the schedulers is basically "raw" at the moment, +without sanity checking - administrators should be careful when +setting the parameters since it is possible for a mistake to hang +domains, or the entire system (in particular, double check parameters +for sanity and make sure that DOM0 will get enough CPU time to remain +usable). Note that xc_dom_control.py takes time values in +nanoseconds. 
+ +Future tools will implement friendlier control interfaces. + + +Borrowed Virtual Time (BVT) +--------------------------- + +All releases of Xen have featured the BVT scheduler, which is used to +provide proportional fair shares of the CPU based on weights assigned +to domains. BVT is "work conserving" - the CPU will never be left +idle if there are runnable tasks. + +BVT uses "virtual time" to make decisions on which domain should be +scheduled on the processor. Each time a scheduling decision is +required, BVT evaluates the "Effective Virtual Time" of all domains +and then schedules the domain with the least EVT. Domains are allowed +to "borrow" virtual time by "time warping", which reduces their EVT by +a certain amount, so that they may be scheduled sooner. In order to +maintain long term fairness, there are limits on when a domain can +time warp and for how long. [ For more details read the SOSP'99 paper +by Duda and Cheriton ] + +In the Xen implementation, domains time warp when they unblock, so +that domain wakeup latencies are reduced. + +The BVT algorithm uses the following per-domain parameters (set using +xc_dom_control.py cpu_bvtset): + +* mcuadv - the MCU (Minimum Charging Unit) advance determines the + proportional share of the CPU that a domain receives. It + is set inversely proportionally to a domain's sharing weight. +* warp - the amount of "virtual time" the domain is allowed to warp + backwards +* warpl - the warp limit is the maximum time a domain can run warped for +* warpu - the unwarp requirement is the minimum time a domain must + run unwarped for before it can warp again + +BVT also has the following global parameter (set using +xc_dom_control.py cpu_bvtslice): + +* ctx_allow - the context switch allowance is similar to the "quantum" + in traditional schedulers. It is the minimum time that + a scheduled domain will be allowed to run before being + pre-empted. This prevents thrashing of the CPU. 
+ +BVT can now be selected by passing the 'sched=bvt' argument to Xen at +boot-time and is the default scheduler if no 'sched' argument is +supplied. + +Atropos +------- + +Atropos is a scheduler originally developed for the Nemesis multimedia +operating system. Atropos can be used to reserve absolute shares of +the CPU. It also includes some features to improve the efficiency of +domains that block for I/O and to allow spare CPU time to be shared +out. + +The Atropos algorithm has the following parameters for each domain +(set using xc_dom_control.py cpu_atropos_set): + + * slice - The length of time per period that a domain is guaranteed. + * period - The period over which a domain is guaranteed to receive + its slice of CPU time. + * latency - The latency hint is used to control how soon after + waking up a domain should be scheduled. + * xtratime - This is a true (1) / false (0) flag that specifies whether + a domain should be allowed a share of the system slack time. + +Every domain has an associated period and slice. The domain should +receive 'slice' nanoseconds every 'period' nanoseconds. This allows +the administrator to configure both the absolute share of the CPU a +domain receives and the frequency with which it is scheduled. When +domains unblock, their period is reduced to the value of the latency +hint (the slice is scaled accordingly so that they still get the same +proportion of the CPU). For each subsequent period, the slice and +period times are doubled until they reach their original values. + +Atropos is selected by adding 'sched=atropos' to Xen's boot-time +arguments. + +Note: don't overcommit the CPU when using Atropos (i.e. don't reserve +more CPU than is available - the utilisation should be kept to +slightly less than 100% in order to ensure predictable behaviour). + +Round-Robin +----------- + +The Round-Robin scheduler is provided as a simple example of Xen's +internal scheduler API. 
For production systems, one of the other +schedulers should be used, since they are more flexible and more +efficient. + +The Round-robin scheduler has one global parameter (set using +xc_dom_control.py cpu_rrobin_slice): + + * rr_slice - The time for which each domain runs before the next + scheduling decision is made. + +The Round-Robin scheduler can be selected by adding 'sched=rrobin' to +Xen's boot-time arguments. diff --git a/docs/Xen-HOWTO.txt b/docs/Xen-HOWTO.txt index 3665a2ce79..af88c12a62 100644 --- a/docs/Xen-HOWTO.txt +++ b/docs/Xen-HOWTO.txt @@ -231,6 +231,11 @@ The following is a list of command line arguments to pass to Xen: enabled in debug builds. Most users can ignore this feature completely. + sched=xxx Select the CPU scheduler Xen should use. The current + possibilities are 'bvt', 'atropos' and 'rrobin'. The + default is 'bvt'. For more information see + Sched-HOWTO.txt. + Boot into Domain 0 ============================== diff --git a/docs/interface.tex b/docs/interface.tex index 2df158fff8..46da27626a 100644 --- a/docs/interface.tex +++ b/docs/interface.tex @@ -392,26 +392,14 @@ assigned domains should be run there. \section{Standard Schedulers} -These BVT and Round Robin schedulers are part of the normal Xen -distribution. A port of the Atropos scheduler from the Nemesis -operating system is almost complete and will be added shortly. +These BVT, Atropos and Round Robin schedulers are part of the normal +Xen distribution. BVT provides proportional fair shares of the CPU to +the running domains. Atropos can be used to reserve absolute shares +of the CPU for each domain. Round-robin is provided as an example of +Xen's internal scheduler API. -\subsection{Borrowed Virtual Time (BVT)} - -This was the original Xen scheduler. BVT is designed for general-purpose -environments but also provides support for latency-sensitive threads. 
It -provides long-term weighted sharing but allows tasks a limited ability to -``warp back'' in virtual time so that they are dispatched earlier. - -BVT can be activated by specifying {\tt sched=bvt} as a boot argument to Xen. - -\subsection{Round Robin} - -The round robin scheduler is a very simple example of some of the basic parts -of the scheduler API. - -Round robin can be activated by specifying {\tt sched=rrobin} as a boot -argument to Xen. +More information on the characteristics and use of these schedulers is +available in { \tt Sched-HOWTO.txt }. \section{Scheduling API} @@ -419,9 +407,6 @@ The scheduling API is used by both the schedulers described above and should also be used by any new schedulers. It provides a generic interface and also implements much of the ``boilerplate'' code. -\paragraph*{Note:} the scheduler API is currently undergoing active development, -so there may be some changes to this API, although they are expected to be small. - Schedulers conforming to this API are described by the following structure: @@ -438,7 +423,7 @@ struct scheduler void (*free_task) (struct task_struct *); void (*rem_task) (struct task_struct *); void (*wake_up) (struct task_struct *); - long (*do_block) (struct task_struct *); + void (*do_block) (struct task_struct *); task_slice_t (*do_schedule) (s_time_t); int (*control) (struct sched_ctl_cmd *); int (*adjdom) (struct task_struct *, @@ -458,7 +443,7 @@ The fields of the above structure are described in more detail below. \subsubsection{name} -The name field is an arbitrary descriptive ASCII string. +The name field should point to a descriptive ASCII string. \subsubsection{opt\_name} @@ -486,22 +471,22 @@ selected. \paragraph*{Return values} -This should return negative on failure --- failure to initialise the scheduler -will cause an immediate panic. +This should return negative on failure --- this will cause an +immediate panic and the system will fail to boot. 
\subsubsection{alloc\_task} \paragraph*{Purpose} -This is called when a {\tt task\_struct} is allocated by the generic scheduler -layer. A particular scheduler implementation may use this method to allocate -per-task data for this task. It may use the {\tt sched\_priv} pointer in the -{\tt task\_struct} to point to this data. +Called when a {\tt task\_struct} is allocated by the generic scheduler +layer. A particular scheduler implementation may use this method to +allocate per-task data for this task. It may use the {\tt +sched\_priv} pointer in the {\tt task\_struct} to point to this data. \paragraph*{Call environment} The generic layer guarantees that the {\tt sched\_priv} field will remain intact from the time this method is called until the task is deallocated (so long as the scheduler implementation does not change -it!). +it explicitly!). \paragraph*{Return values} Negative on failure. @@ -536,7 +521,8 @@ this method is called. \paragraph*{Purpose} -This is called when a task is being removed from scheduling. +This is called when a task is being removed from scheduling (but is +not yet being freed). \subsubsection{wake\_up} @@ -547,8 +533,7 @@ Called when a task is woken up, this method should put the task on the runqueue \paragraph*{Call environment} -The generic layer guarantees that the task is already in state -RUNNING. +The task is already set to state RUNNING. \subsubsection{do\_block} @@ -560,7 +545,9 @@ not remove the task from the runqueue. \paragraph*{Call environment} The EVENTS\_MASTER\_ENABLE\_BIT is already set and the task state changed to -TASK\_INTERRUPTIBLE on entry to this method. +TASK\_INTERRUPTIBLE on entry to this method. A call to the {\tt + do\_schedule} method will be made after this method returns, in +order to select the next task to run. \subsubsection{do\_schedule} @@ -570,7 +557,7 @@ This method must be implemented. The method is called each time a new task must be chosen for scheduling on the current CPU. 
The current time is passed as the single argument (the current -task can be found using the {\tt current} variable). +task can be found using the {\tt current} macro). This method should select the next task to run on this CPU and set its minimum time to run as well as returning the data described below. @@ -585,7 +572,7 @@ which also performs all Xen-specific tasks and performs the actual task switch (unless the previous task has been chosen again). This method is called with the {\tt schedule\_lock} held for the current CPU -and with interrupts disabled. +and local interrupts disabled. \paragraph*{Return values} @@ -597,15 +584,16 @@ for (at maximum). \paragraph*{Purpose} This method is called for global scheduler control operations. It takes a -pointer to a {\tt struct sched\_ctl\_cmd}, from which it should select the -appropriate command data. +pointer to a {\tt struct sched\_ctl\_cmd}, which it should either +source data from or populate with data, depending on the value of the +{\tt direction} field. \paragraph*{Call environment} -The generic layer guarantees that when this method is called, the caller was -using the same control interface version and that the caller selected the -correct scheduler ID, hence the scheduler's implementation does not need to -sanity-check these parts of the call. +The generic layer guarantees that when this method is called, the +caller selected the correct scheduler ID, hence +the scheduler's implementation does not need to sanity-check these +parts of the call. \paragraph*{Return values} @@ -617,7 +605,9 @@ should either be 0 or an appropriate errno value. \paragraph*{Purpose} This method is called to adjust the scheduling parameters of a particular -domain. +domain, or to query their current values. The function should check +the {\tt direction} field of the {\tt sched\_adjdom\_cmd} it receives in +order to determine which of these operations is being performed. 
\paragraph*{Call environment} @@ -681,6 +671,7 @@ This method should dump any private settings for the specified task. This function is called with interrupts disabled and the {\tt schedule\_lock} for the task's CPU held. + \chapter{Debugging} Xen provides tools for debugging both Xen and guest OSes. Currently, the diff --git a/tools/examples/README b/tools/examples/README index 565b5c5c76..117f1091ef 100644 --- a/tools/examples/README +++ b/tools/examples/README @@ -16,27 +16,31 @@ send it (preferably with a little summary to go in this file) to xc_dom_control.py - general tool for controling running domains Usage: xc_dom_control.py [command] <params> + stop [dom] -- pause a domain start [dom] -- un-pause a domain - shutdown [dom] -- request a domain to shutdown (can specify 'all') + shutdown [dom] [[-w]] -- request a domain to shutdown (can specify 'all') (optionally wait for complete shutdown) destroy [dom] -- immediately terminate a domain pincpu [dom] [cpu] -- pin a domain to the specified CPU suspend [dom] [file] -- write domain's memory to a file and terminate (resume by re-running xc_dom_create with -L option) - restore [file] -- resume a domain from a file + unwatch [dom] -- kill the auto-restart daemon for a domain list -- print info about all domains listvbds -- print info about all virtual block devs cpu_bvtset [dom] [mcuadv] [warp] [warpl] [warpu] - -- set scheduling parameters for domain - cpu_bvtslice [slice] -- default scheduler slice + -- set BVT scheduling parameters for domain + cpu_bvtslice [slice] -- set default BVT scheduler slice + cpu_atropos_set [dom] [period] [slice] [latency] [xtratime] + -- set Atropos scheduling parameters for domain + cpu_rrobin_slice [slice] -- set Round Robin scheduler slice vif_stats [dom] [vif] -- get stats for a given network vif vif_addip [dom] [vif] [ip] -- add an IP address to a given vif vif_setsched [dom] [vif] [bytes] [usecs] -- rate limit vif bandwidth vif_getsched [dom] [vif] -- print vif's scheduling 
parameters vbd_add [dom] [uname] [dev] [mode] -- make disk/partition uname available to domain as dev e.g. 'vbd_add 2 phy:sda3 hda1 w' - vbd_remove [dom] [dev] -- remove disk or partition attached as 'dev' + vbd_remove [dom] [dev] -- remove disk or partition attached as 'dev' xc_dom_create.py @@ -89,7 +93,7 @@ Args to override the kernel command line, which is concatenated from these: -xc_vd_tool +xc_vd_tool.py - tool for manipulating virtual disks Usage: xc_vd_tool command <params> @@ -126,3 +130,9 @@ This is a Sys-V init script for RedHat systems. On a RedHat system it should be possible to issue commands to this script using the "service" command and to configure if / when it is run automatically, using the "chkconfig" command. + +xend +This is a Sys-V init script for RedHat systems, which can be used to +start the Xen Daemon (xend) at boot time. + + - Usage: xend {start|stop|status|restart|reload} diff --git a/tools/examples/xc_dom_control.py b/tools/examples/xc_dom_control.py index 0ca2c97ac2..19a2e16046 100755 --- a/tools/examples/xc_dom_control.py +++ b/tools/examples/xc_dom_control.py @@ -21,8 +21,11 @@ Usage: %s [command] <params> list -- print info about all domains listvbds -- print info about all virtual block devs cpu_bvtset [dom] [mcuadv] [warp] [warpl] [warpu] - -- set scheduling parameters for domain - cpu_bvtslice [slice] -- default scheduler slice + -- set BVT scheduling parameters for domain + cpu_bvtslice [slice] -- set default BVT scheduler slice + cpu_atropos_set [dom] [period] [slice] [latency] [xtratime] + -- set Atropos scheduling parameters for domain + cpu_rrobin_slice [slice] -- set Round Robin scheduler slice vif_stats [dom] [vif] -- get stats for a given network vif vif_addip [dom] [vif] [ip] -- add an IP address to a given vif vif_setsched [dom] [vif] [bytes] [usecs] -- rate limit vif bandwidth @@ -263,6 +266,17 @@ elif cmd == 'vbd_remove': print "Failed" sys.exit(1) +elif cmd == 'cpu_atropos_set': # args: dom period slice latency 
xtratime + if len(sys.argv) < 6: + usage() + sys.exit(1) + + (period, slice, latency, xtratime) = map(lambda x: int(x), sys.argv[3:7]) + + rc = xc.atropos_domain_set(dom, period, slice, latency, xtratime) + +elif cmd == 'cpu_rrobin_slice': + rc = xc.rrobin_global_set(slice=int(sys.argv[2])) else: usage() diff --git a/tools/xc/lib/xc_atropos.c b/tools/xc/lib/xc_atropos.c index 3b4535d96d..279324854c 100644 --- a/tools/xc/lib/xc_atropos.c +++ b/tools/xc/lib/xc_atropos.c @@ -20,8 +20,8 @@ int xc_atropos_domain_set(int xc_handle, op.u.adjustdom.sched_id = SCHED_ATROPOS; op.u.adjustdom.direction = SCHED_INFO_PUT; - p->period = period; - p->slice = slice; + p->nat_period = period; + p->nat_slice = slice; p->latency = latency; p->xtratime = xtratime; @@ -42,8 +42,8 @@ int xc_atropos_domain_get(int xc_handle, u64 domid, u64 *period, ret = do_dom0_op(xc_handle, &op); - *period = p->period; - *slice = p->slice; + *period = p->nat_period; + *slice = p->nat_slice; *latency = p->latency; *xtratime = p->xtratime; diff --git a/tools/xc/py/Xc.c b/tools/xc/py/Xc.c index 3cbbe7efa6..d2be92d9cc 100644 --- a/tools/xc/py/Xc.c +++ b/tools/xc/py/Xc.c @@ -999,7 +999,7 @@ static PyObject *pyxc_rrobin_global_get(PyObject *self, if ( xc_rrobin_global_get(xc->xc_handle, &slice) != 0 ) return PyErr_SetFromErrno(xc_error); - return Py_BuildValue("s:L", "slice", slice); + return Py_BuildValue("{s:L}", "slice", slice); } @@ -1130,7 +1130,7 @@ static PyMethodDef pyxc_methods[] = { " domain [long]: Domain ID.\n" " mcuadv [long]: MCU Advance.\n" " warp [long]: Warp.\n" - " warpu [long]:\n" + " warpu [long]: Unwarp requirement.\n" " warpl [long]: Warp limit,\n" }, @@ -1363,7 +1363,7 @@ static PyObject *PyXc_new(PyObject *self, PyObject *args) if ( (xc->xc_handle = xc_interface_open()) == -1 ) { PyObject_Del((PyObject *)xc); - return NULL; + return PyErr_SetFromErrno(xc_error); } return (PyObject *)xc; diff --git a/xen/common/sched_atropos.c b/xen/common/sched_atropos.c index 1a5fd792aa..8247aa4642 
100644 --- a/xen/common/sched_atropos.c +++ b/xen/common/sched_atropos.c @@ -27,7 +27,7 @@ #define Activation_Reason_Preempted 2 #define Activation_Reason_Extra 3 -/* The following will be used for atropos-specific per-domain data fields */ +/* Atropos-specific per-domain data */ struct at_dom_info { /* MAW Xen additions */ @@ -37,18 +37,20 @@ struct at_dom_info /* (what remains of) the original fields */ - s_time_t deadline; /* Next deadline */ - s_time_t prevddln; /* Previous deadline */ + s_time_t deadline; /* Next deadline */ + s_time_t prevddln; /* Previous deadline */ - s_time_t remain; /* Time remaining this period */ - s_time_t period; /* Period of time allocation */ - s_time_t slice; /* Length of allocation */ - s_time_t latency; /* Unblocking latency */ - - int xtratime; /* Prepared to accept extra? */ + s_time_t remain; /* Time remaining this period */ + s_time_t period; /* Current period of time allocation */ + s_time_t nat_period; /* Natural period */ + s_time_t slice; /* Current length of allocation */ + s_time_t nat_slice; /* Natural length of allocation */ + s_time_t latency; /* Unblocking latency */ + + int xtratime; /* Prepared to accept extra time? */ }; - +/* Atropos-specific per-CPU data */ struct at_cpu_info { struct list_head waitq; /* wait queue*/ @@ -62,9 +64,11 @@ struct at_cpu_info #define BESTEFFORT_QUANTUM MILLISECS(5) + /* SLAB cache for struct at_dom_info objects */ static kmem_cache_t *dom_info_cache; + /** calculate the length of a linked list */ static int q_len(struct list_head *q) { @@ -163,17 +167,15 @@ static void at_add_task(struct task_struct *p) DOM_INFO(p)->owner = p; p->lastschd = now; - if(is_idle_task(p)) - DOM_INFO(p)->slice = MILLISECS(5); - - /* DOM 0's scheduling parameters must be set here in order for it to boot - * the system! */ + /* DOM 0's parameters must be set here for it to boot the system! 
*/ if(p->domain == 0) { DOM_INFO(p)->remain = MILLISECS(15); - DOM_INFO(p)->period = MILLISECS(20); - DOM_INFO(p)->slice = MILLISECS(15); - DOM_INFO(p)->latency = MILLISECS(10); + DOM_INFO(p)->nat_period = + DOM_INFO(p)->period = MILLISECS(20); + DOM_INFO(p)->nat_slice = + DOM_INFO(p)->slice = MILLISECS(15); + DOM_INFO(p)->latency = MILLISECS(5); DOM_INFO(p)->xtratime = 1; DOM_INFO(p)->deadline = now; DOM_INFO(p)->prevddln = now; @@ -181,11 +183,13 @@ static void at_add_task(struct task_struct *p) else /* other domains run basically best effort unless otherwise set */ { DOM_INFO(p)->remain = 0; - DOM_INFO(p)->period = MILLISECS(10000); - DOM_INFO(p)->slice = MILLISECS(10); - DOM_INFO(p)->latency = MILLISECS(10000); + DOM_INFO(p)->nat_period = + DOM_INFO(p)->period = SECONDS(10); + DOM_INFO(p)->nat_slice = + DOM_INFO(p)->slice = MILLISECS(10); + DOM_INFO(p)->latency = SECONDS(10); DOM_INFO(p)->xtratime = 1; - DOM_INFO(p)->deadline = now + MILLISECS(10000); + DOM_INFO(p)->deadline = now + SECONDS(10); DOM_INFO(p)->prevddln = 0; } @@ -222,10 +226,19 @@ static void dequeue(struct task_struct *sdom) * This function deals with updating the sdom for a domain * which has just been unblocked. * - * ASSERT: On entry, the sdom has already been removed from the block - * queue (it can be done more efficiently if we know that it - * is on the head of the queue) but its deadline field has not been - * restored yet. + * Xen's Atropos treats unblocking slightly differently to Nemesis: + * + * - "Short blocking" domains (i.e. that unblock before their deadline has + * expired) are treated the same as in nemesis (put on the wait queue and + * given preferential treatment in selecting domains for extra time). + * + * - "Long blocking" domains do not simply have their period truncated to their + * unblocking latency as before but also have their slice recomputed to be the + * same fraction of their new period. 
Each time the domain is scheduled, the + * period and slice are doubled until they reach their original ("natural") + * values, as set by the user (and stored in nat_period and nat_slice). The + * idea is to give better response times to unblocking whilst preserving QoS + * guarantees to other domains. */ static void unblock(struct task_struct *sdom) { @@ -235,18 +248,27 @@ static void unblock(struct task_struct *sdom) dequeue(sdom); /* We distinguish two cases... short and long blocks */ - if ( inf->deadline < time ) { + + if ( inf->deadline < time ) + { + /* Long blocking case */ + /* The sdom has passed its deadline since it was blocked. Give it its new deadline based on the latency value. */ - inf->prevddln = time; + inf->prevddln = time; + + /* Scale the scheduling parameters as requested by the latency hint. */ inf->deadline = time + inf->latency; - inf->remain = inf->slice; - if(inf->remain > 0) - sdom->state = TASK_RUNNING; - else - sdom->state = ATROPOS_TASK_WAIT; - - } else { + inf->slice = inf->nat_slice / ( inf->nat_period / inf->latency ); + inf->period = inf->latency; + inf->remain = inf->slice; + + sdom->state = TASK_RUNNING; + } + else + { + /* Short blocking case */ + /* We leave REMAIN intact, but put this domain on the WAIT queue marked as recently unblocked. It will be given priority over other domains on the wait queue until while @@ -284,9 +306,8 @@ task_slice_t ksched_scheduler(s_time_t time) /* If we were spinning in the idle loop, there is no current * domain to deschedule. */ - if (is_idle_task(cur_sdom)) { + if (is_idle_task(cur_sdom)) goto deschedule_done; - } /***************************** * @@ -304,7 +325,8 @@ task_slice_t ksched_scheduler(s_time_t time) dequeue(cur_sdom); if ((cur_sdom->state == TASK_RUNNING) || - (cur_sdom->state == ATROPOS_TASK_UNBLOCKED)) { + (cur_sdom->state == ATROPOS_TASK_UNBLOCKED)) + { /* In this block, we are doing accounting for an sdom which has been running in contracted time. 
Note that this could now happen @@ -314,10 +336,11 @@ task_slice_t ksched_scheduler(s_time_t time) cur_info->remain -= ranfor; /* If guaranteed time has run out... */ - if ( cur_info->remain <= 0 ) { + if ( cur_info->remain <= 0 ) + { /* Move domain to correct position in WAIT queue */ /* XXX sdom_unblocked doesn't need this since it is - already in the correct place. */ + already in the correct place. */ cur_sdom->state = ATROPOS_TASK_WAIT; } } @@ -347,6 +370,20 @@ task_slice_t ksched_scheduler(s_time_t time) dequeue(sdom); + if ( inf->period != inf->nat_period ) + { + /* This domain has had its parameters adjusted as a result of + * unblocking and they need to be adjusted before requeuing it */ + inf->slice *= 2; + inf->period *= 2; + + if ( inf->period > inf->nat_period ) + { + inf->period = inf->nat_period; + inf->slice = inf->nat_slice; + } + } + /* Domain begins a new period and receives a slice of CPU * If this domain has been blocking then throw away the * rest of it's remain - it can't be trusted */ @@ -354,8 +391,10 @@ task_slice_t ksched_scheduler(s_time_t time) inf->remain = inf->slice; else inf->remain += inf->slice; + inf->prevddln = inf->deadline; inf->deadline += inf->period; + if(inf->remain > 0) sdom->state = TASK_RUNNING; else @@ -387,8 +426,8 @@ task_slice_t ksched_scheduler(s_time_t time) /* MAW - the idle domain is always on the run queue. We run from the * runqueue if it's NOT the idle domain or if there's nothing on the wait * queue */ - if (cur_sdom->domain == IDLE_DOMAIN_ID && !list_empty(WAITQ(cpu))) { - + if (cur_sdom->domain == IDLE_DOMAIN_ID && !list_empty(WAITQ(cpu))) + { struct list_head *item; /* Try running a domain on the WAIT queue - this part of the @@ -422,24 +461,23 @@ task_slice_t ksched_scheduler(s_time_t time) flag set. 
The NEXT_OPTM field is used to cheaply achieve an approximation of round-robin order */ list_for_each(item, WAITQ(cpu)) - { - struct at_dom_info *inf = - list_entry(item, struct at_dom_info, waitq); - - sdom = inf->owner; - - if (inf->xtratime && i >= waitq_rrobin) { - cur_sdom = sdom; - cur_info = inf; - newtime = time + BESTEFFORT_QUANTUM; - reason = Activation_Reason_Extra; - waitq_rrobin = i + 1; /* set this value ready for next */ - goto found; - } - - i++; + { + struct at_dom_info *inf = + list_entry(item, struct at_dom_info, waitq); + + sdom = inf->owner; + + if (inf->xtratime && i >= waitq_rrobin) { + cur_sdom = sdom; + cur_info = inf; + newtime = time + BESTEFFORT_QUANTUM; + reason = Activation_Reason_Extra; + waitq_rrobin = i + 1; /* set this value ready for next */ + goto found; } - + + i++; + } } found: @@ -519,15 +557,21 @@ static int at_adjdom(struct task_struct *p, struct sched_adjdom_cmd *cmd) { if ( cmd->direction == SCHED_INFO_PUT ) { - DOM_INFO(p)->period = cmd->u.atropos.period; - DOM_INFO(p)->slice = cmd->u.atropos.slice; + /* sanity checking! 
*/ + if( cmd->u.atropos.latency > cmd->u.atropos.nat_period + || cmd->u.atropos.latency == 0 + || cmd->u.atropos.nat_slice > cmd->u.atropos.nat_period ) + return -EINVAL; + + DOM_INFO(p)->nat_period = cmd->u.atropos.nat_period; + DOM_INFO(p)->nat_slice = cmd->u.atropos.nat_slice; DOM_INFO(p)->latency = cmd->u.atropos.latency; DOM_INFO(p)->xtratime = !!cmd->u.atropos.xtratime; } else if ( cmd->direction == SCHED_INFO_GET ) { - cmd->u.atropos.period = DOM_INFO(p)->period; - cmd->u.atropos.slice = DOM_INFO(p)->slice; + cmd->u.atropos.nat_period = DOM_INFO(p)->nat_period; + cmd->u.atropos.nat_slice = DOM_INFO(p)->nat_slice; cmd->u.atropos.latency = DOM_INFO(p)->latency; cmd->u.atropos.xtratime = DOM_INFO(p)->xtratime; } @@ -544,9 +588,6 @@ static int at_alloc_task(struct task_struct *p) if( (DOM_INFO(p) = kmem_cache_alloc(dom_info_cache, GFP_KERNEL)) == NULL ) return -1; - if(p->domain == IDLE_DOMAIN_ID) - printk("ALLOC IDLE ON CPU %d\n", p->processor); - memset(DOM_INFO(p), 0, sizeof(struct at_dom_info)); return 0; @@ -559,6 +600,7 @@ static void at_free_task(struct task_struct *p) kmem_cache_free( dom_info_cache, DOM_INFO(p) ); } + /* print decoded domain private state value (if known) */ static int at_prn_state(int state) { diff --git a/xen/include/hypervisor-ifs/dom0_ops.h b/xen/include/hypervisor-ifs/dom0_ops.h index 251f4853a4..905149f1d9 100644 --- a/xen/include/hypervisor-ifs/dom0_ops.h +++ b/xen/include/hypervisor-ifs/dom0_ops.h @@ -18,7 +18,7 @@ * This makes sure that old versions of dom0 tools will stop working in a * well-defined way (rather than crashing the machine, for instance). 
*/ -#define DOM0_INTERFACE_VERSION 0xAAAA000B +#define DOM0_INTERFACE_VERSION 0xAAAA000C #define MAX_CMD_LEN 256 #define MAX_DOMAIN_NAME 16 diff --git a/xen/include/hypervisor-ifs/sched_ctl.h b/xen/include/hypervisor-ifs/sched_ctl.h index a2e57c2b7c..cdf682963a 100644 --- a/xen/include/hypervisor-ifs/sched_ctl.h +++ b/xen/include/hypervisor-ifs/sched_ctl.h @@ -60,8 +60,8 @@ struct sched_adjdom_cmd struct atropos_adjdom { - u64 period; - u64 slice; + u64 nat_period; + u64 nat_slice; u64 latency; int xtratime; } atropos; |