author     Dario Faggioli <dario.faggioli@citrix.com>    2013-04-17 10:57:32 +0000
committer  Ian Campbell <ian.campbell@citrix.com>        2013-04-17 12:11:14 +0100
commit     b5b79a12c41b5e76af9d47551027b56f210d9029 (patch)
tree       05125acfb1348b09f63830577687a55f73d0b324 /xen/common
parent     cfcc144ff1ce59a9f93a44bbc89d1e20f5011c3d (diff)
xen: allow for explicitly specifying node-affinity
Make it possible to pass the node-affinity of a domain to the hypervisor
from the upper layers, instead of always being computed automatically.

Note that this also required generalizing the Flask hooks for setting and
getting the affinity, so that they now deal with both vcpu and node
affinity.

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
Acked-by: Daniel De Graaf <dgdegra@tycho.nsa.gov>
Acked-by: George Dunlap <george.dunlap@eu.citrix.com>
Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com>
Acked-by: Keir Fraser <keir@xen.org>
Diffstat (limited to 'xen/common')
-rw-r--r--  xen/common/domain.c        | 60
-rw-r--r--  xen/common/domctl.c        | 17
-rw-r--r--  xen/common/keyhandler.c    | 11
-rw-r--r--  xen/common/sched_credit.c  | 48
-rw-r--r--  xen/common/schedule.c      |  5
5 files changed, 133 insertions, 8 deletions
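For context (not part of the patch), the new control flow is roughly: the toolstack issues XEN_DOMCTL_setnodeaffinity, do_domctl() converts the guest-supplied bitmap into a nodemask_t and calls domain_set_node_affinity(), which either records the explicit mask or switches back to automatic mode; domain_update_node_affinity() then pushes the result to the scheduler through the new set_node_affinity hook. A minimal sketch of an in-hypervisor caller of the new interface could look as follows; the wrapper name is invented, only domain_set_node_affinity(), node_set() and NODE_MASK_NONE come from the tree:

/* Hedged sketch: restrict a domain to NUMA nodes 0 and 1 from inside Xen.
 * example_pin_to_nodes() is an invented name; it assumes a valid d. */
static int example_pin_to_nodes(struct domain *d)
{
    nodemask_t nodes = NODE_MASK_NONE;   /* start from an empty node mask */

    node_set(0, nodes);                  /* allow node 0 */
    node_set(1, nodes);                  /* allow node 1 */

    /*
     * Per the code below: an empty mask fails with -EINVAL, while a full
     * mask means "go back to automatically computed node-affinity".
     */
    return domain_set_node_affinity(d, &nodes);
}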
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 590548e101..ce45d66b45 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -224,6 +224,7 @@ struct domain *domain_create(
spin_lock_init(&d->node_affinity_lock);
d->node_affinity = NODE_MASK_ALL;
+ d->auto_node_affinity = 1;
spin_lock_init(&d->shutdown_lock);
d->shutdown_code = -1;
@@ -364,11 +365,32 @@ void domain_update_node_affinity(struct domain *d)
cpumask_or(cpumask, cpumask, online_affinity);
}
- for_each_online_node ( node )
- if ( cpumask_intersects(&node_to_cpumask(node), cpumask) )
- node_set(node, nodemask);
+ if ( d->auto_node_affinity )
+ {
+ /* Node-affinity is automatically computed from all vcpu-affinities */
+ for_each_online_node ( node )
+ if ( cpumask_intersects(&node_to_cpumask(node), cpumask) )
+ node_set(node, nodemask);
+
+ d->node_affinity = nodemask;
+ }
+ else
+ {
+ /* Node-affinity is provided by someone else; just filter out cpus
+ * that are either offline or not in the affinity of any vcpus. */
+ nodemask = d->node_affinity;
+ for_each_node_mask ( node, d->node_affinity )
+ if ( !cpumask_intersects(&node_to_cpumask(node), cpumask) )
+ node_clear(node, nodemask);
+
+ /* Avoid losing track of node-affinity just because a bad
+ * vcpu-affinity has been specified. */
+ if ( !nodes_empty(nodemask) )
+ d->node_affinity = nodemask;
+ }
+
+ sched_set_node_affinity(d, &d->node_affinity);
- d->node_affinity = nodemask;
spin_unlock(&d->node_affinity_lock);
free_cpumask_var(online_affinity);
@@ -376,6 +398,36 @@ void domain_update_node_affinity(struct domain *d)
}
+int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity)
+{
+ /* Being affine with no nodes is just wrong */
+ if ( nodes_empty(*affinity) )
+ return -EINVAL;
+
+ spin_lock(&d->node_affinity_lock);
+
+ /*
+ * Being/becoming explicitly affine to all nodes is not particularly
+ * useful. Let's take it as the `reset node affinity` command.
+ */
+ if ( nodes_full(*affinity) )
+ {
+ d->auto_node_affinity = 1;
+ goto out;
+ }
+
+ d->auto_node_affinity = 0;
+ d->node_affinity = *affinity;
+
+out:
+ spin_unlock(&d->node_affinity_lock);
+
+ domain_update_node_affinity(d);
+
+ return 0;
+}
+
+
struct domain *get_domain_by_id(domid_t dom)
{
struct domain *d;
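To make the new behaviour of domain_update_node_affinity() concrete, here is a small self-contained model of the explicit-affinity branch, using plain bitmasks instead of nodemask_t/cpumask_t; all names are illustrative and none of this is Xen code:

/* Minimal standalone model of the explicit node-affinity filtering above. */
#include <stdio.h>

#define NODE_BIT(n) (1u << (n))

/* usable_nodes plays the role of "nodes whose cpus intersect the vcpus'
 * affinity" (cpumask_intersects(&node_to_cpumask(node), cpumask) above). */
static unsigned int filter_explicit_affinity(unsigned int requested,
                                             unsigned int usable_nodes,
                                             unsigned int current)
{
    unsigned int filtered = requested & usable_nodes;

    /* Keep the previous node-affinity rather than losing track of it
     * when a bad vcpu-affinity empties the intersection. */
    return filtered ? filtered : current;
}

int main(void)
{
    /* Domain asked for nodes {0,1}, but only node 1 still has usable cpus. */
    unsigned int result = filter_explicit_affinity(NODE_BIT(0) | NODE_BIT(1),
                                                   NODE_BIT(1),
                                                   NODE_BIT(0) | NODE_BIT(1));

    printf("effective node-affinity: 0x%x\n", result);  /* prints 0x2 */
    return 0;
}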
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index 37532c25c6..6bd8efdf06 100644
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -560,6 +560,23 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
}
break;
+ case XEN_DOMCTL_setnodeaffinity:
+ {
+ nodemask_t new_affinity;
+
+ ret = xenctl_bitmap_to_nodemask(&new_affinity,
+ &op->u.nodeaffinity.nodemap);
+ if ( !ret )
+ ret = domain_set_node_affinity(d, &new_affinity);
+ }
+ break;
+ case XEN_DOMCTL_getnodeaffinity:
+ {
+ ret = nodemask_to_xenctl_bitmap(&op->u.nodeaffinity.nodemap,
+ &d->node_affinity);
+ }
+ break;
+
case XEN_DOMCTL_setvcpuaffinity:
case XEN_DOMCTL_getvcpuaffinity:
{
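The nodemap travels as a xenctl_bitmap (roughly, a guest-supplied byte array plus a bit count) and is converted by xenctl_bitmap_to_nodemask() / nodemask_to_xenctl_bitmap(). The following standalone snippet models that conversion in simplified form, using a flat byte buffer and a 32-bit mask; it is an illustration, not the Xen helpers:

/* Simplified, standalone model of turning a byte bitmap into a node mask. */
#include <stdint.h>
#include <stdio.h>

static unsigned int bytes_to_nodemask(const uint8_t *bytes, unsigned int nr_bits)
{
    unsigned int mask = 0, bit;

    for ( bit = 0; bit < nr_bits && bit < 32; bit++ )
        if ( bytes[bit / 8] & (1u << (bit % 8)) )
            mask |= 1u << bit;

    return mask;
}

int main(void)
{
    uint8_t buf[1] = { 0x05 };                              /* nodes 0 and 2 */

    printf("nodemask: 0x%x\n", bytes_to_nodemask(buf, 8));  /* prints 0x5 */
    return 0;
}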
diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c
index e9ef45f462..507213360c 100644
--- a/xen/common/keyhandler.c
+++ b/xen/common/keyhandler.c
@@ -218,6 +218,14 @@ static void cpuset_print(char *set, int size, const cpumask_t *mask)
*set++ = '\0';
}
+static void nodeset_print(char *set, int size, const nodemask_t *mask)
+{
+ *set++ = '[';
+ set += nodelist_scnprintf(set, size-2, mask);
+ *set++ = ']';
+ *set++ = '\0';
+}
+
static void periodic_timer_print(char *str, int size, uint64_t period)
{
if ( period == 0 )
@@ -273,6 +281,9 @@ static void dump_domains(unsigned char key)
dump_pageframe_info(d);
+ nodeset_print(tmpstr, sizeof(tmpstr), &d->node_affinity);
+ printk("NODE affinity for domain %d: %s\n", d->domain_id, tmpstr);
+
printk("VCPU information and callbacks for domain %u:\n",
d->domain_id);
for_each_vcpu ( d, v )
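For reference, nodeset_print() simply wraps the list produced by nodelist_scnprintf() in brackets, so the new dump line should look something like "NODE affinity for domain 3: [0-1]" (the exact list format depends on nodelist_scnprintf()). A trivial standalone approximation, not the Xen code:

/* Standalone approximation of nodeset_print() using snprintf(). */
#include <stdio.h>

static void nodeset_print_demo(char *set, int size, const char *nodelist)
{
    snprintf(set, size, "[%s]", nodelist);   /* same bracket layout as above */
}

int main(void)
{
    char buf[32];

    nodeset_print_demo(buf, sizeof(buf), "0-1");
    printf("NODE affinity for domain 3: %s\n", buf);
    return 0;
}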
diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
index be0b4b0c6c..318831054d 100644
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -261,17 +261,50 @@ __runq_remove(struct csched_vcpu *svc)
list_del_init(&svc->runq_elem);
}
+/*
+ * Translate a domain's node-affinity mask into a cpumask, so that it can be
+ * used during actual scheduling. The resulting cpumask contains all the cpus
+ * of all the nodes set in the original node-affinity mask.
+ *
+ * Note that any serialization needed to access the mask safely is entirely
+ * the responsibility of the caller of this function/hook.
+ */
+static void csched_set_node_affinity(
+ const struct scheduler *ops,
+ struct domain *d,
+ nodemask_t *mask)
+{
+ struct csched_dom *sdom;
+ int node;
+
+ /* Skip idle domain since it doesn't even have a node_affinity_cpumask */
+ if ( unlikely(is_idle_domain(d)) )
+ return;
+
+ sdom = CSCHED_DOM(d);
+ cpumask_clear(sdom->node_affinity_cpumask);
+ for_each_node_mask( node, *mask )
+ cpumask_or(sdom->node_affinity_cpumask, sdom->node_affinity_cpumask,
+ &node_to_cpumask(node));
+}
+
#define for_each_csched_balance_step(step) \
for ( (step) = 0; (step) <= CSCHED_BALANCE_CPU_AFFINITY; (step)++ )
/*
* vcpu-affinity balancing is always necessary and must never be skipped.
- * OTOH, if a domain's node-affinity spans all the nodes, we can safely
- * avoid dealing with node-affinity entirely.
+ * OTOH, if a domain's node-affinity is being computed automatically
+ * (or if it just spans all the nodes), we can safely avoid dealing with
+ * node-affinity entirely. Node-affinity is also deemed meaningless if it
+ * has an empty intersection with the vcpu's vcpu-affinity, as honouring it
+ * would mean trying to schedule the vcpu on _no_ pcpu!
*/
-#define __vcpu_has_node_affinity(vc) \
- ( !cpumask_full(CSCHED_DOM(vc->domain)->node_affinity_cpumask) )
+#define __vcpu_has_node_affinity(vc) \
+ ( !(cpumask_full(CSCHED_DOM(vc->domain)->node_affinity_cpumask) \
+ || !cpumask_intersects(vc->cpu_affinity, \
+ CSCHED_DOM(vc->domain)->node_affinity_cpumask) \
+ || vc->domain->auto_node_affinity == 1) )
/*
* Each csched-balance step uses its own cpumask. This function determines
@@ -284,8 +317,13 @@ static void
csched_balance_cpumask(const struct vcpu *vc, int step, cpumask_t *mask)
{
if ( step == CSCHED_BALANCE_NODE_AFFINITY )
+ {
cpumask_and(mask, CSCHED_DOM(vc->domain)->node_affinity_cpumask,
vc->cpu_affinity);
+
+ if ( unlikely(cpumask_empty(mask)) )
+ cpumask_copy(mask, vc->cpu_affinity);
+ }
else /* step == CSCHED_BALANCE_CPU_AFFINITY */
cpumask_copy(mask, vc->cpu_affinity);
}
@@ -1898,6 +1936,8 @@ const struct scheduler sched_credit_def = {
.adjust = csched_dom_cntl,
.adjust_global = csched_sys_cntl,
+ .set_node_affinity = csched_set_node_affinity,
+
.pick_cpu = csched_cpu_pick,
.do_schedule = csched_schedule,
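The interplay between __vcpu_has_node_affinity() and csched_balance_cpumask() boils down to: try the node-affinity cpus intersected with the vcpu-affinity first, and fall back to plain vcpu-affinity if that intersection is empty or meaningless. A standalone model with plain bitmasks (illustrative names only, not Xen code):

/* Standalone model of the two credit-scheduler balance steps. */
#include <stdio.h>

enum { BALANCE_NODE_AFFINITY = 0, BALANCE_CPU_AFFINITY = 1 };

static unsigned int balance_mask(int step,
                                 unsigned int node_affinity_cpus,
                                 unsigned int cpu_affinity)
{
    if ( step == BALANCE_NODE_AFFINITY )
    {
        unsigned int mask = node_affinity_cpus & cpu_affinity;

        /* Same fallback as csched_balance_cpumask(): an empty
         * intersection degenerates to plain vcpu-affinity. */
        return mask ? mask : cpu_affinity;
    }

    return cpu_affinity;  /* step == BALANCE_CPU_AFFINITY */
}

int main(void)
{
    unsigned int node_cpus = 0x0f;  /* cpus of the preferred node(s) */
    unsigned int vcpu_aff  = 0x3c;  /* cpus the vcpu may run on */

    printf("node step: 0x%x\n",
           balance_mask(BALANCE_NODE_AFFINITY, node_cpus, vcpu_aff)); /* 0xc  */
    printf("cpu  step: 0x%x\n",
           balance_mask(BALANCE_CPU_AFFINITY, node_cpus, vcpu_aff));  /* 0x3c */
    return 0;
}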
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 7364ff8a7d..c1cd3d0f15 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -638,6 +638,11 @@ int cpu_disable_scheduler(unsigned int cpu)
return ret;
}
+void sched_set_node_affinity(struct domain *d, nodemask_t *mask)
+{
+ SCHED_OP(DOM2OP(d), set_node_affinity, d, mask);
+}
+
int vcpu_set_affinity(struct vcpu *v, const cpumask_t *affinity)
{
cpumask_t online_affinity;
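Only credit1 implements the new set_node_affinity hook in this patch; SCHED_OP() skips hooks that are NULL, so schedulers that do not provide one simply keep ignoring node-affinity. A scheduler that wanted to react to changes would only need something along these lines (a sketch; my_sched_set_node_affinity and sched_example_def are invented names):

/* Hedged sketch: how another scheduler could wire up the new hook. */
static void my_sched_set_node_affinity(const struct scheduler *ops,
                                       struct domain *d, nodemask_t *mask)
{
    /* Recompute whatever per-domain placement state depends on the
     * domain's node-affinity; credit1 rebuilds node_affinity_cpumask here. */
}

const struct scheduler sched_example_def = {
    /* ... other mandatory hooks elided ... */
    .set_node_affinity = my_sched_set_node_affinity,
};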