libxl: enable automatic placement of guests on NUMA nodes

If a domain does not have a VCPU affinity, try to pin it automatically to some PCPUs. This is done taking into account the NUMA characteristics of the host. In fact, we look for a combination of host's NUMA nodes with enough free memory and number of PCPUs for the new domain, and pin it to the PCPUs of those nodes. Deciding which placement is the best happens by means of some heuristics. For instance, smaller candidates are better, both from a domain perspective (less memory spreading among nodes) and from the entire system perspective (smaller memory fragmentation). In case of candidates of equal sizes (i.e., with the same number of nodes), the amount of free memory and the number of domains' vCPUs already pinned to the candidates' nodes are both considered. Very often, candidates with a greater amount of memory are the ones we want, as this is good for keeping memory fragmentation under control. However, we do not want to overcommit some node too much, just because it has a lot of memory, and that's why the number of vCPUs must be accounted for. This all happens internally to libxl, and no API for driving the mechanism is provided for now. This matches what xend already does. Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> Acked-by: Ian Jackson <ian.jackson@eu.citrix.com> Tested-by: Andre Przywara <andre.przywara@amd.com> Committed-by: Ian Campbell <ian.campbell@citrix.com>
author: Dario Faggioli <dario.faggioli@citrix.com> 2012-07-26 15:41:54 +0100
committer: Dario Faggioli <dario.faggioli@citrix.com> 2012-07-26 15:41:54 +0100
commit: e98fee8390936477d369b9832ff9bb06594510c7 (patch)
tree: 485737b4c7bd3a9933296f3f52e6593f7b4f7570 /tools/libxl/libxl_dom.c
parent: 47ce0e3550a2d1ce8407e3643ceb595657ac4d79 (diff)
download: xen-e98fee8390936477d369b9832ff9bb06594510c7.tar.gz
xen-e98fee8390936477d369b9832ff9bb06594510c7.tar.bz2
xen-e98fee8390936477d369b9832ff9bb06594510c7.zip
1 files changed, 112 insertions, 0 deletions
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index bd4c9b448d..e13fb49f7f 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -98,6 +98,94 @@ out:
     return sched;
 }
 
+/*
+ * Two NUMA placement candidates are compared by means of the following
+ * heuristics:
+ *
+ *  - the number of vcpus runnable on the candidates is considered, and
+ *    candidates with fewer of them are preferred. If two candidates have
+ *    the same number of runnable vcpus,
+ *  - the amount of free memory in the candidates is considered, and the
+ *    candidate with the greater amount of it is preferred.
+ *
+ * Leaving larger memory holes maximizes the probability of being able to
+ * put other domains on the node. That hopefully means many domains will
+ * benefit from local memory accesses, but also risks overloading large
+ * (from a memory POV) nodes: exactly the effect that counting the vcpus
+ * able to run on the nodes tries to prevent. The number of nodes each
+ * candidate spans is ignored, as that is already handled by the algorithm.
+ *
+ * Returns <0 if c1 is the better candidate, >0 if c2 is, 0 on a tie. Values
+ * are compared, not subtracted: an unsigned difference can wrap as an int.
+ */
+static int numa_cmpf(const libxl__numa_candidate *c1,
+                     const libxl__numa_candidate *c2)
+{
+    if (c1->nr_vcpus != c2->nr_vcpus)
+        return c1->nr_vcpus < c2->nr_vcpus ? -1 : 1;
+    return (c1->free_memkb < c2->free_memkb) -
+           (c1->free_memkb > c2->free_memkb);
+}
+
+/*
+ * The actual automatic NUMA placement routine. If a suitable candidate is
+ * found, info->cpumap is rewritten from its node map; otherwise (or when
+ * cpupools are in use) it is left alone. Returns 0 unless a step fails.
+ */
+static int numa_place_domain(libxl__gc *gc, libxl_domain_build_info *info)
+{
+    int found, nr_pools, rc = 0;
+    libxl__numa_candidate candidate;
+    libxl_bitmap candidate_nodemap;
+    libxl_cpupoolinfo *pinfo;
+    uint32_t memkb;
+
+    libxl__numa_candidate_init(&candidate);
+    libxl_bitmap_init(&candidate_nodemap);
+
+    /* First of all, if cpupools are in use, better not to mess with them */
+    pinfo = libxl_list_cpupool(CTX, &nr_pools);
+    if (!pinfo)
+        return ERROR_FAIL; /* no list to free; only *_init() has run so far */
+    if (nr_pools > 1) {
+        LOG(NOTICE, "Skipping NUMA placement as cpupools are in use");
+        goto out;
+    }
+
+    rc = libxl_domain_need_memory(CTX, info, &memkb);
+    if (rc)
+        goto out;
+    if (libxl_node_bitmap_alloc(CTX, &candidate_nodemap, 0)) {
+        rc = ERROR_FAIL;
+        goto out;
+    }
+
+    /* Best candidate with enough free memory and pcpus >= domain's vcpus */
+    rc = libxl__get_numa_candidate(gc, memkb, info->max_vcpus, 0, 0,
+                                   numa_cmpf, &candidate, &found);
+    if (rc)
+        goto out;
+    /* No suitable candidate: leave info->cpumap (all nodes/cpus) alone */
+    if (found == 0)
+        goto out;
+
+    /* Map the candidate's node map to the domain's info->cpumap */
+    libxl__numa_candidate_get_nodemap(gc, &candidate, &candidate_nodemap);
+    rc = libxl_nodemap_to_cpumap(CTX, &candidate_nodemap, &info->cpumap);
+    if (rc)
+        goto out;
+
+    LOG(DETAIL, "NUMA placement candidate with %d nodes, %d cpus and "
+                "%"PRIu32" MB free selected", candidate.nr_nodes,
+                candidate.nr_cpus, candidate.free_memkb / 1024); /* KB -> MB */
+
+ out:
+    libxl__numa_candidate_dispose(&candidate);
+    libxl_bitmap_dispose(&candidate_nodemap);
+    libxl_cpupoolinfo_list_free(pinfo, nr_pools);
+    return rc;
+}
+
 int libxl__build_pre(libxl__gc *gc, uint32_t domid,
               libxl_domain_build_info *info, libxl__domain_build_state *state)
 {
@@ -107,7 +195,31 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
     uint32_t rtc_timeoffset;
 
     xc_domain_max_vcpus(ctx->xch, domid, info->max_vcpus);
+
+    /*
+     * Check if the domain has any CPU affinity. If not, try to build
+     * up one. In case numa_place_domain() finds at least a suitable
+     * candidate, it will affect info->cpumap accordingly; if it
+     * does not, it just leaves it as it is. This means (unless
+     * some weird error manifests) the subsequent call to
+     * libxl_set_vcpuaffinity_all() will do the actual placement,
+     * whatever that turns out to be.
+     */
+    if (libxl_defbool_val(info->numa_placement)) {
+        int rc;
+
+        if (!libxl_bitmap_is_full(&info->cpumap)) {
+            LOG(ERROR, "Can run NUMA placement only if no vcpu "
+                       "affinity is specified");
+            return ERROR_INVAL;
+        }
+
+        rc = numa_place_domain(gc, info);
+        if (rc)
+            return rc;
+    }
     libxl_set_vcpuaffinity_all(ctx, domid, info->max_vcpus, &info->cpumap);
+
     xc_domain_setmaxmem(ctx->xch, domid, info->target_memkb + LIBXL_MAXMEM_CONSTANT);
     if (info->type == LIBXL_DOMAIN_TYPE_PV)
         xc_domain_set_memmap_limit(ctx->xch, domid,
author	Dario Faggioli <dario.faggioli@citrix.com>	2012-07-26 15:41:54 +0100
committer	Dario Faggioli <dario.faggioli@citrix.com>	2012-07-26 15:41:54 +0100
commit	e98fee8390936477d369b9832ff9bb06594510c7 (patch)
tree	485737b4c7bd3a9933296f3f52e6593f7b4f7570 /tools/libxl/libxl_dom.c
parent	47ce0e3550a2d1ce8407e3643ceb595657ac4d79 (diff)
download	xen-e98fee8390936477d369b9832ff9bb06594510c7.tar.gz xen-e98fee8390936477d369b9832ff9bb06594510c7.tar.bz2 xen-e98fee8390936477d369b9832ff9bb06594510c7.zip