aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDario Faggioli <dario.faggioli@citrix.com>2012-07-26 15:41:54 +0100
committerDario Faggioli <dario.faggioli@citrix.com>2012-07-26 15:41:54 +0100
commitf5e3add94b9455922f3374e61acd60ffddb6b5de (patch)
tree549afc56133d1112497e523f715f9750da188d7e
parente98fee8390936477d369b9832ff9bb06594510c7 (diff)
downloadxen-f5e3add94b9455922f3374e61acd60ffddb6b5de.tar.gz
xen-f5e3add94b9455922f3374e61acd60ffddb6b5de.tar.bz2
xen-f5e3add94b9455922f3374e61acd60ffddb6b5de.zip
libxl: have NUMA placement deal with cpupools
In such a way that only the cpus belonging to the cpupool of the domain being placed are considered for the placement itself. This is done by filtering out of the placement candidates all the nodes in which the cpupool does not have any cpu. After that ---as cpu pooling does not necessarily happen at NUMA node boundaries--- we also make sure only the actual cpus that are part of the pool are considered when counting how many processors a placement candidate provides. Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: Ian Campbell <ian.campbell@citrix.com> Acked-by: Ian Jackson <ian.jackson@eu.citrix.com> Committed-by: Ian Campbell <ian.campbell@citrix.com>
-rw-r--r--tools/libxl/libxl_dom.c40
-rw-r--r--tools/libxl/libxl_internal.h5
-rw-r--r--tools/libxl/libxl_numa.c88
3 files changed, 95 insertions, 38 deletions
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index e13fb49f7f..d749983ce7 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -128,26 +128,30 @@ static int numa_cmpf(const libxl__numa_candidate *c1,
}
/* The actual automatic NUMA placement routine */
-static int numa_place_domain(libxl__gc *gc, libxl_domain_build_info *info)
+static int numa_place_domain(libxl__gc *gc, uint32_t domid,
+ libxl_domain_build_info *info)
{
int found;
libxl__numa_candidate candidate;
libxl_bitmap candidate_nodemap;
- libxl_cpupoolinfo *pinfo;
- int nr_pools, rc = 0;
+ libxl_cpupoolinfo cpupool_info;
+ int i, cpupool, rc = 0;
uint32_t memkb;
libxl__numa_candidate_init(&candidate);
libxl_bitmap_init(&candidate_nodemap);
- /* First of all, if cpupools are in use, better not to mess with them */
- pinfo = libxl_list_cpupool(CTX, &nr_pools);
- if (!pinfo)
- return ERROR_FAIL;
- if (nr_pools > 1) {
- LOG(NOTICE, "Skipping NUMA placement as cpupools are in use");
- goto out;
- }
+ /*
+ * Extract the cpumap from the cpupool the domain belongs to. In fact,
+ * it only makes sense to consider the cpus/nodes that are in there
+ * for placement.
+ */
+ rc = cpupool = libxl__domain_cpupool(gc, domid);
+ if (rc < 0)
+ return rc;
+ rc = libxl_cpupool_info(CTX, &cpupool_info, cpupool);
+ if (rc)
+ return rc;
rc = libxl_domain_need_memory(CTX, info, &memkb);
if (rc)
@@ -159,7 +163,8 @@ static int numa_place_domain(libxl__gc *gc, libxl_domain_build_info *info)
/* Find the best candidate with enough free memory and at least
* as much pcpus as the domain has vcpus. */
- rc = libxl__get_numa_candidate(gc, memkb, info->max_vcpus, 0, 0,
+ rc = libxl__get_numa_candidate(gc, memkb, info->max_vcpus,
+ 0, 0, &cpupool_info.cpumap,
numa_cmpf, &candidate, &found);
if (rc)
goto out;
@@ -175,6 +180,13 @@ static int numa_place_domain(libxl__gc *gc, libxl_domain_build_info *info)
if (rc)
goto out;
+ /* Avoid trying to set the affinity to cpus that might be in the
+ * nodemap but not in our cpupool. */
+ libxl_for_each_set_bit(i, info->cpumap) {
+ if (!libxl_bitmap_test(&cpupool_info.cpumap, i))
+ libxl_bitmap_reset(&info->cpumap, i);
+ }
+
LOG(DETAIL, "NUMA placement candidate with %d nodes, %d cpus and "
"%"PRIu32" KB free selected", candidate.nr_nodes,
candidate.nr_cpus, candidate.free_memkb / 1024);
@@ -182,7 +194,7 @@ static int numa_place_domain(libxl__gc *gc, libxl_domain_build_info *info)
out:
libxl__numa_candidate_dispose(&candidate);
libxl_bitmap_dispose(&candidate_nodemap);
- libxl_cpupoolinfo_list_free(pinfo, nr_pools);
+ libxl_cpupoolinfo_dispose(&cpupool_info);
return rc;
}
@@ -214,7 +226,7 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
return ERROR_INVAL;
}
- rc = numa_place_domain(gc, info);
+ rc = numa_place_domain(gc, domid, info);
if (rc)
return rc;
}
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index fd1b0cedfa..4938510ec0 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2568,6 +2568,10 @@ typedef int (*libxl__numa_candidate_cmpf)(const libxl__numa_candidate *c1,
* other hand, if not even one single candidate can be found, the function
* still returns successfully but cndt_found will be zero.
*
+ * Finally, suitable_cpumap restricts placement to the cpus set in that
+ * mask: only those cpus are considered when generating placement
+ * candidates (for example because of cpupools).
+ *
* It is up to the function to properly allocate cndt_out (by calling
* libxl__numa_candidate_alloc()), while it is the caller that should init
* (libxl__numa_candidate_init()) and free (libxl__numa_candidate_dispose())
@@ -2576,6 +2580,7 @@ typedef int (*libxl__numa_candidate_cmpf)(const libxl__numa_candidate *c1,
_hidden int libxl__get_numa_candidate(libxl__gc *gc,
uint32_t min_free_memkb, int min_cpus,
int min_nodes, int max_nodes,
+ const libxl_bitmap *suitable_cpumap,
libxl__numa_candidate_cmpf numa_cmpf,
libxl__numa_candidate *cndt_out,
int *cndt_found);
diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c
index d95ed2af09..5301ec4f0c 100644
--- a/tools/libxl/libxl_numa.c
+++ b/tools/libxl/libxl_numa.c
@@ -105,30 +105,48 @@ static int comb_next(comb_iter_t it, int n, int k)
/* NUMA automatic placement (see libxl_internal.h for details) */
/*
- * This function turns a k-combination iterator into a node map.
- * This means the bits in the node map corresponding to the indexes
- * of the given combination are the ones that will be set.
- * For example, if the iterator represents the combination { 0, 2, 4},
- * the node map will have bits #0, #2 and #4 set.
+ * This function turns a k-combination iterator into a node map,
+ * given another map, telling us which nodes should be considered.
+ *
+ * This means the bits that are set in suitable_nodemap and that
+ * correspond to the indexes of the given combination are the ones
+ * that will be set in nodemap.
+ *
+ * For example, given a fully set suitable_nodemap, if the iterator
+ * represents the combination { 0, 2, 4}, nodemap will have bits #0,
+ * #2 and #4 set.
+ * On the other hand, if, say, suitable_nodemap=01011011 (bit #0 leftmost),
+ * the same iterator will cause bits #1, #4 and #7 of nodemap to be set.
*/
-static void comb_get_nodemap(comb_iter_t it, libxl_bitmap *nodemap, int k)
+static void comb_get_nodemap(comb_iter_t it, libxl_bitmap *suitable_nodemap,
+ libxl_bitmap *nodemap, int k)
{
- int i;
+ int i, m = 0, n = 0;
libxl_bitmap_set_none(nodemap);
- for (i = 0; i < k; i++)
- libxl_bitmap_set(nodemap, it[i]);
+ libxl_for_each_set_bit(i, *suitable_nodemap) {
+ /* Check whether the n-th set bit of suitable_nodemap
+ * matches with the m-th element of the iterator (and,
+ * only if it does, advance to the next one) */
+ if (m < k && n == it[m]) {
+ libxl_bitmap_set(nodemap, i);
+ m++;
+ }
+ n++;
+ }
}
/* Retrieve the number of cpus that the nodes that are part of the nodemap
- * span. */
+ * span and are also set in suitable_cpumap. */
static int nodemap_to_nr_cpus(libxl_cputopology *tinfo, int nr_cpus,
+ const libxl_bitmap *suitable_cpumap,
const libxl_bitmap *nodemap)
{
int i, nodes_cpus = 0;
for (i = 0; i < nr_cpus; i++) {
- if (libxl_bitmap_test(nodemap, tinfo[i].node))
+ if (libxl_bitmap_test(suitable_cpumap, i) &&
+ libxl_bitmap_test(nodemap, tinfo[i].node))
nodes_cpus++;
}
return nodes_cpus;
@@ -242,6 +260,7 @@ static int count_cpus_per_node(libxl_cputopology *tinfo, int nr_cpus,
int libxl__get_numa_candidate(libxl__gc *gc,
uint32_t min_free_memkb, int min_cpus,
int min_nodes, int max_nodes,
+ const libxl_bitmap *suitable_cpumap,
libxl__numa_candidate_cmpf numa_cmpf,
libxl__numa_candidate *cndt_out,
int *cndt_found)
@@ -249,11 +268,12 @@ int libxl__get_numa_candidate(libxl__gc *gc,
libxl__numa_candidate new_cndt;
libxl_cputopology *tinfo = NULL;
libxl_numainfo *ninfo = NULL;
- int nr_nodes = 0, nr_cpus = 0;
- libxl_bitmap nodemap;
+ int nr_nodes = 0, nr_suit_nodes, nr_cpus = 0;
+ libxl_bitmap suitable_nodemap, nodemap;
int rc = 0;
libxl_bitmap_init(&nodemap);
+ libxl_bitmap_init(&suitable_nodemap);
libxl__numa_candidate_init(&new_cndt);
/* Get platform info and prepare the map for testing the combinations */
@@ -300,6 +320,15 @@ int libxl__get_numa_candidate(libxl__gc *gc,
if (rc)
goto out;
+ /* Allocate and prepare the map of the nodes that can be used for
+ * placement, based on the map of suitable cpus. */
+ rc = libxl_node_bitmap_alloc(CTX, &suitable_nodemap, 0);
+ if (rc)
+ goto out;
+ rc = libxl_cpumap_to_nodemap(CTX, suitable_cpumap, &suitable_nodemap);
+ if (rc)
+ goto out;
+
/*
* If the minimum number of NUMA nodes is not explicitly specified
* (i.e., min_nodes == 0), we try to figure out a sensible number of nodes
@@ -317,10 +346,14 @@ int libxl__get_numa_candidate(libxl__gc *gc,
else
min_nodes = (min_cpus + cpus_per_node - 1) / cpus_per_node;
}
- if (min_nodes > nr_nodes)
- min_nodes = nr_nodes;
- if (!max_nodes || max_nodes > nr_nodes)
- max_nodes = nr_nodes;
+ /* We also need to be sure we do not exceed the number of
+ * nodes we are allowed to use. */
+ nr_suit_nodes = libxl_bitmap_count_set(&suitable_nodemap);
+
+ if (min_nodes > nr_suit_nodes)
+ min_nodes = nr_suit_nodes;
+ if (!max_nodes || max_nodes > nr_suit_nodes)
+ max_nodes = nr_suit_nodes;
if (min_nodes > max_nodes) {
LOG(ERROR, "Inconsistent minimum or maximum number of guest nodes");
rc = ERROR_INVAL;
@@ -353,12 +386,16 @@ int libxl__get_numa_candidate(libxl__gc *gc,
* amount of free memory and number of cpus) and it can concur to
* become our best placement iff it passes the check.
*/
- for (comb_ok = comb_init(gc, &comb_iter, nr_nodes, min_nodes); comb_ok;
- comb_ok = comb_next(comb_iter, nr_nodes, min_nodes)) {
+ for (comb_ok = comb_init(gc, &comb_iter, nr_suit_nodes, min_nodes);
+ comb_ok;
+ comb_ok = comb_next(comb_iter, nr_suit_nodes, min_nodes)) {
uint32_t nodes_free_memkb;
int nodes_cpus;
- comb_get_nodemap(comb_iter, &nodemap, min_nodes);
+ /* Get the nodemap for the combination, only considering
+ * suitable nodes. */
+ comb_get_nodemap(comb_iter, &suitable_nodemap,
+ &nodemap, min_nodes);
/* If there is not enough memory in this combination, skip it
* and go generating the next one... */
@@ -367,7 +404,8 @@ int libxl__get_numa_candidate(libxl__gc *gc,
continue;
/* And the same applies if this combination is short in cpus */
- nodes_cpus = nodemap_to_nr_cpus(tinfo, nr_cpus, &nodemap);
+ nodes_cpus = nodemap_to_nr_cpus(tinfo, nr_cpus, suitable_cpumap,
+ &nodemap);
if (min_cpus && nodes_cpus < min_cpus)
continue;
@@ -378,7 +416,7 @@ int libxl__get_numa_candidate(libxl__gc *gc,
libxl__numa_candidate_put_nodemap(gc, &new_cndt, &nodemap);
new_cndt.nr_vcpus = nodemap_to_nr_vcpus(gc, tinfo, &nodemap);
new_cndt.free_memkb = nodes_free_memkb;
- new_cndt.nr_nodes = min_nodes;
+ new_cndt.nr_nodes = libxl_bitmap_count_set(&nodemap);
new_cndt.nr_cpus = nodes_cpus;
/*
@@ -392,8 +430,9 @@ int libxl__get_numa_candidate(libxl__gc *gc,
LOG(DEBUG, "New best NUMA placement candidate found: "
"nr_nodes=%d, nr_cpus=%d, nr_vcpus=%d, "
- "free_memkb=%"PRIu32"", min_nodes, new_cndt.nr_cpus,
- new_cndt.nr_vcpus, new_cndt.free_memkb / 1024);
+ "free_memkb=%"PRIu32"", new_cndt.nr_nodes,
+ new_cndt.nr_cpus, new_cndt.nr_vcpus,
+ new_cndt.free_memkb / 1024);
libxl__numa_candidate_put_nodemap(gc, cndt_out, &nodemap);
cndt_out->nr_vcpus = new_cndt.nr_vcpus;
@@ -413,6 +452,7 @@ int libxl__get_numa_candidate(libxl__gc *gc,
out:
libxl_bitmap_dispose(&nodemap);
+ libxl_bitmap_dispose(&suitable_nodemap);
libxl__numa_candidate_dispose(&new_cndt);
libxl_numainfo_list_free(ninfo, nr_nodes);
libxl_cputopology_list_free(tinfo, nr_cpus);