aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--docs/man/xl.cfg.pod.510
-rw-r--r--tools/libxl/Makefile2
-rw-r--r--tools/libxl/libxl_create.c2
-rw-r--r--tools/libxl/libxl_dom.c112
-rw-r--r--tools/libxl/libxl_internal.h117
-rw-r--r--tools/libxl/libxl_numa.c428
-rw-r--r--tools/libxl/libxl_types.idl1
-rw-r--r--tools/libxl/xl_cmdimpl.c5
8 files changed, 674 insertions, 3 deletions
diff --git a/docs/man/xl.cfg.pod.5 b/docs/man/xl.cfg.pod.5
index 314a03e339..8aaff1fec9 100644
--- a/docs/man/xl.cfg.pod.5
+++ b/docs/man/xl.cfg.pod.5
@@ -111,8 +111,8 @@ created online and the remainder will be offline.
=item B<cpus="CPU-LIST">
-List of which cpus the guest is allowed to use. Default behavior is
-`all cpus`. A C<CPU-LIST> may be specified as follows:
+List of which cpus the guest is allowed to use. By default xl will pick
+some cpus on its own (see below). A C<CPU-LIST> may be specified as follows:
=over 4
@@ -132,6 +132,12 @@ run on cpu #3 of the host.
=back
+If this option is not specified, libxl automatically tries to place the new
+domain on the host's NUMA nodes (provided the host has more than one NUMA
+node) by pinning it to the cpus of those nodes. A heuristic approach is
+utilized with the goals of maximizing performance for the domain and, at
+the same time, achieving efficient utilization of the host's CPUs and RAM.
+
=item B<cpu_weight=WEIGHT>
A domain with a weight of 512 will get twice as much CPU as a domain
diff --git a/tools/libxl/Makefile b/tools/libxl/Makefile
index 48f352eb4e..424a7ee755 100644
--- a/tools/libxl/Makefile
+++ b/tools/libxl/Makefile
@@ -66,7 +66,7 @@ LIBXL_LIBS += -lyajl
LIBXL_OBJS = flexarray.o libxl.o libxl_create.o libxl_dm.o libxl_pci.o \
libxl_dom.o libxl_exec.o libxl_xshelp.o libxl_device.o \
libxl_internal.o libxl_utils.o libxl_uuid.o \
- libxl_json.o libxl_aoutils.o \
+ libxl_json.o libxl_aoutils.o libxl_numa.o \
libxl_save_callout.o _libxl_save_msgs_callout.o \
libxl_qmp.o libxl_event.o libxl_fork.o $(LIBXL_OBJS-y)
LIBXL_OBJS += _libxl_types.o libxl_flask.o _libxl_types_internal.o
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index 0c81a60972..3c950bab9c 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -215,6 +215,8 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc,
libxl_bitmap_set_any(&b_info->cpumap);
}
+ libxl_defbool_setdefault(&b_info->numa_placement, true);
+
if (b_info->max_memkb == LIBXL_MEMKB_DEFAULT)
b_info->max_memkb = 32 * 1024;
if (b_info->target_memkb == LIBXL_MEMKB_DEFAULT)
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index bd4c9b448d..e13fb49f7f 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -98,6 +98,94 @@ out:
return sched;
}
+/*
+ * Two NUMA placement candidates are compared by means of the following
+ * heuristics:
+
+ * - the number of vcpus runnable on the candidates is considered, and
+ *    candidates with fewer of them are preferred. If two candidates have
+ * the same number of runnable vcpus,
+ * - the amount of free memory in the candidates is considered, and the
+ * candidate with greater amount of it is preferred.
+ *
+ * In fact, leaving larger memory holes, maximizes the probability of being
+ * able to put other domains on the node. That hopefully means many domains
+ * will benefit from local memory accesses, but also introduces the risk of
+ * overloading large (from a memory POV) nodes. That's exactly the effect
+ * that counting the vcpus able to run on the nodes tries to prevent.
+ *
+ * Note that this completely ignores the number of nodes each candidate spans,
+ * as the fact that fewer nodes is better is already accounted for in the
+ * algorithm.
+ */
+static int numa_cmpf(const libxl__numa_candidate *c1,
+ const libxl__numa_candidate *c2)
+{
+ if (c1->nr_vcpus != c2->nr_vcpus)
+ return c1->nr_vcpus - c2->nr_vcpus;
+
+ return c2->free_memkb - c1->free_memkb;
+}
+
+/* The actual automatic NUMA placement routine */
+static int numa_place_domain(libxl__gc *gc, libxl_domain_build_info *info)
+{
+ int found;
+ libxl__numa_candidate candidate;
+ libxl_bitmap candidate_nodemap;
+ libxl_cpupoolinfo *pinfo;
+ int nr_pools, rc = 0;
+ uint32_t memkb;
+
+ libxl__numa_candidate_init(&candidate);
+ libxl_bitmap_init(&candidate_nodemap);
+
+ /* First of all, if cpupools are in use, better not to mess with them */
+ pinfo = libxl_list_cpupool(CTX, &nr_pools);
+ if (!pinfo)
+ return ERROR_FAIL;
+ if (nr_pools > 1) {
+ LOG(NOTICE, "Skipping NUMA placement as cpupools are in use");
+ goto out;
+ }
+
+ rc = libxl_domain_need_memory(CTX, info, &memkb);
+ if (rc)
+ goto out;
+ if (libxl_node_bitmap_alloc(CTX, &candidate_nodemap, 0)) {
+ rc = ERROR_FAIL;
+ goto out;
+ }
+
+ /* Find the best candidate with enough free memory and at least
+ * as much pcpus as the domain has vcpus. */
+ rc = libxl__get_numa_candidate(gc, memkb, info->max_vcpus, 0, 0,
+ numa_cmpf, &candidate, &found);
+ if (rc)
+ goto out;
+
+    /* Not even one suitable placement candidate! Let's just not touch the
+ * domain's info->cpumap. It will have affinity with all nodes/cpus. */
+ if (found == 0)
+ goto out;
+
+ /* Map the candidate's node map to the domain's info->cpumap */
+ libxl__numa_candidate_get_nodemap(gc, &candidate, &candidate_nodemap);
+ rc = libxl_nodemap_to_cpumap(CTX, &candidate_nodemap, &info->cpumap);
+ if (rc)
+ goto out;
+
+ LOG(DETAIL, "NUMA placement candidate with %d nodes, %d cpus and "
+ "%"PRIu32" KB free selected", candidate.nr_nodes,
+ candidate.nr_cpus, candidate.free_memkb / 1024);
+
+ out:
+ libxl__numa_candidate_dispose(&candidate);
+ libxl_bitmap_dispose(&candidate_nodemap);
+ libxl_cpupoolinfo_list_free(pinfo, nr_pools);
+ return rc;
+}
+
int libxl__build_pre(libxl__gc *gc, uint32_t domid,
libxl_domain_build_info *info, libxl__domain_build_state *state)
{
@@ -107,7 +195,31 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
uint32_t rtc_timeoffset;
xc_domain_max_vcpus(ctx->xch, domid, info->max_vcpus);
+
+ /*
+ * Check if the domain has any CPU affinity. If not, try to build
+     * up one. In case numa_place_domain() finds at least one suitable
+ * candidate, it will affect info->cpumap accordingly; if it
+ * does not, it just leaves it as it is. This means (unless
+ * some weird error manifests) the subsequent call to
+ * libxl_set_vcpuaffinity_all() will do the actual placement,
+ * whatever that turns out to be.
+ */
+ if (libxl_defbool_val(info->numa_placement)) {
+ int rc;
+
+ if (!libxl_bitmap_is_full(&info->cpumap)) {
+ LOG(ERROR, "Can run NUMA placement only if no vcpu "
+ "affinity is specified");
+ return ERROR_INVAL;
+ }
+
+ rc = numa_place_domain(gc, info);
+ if (rc)
+ return rc;
+ }
libxl_set_vcpuaffinity_all(ctx, domid, info->max_vcpus, &info->cpumap);
+
xc_domain_setmaxmem(ctx->xch, domid, info->target_memkb + LIBXL_MAXMEM_CONSTANT);
if (info->type == LIBXL_DOMAIN_TYPE_PV)
xc_domain_set_memmap_limit(ctx->xch, domid,
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 1b9b417664..fd1b0cedfa 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2497,6 +2497,123 @@ static inline void libxl__ctx_unlock(libxl_ctx *ctx) {
#define CTX_LOCK (libxl__ctx_lock(CTX))
#define CTX_UNLOCK (libxl__ctx_unlock(CTX))
+/*
+ * Automatic NUMA placement
+ *
+ * These functions and data structures deal with the initial placement of a
+ * domain onto the host NUMA nodes.
+ *
+ * The key concept here is the one of "NUMA placement candidate", which is
+ * basically a set of nodes whose characteristics have been successfully
+ * checked against some specific requirements. More precisely, a candidate
+ * is the nodemap associated with one of the possible subset of the host
+ * NUMA nodes providing a certain amount of free memory, or a given number
+ * of cpus, or even both (depending in what the caller wants). For
+ * convenience of use, some of this information is stored within the
+ * candidate itself, instead of always being dynamically computed. A single
+ * node can be a valid placement candidate, as well as it is possible for a
+ * candidate to contain all the nodes of the host. The fewer nodes there
+ * are in a candidate, the better performance a domain placed onto it
+ * should get (at least from a NUMA point of view). For instance, looking
+ * for a NUMA candidate with 2GB of free memory means we want the subsets
+ * of the host NUMA nodes with, cumulatively, at least 2GB of free memory.
+ * This condition can be satisfied by just one particular node, or it may
+ * require more nodes, depending on the characteristics of the host, on how
+ * many domains have been created already, on how big they are, etc.
+ *
+ * The intended usage is as follows:
+ * 1. first of all, call libxl__get_numa_candidates(), and specify the
+ *     proper constraints to it (e.g., the amount of memory a domain needs
+ * as the minimum amount of free memory for the candidates). If a
+ * candidate comparison function is provided, the candidate with fewer
+ *    nodes that is found to be best according to what such function says
+ * is returned. If no comparison function is passed, the very first
+ * candidate is.
+ * 2. The chosen candidate's nodemap should be utilized for computing the
+ * actual affinity of the domain which, given the current NUMA support
+ * in the hypervisor, is what determines the placement of the domain's
+ * vcpus and memory.
+ */
+
+typedef struct {
+ int nr_cpus, nr_nodes;
+ int nr_vcpus;
+ uint32_t free_memkb;
+ libxl_bitmap nodemap;
+} libxl__numa_candidate;
+
+/* Signature for the comparison function between two candidates */
+typedef int (*libxl__numa_candidate_cmpf)(const libxl__numa_candidate *c1,
+ const libxl__numa_candidate *c2);
+
+/*
+ * This looks for the best NUMA placement candidate satisfying some
+ * specific conditions. If min_nodes and/or max_nodes are not 0, their
+ * value is used to determine the minimum and maximum number of nodes the
+ * candidate can have. If they are 0, it means the candidate can contain
+ * from 1 node (min_nodes=0) to the total number of nodes of the host
+ * (max_nodes=0). If min_free_memkb and/or min_cpus are not 0, the caller
+ * only wants candidates with at least the amount of free memory and the
+ * number of cpus they specify, respectively. If they are 0, the
+ * candidates' free memory and/or number of cpus won't be checked at all.
+ *
+ * Candidates are compared among each others by calling numa_cmpf(), which
+ * is where the heuristics for determining which candidate is the best
+ * one is actually implemented. The only bit of it that is hardcoded in
+ * this function is the fact that candidates with fewer nodes are always
+ * preferable.
+ *
+ * If at least one suitable candidate is found, it is returned in cndt_out,
+ * cndt_found is set to one, and the function returns successfully. On the
+ * other hand, if not even one single candidate can be found, the function
+ * still returns successfully but cndt_found will be zero.
+ *
+ * It is up to the function to properly allocate cndt_out (by calling
+ * libxl__numa_candidate_alloc()), while it is the caller that should init
+ * (libxl__numa_candidate_init()) and free (libxl__numa_candidate_dispose())
+ * it.
+ */
+_hidden int libxl__get_numa_candidate(libxl__gc *gc,
+ uint32_t min_free_memkb, int min_cpus,
+ int min_nodes, int max_nodes,
+ libxl__numa_candidate_cmpf numa_cmpf,
+ libxl__numa_candidate *cndt_out,
+ int *cndt_found);
+
+/* Initialization, allocation and deallocation for placement candidates */
+static inline void libxl__numa_candidate_init(libxl__numa_candidate *cndt)
+{
+ cndt->free_memkb = 0;
+ cndt->nr_cpus = cndt->nr_nodes = cndt->nr_vcpus = 0;
+ libxl_bitmap_init(&cndt->nodemap);
+}
+
+static inline int libxl__numa_candidate_alloc(libxl__gc *gc,
+ libxl__numa_candidate *cndt)
+{
+ return libxl_node_bitmap_alloc(CTX, &cndt->nodemap, 0);
+}
+static inline void libxl__numa_candidate_dispose(libxl__numa_candidate *cndt)
+{
+ libxl_bitmap_dispose(&cndt->nodemap);
+}
+
+/* Retrieve (in nodemap) the node map associated to placement candidate cndt */
+static inline
+void libxl__numa_candidate_get_nodemap(libxl__gc *gc,
+ const libxl__numa_candidate *cndt,
+ libxl_bitmap *nodemap)
+{
+ libxl_bitmap_copy(CTX, nodemap, &cndt->nodemap);
+}
+/* Set the node map of placement candidate cndt to match nodemap */
+static inline
+void libxl__numa_candidate_put_nodemap(libxl__gc *gc,
+ libxl__numa_candidate *cndt,
+ const libxl_bitmap *nodemap)
+{
+ libxl_bitmap_copy(CTX, &cndt->nodemap, nodemap);
+}
/*
* Inserts "elm_new" into the sorted list "head".
diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c
new file mode 100644
index 0000000000..d95ed2af09
--- /dev/null
+++ b/tools/libxl/libxl_numa.c
@@ -0,0 +1,428 @@
+/*
+ * Copyright (C) 2012 Citrix Ltd.
+ * Author Dario Faggioli <dario.faggioli@citrix.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#include <glob.h>
+
+#include "libxl_internal.h"
+
+/*
+ * What follows are helpers for generating all the k-combinations
+ * without repetitions of a set S with n elements in it. Formally
+ * speaking, they are subsets of k distinct elements of S and, if
+ * S is n elements big, the number of k-combinations is equal to the
+ * binomial coefficient C(n k)=n!/(k! * (n - k)!).
+ *
+ * The various subsets are generated one after the other by calling
+ * comb_init() first, and, after that, comb_next()
+ * C(n k)-1 times. An iterator is used to store the current status
+ * of the whole generation operation (i.e., basically, the last
+ * combination that has been generated). As soon as all combinations
+ * have been generated, comb_next() will start returning 0 instead of
+ * 1. The same instance of the iterator and the same values for
+ * n and k _must_ be used for each call (if that doesn't happen, the
+ * result is unspecified).
+ *
+ * The algorithm is a well known one (see, for example, D. Knuth's "The
+ * Art of Computer Programming - Volume 4, Fascicle 3" and it produces
+ * the combinations in such a way that they (well, more precisely, their
+ * indexes in the array/map representing the set) come with lexicographic
+ * ordering.
+ *
+ * For example, with n = 5 and k = 3, calling comb_init()
+ * will generate { 0, 1, 2 }, while subsequent valid calls to
+ * comb_next() will produce the following:
+ * { { 0, 1, 3 }, { 0, 1, 4 },
+ * { 0, 2, 3 }, { 0, 2, 4 }, { 0, 3, 4 },
+ * { 1, 2, 3 }, { 1, 2, 4 }, { 1, 3, 4 },
+ * { 2, 3, 4 } }
+ *
+ * This is used by the automatic NUMA placement logic below.
+ */
+typedef int* comb_iter_t;
+
+static int comb_init(libxl__gc *gc, comb_iter_t *it, int n, int k)
+{
+ comb_iter_t new_iter;
+ int i;
+
+ if (n < k)
+ return 0;
+
+ /* First set is always { 0, 1, 2, ..., k-1 } */
+ GCNEW_ARRAY(new_iter, k);
+ for (i = 0; i < k; i++)
+ new_iter[i] = i;
+
+ *it = new_iter;
+ return 1;
+}
+
+static int comb_next(comb_iter_t it, int n, int k)
+{
+ int i;
+
+ /*
+ * The idea here is to find the leftmost element from where
+ * we should start incrementing the indexes of the iterator.
+ * This means looking for the highest index that can be increased
+ * while still producing value smaller than n-1. In the example
+ * above, when dealing with { 0, 1, 4 }, such an element is the
+ * second one, as the third is already equal to 4 (which actually
+ * is n-1).
+ * Once we found from where to start, we increment that element
+ * and override the right-hand rest of the iterator with its
+ * successors, thus achieving lexicographic ordering.
+ *
+ * Regarding the termination of the generation process, when we
+     * manage to bring n-k to the very first position of the iterator,
+ * we know that is the last valid combination ( { 2, 3, 4 }, with
+ * n - k = 5 - 2 = 2, in the example above), and thus we start
+ * returning 0 as soon as we cross that border.
+ */
+ for (i = k - 1; it[i] == n - k + i; i--) {
+ if (i <= 0)
+ return 0;
+ }
+ for (it[i]++, i++; i < k; i++)
+ it[i] = it[i - 1] + 1;
+ return 1;
+}
+
+/* NUMA automatic placement (see libxl_internal.h for details) */
+
+/*
+ * This function turns a k-combination iterator into a node map.
+ * This means the bits in the node map corresponding to the indexes
+ * of the given combination are the ones that will be set.
+ * For example, if the iterator represents the combination { 0, 2, 4},
+ * the node map will have bits #0, #2 and #4 set.
+ */
+static void comb_get_nodemap(comb_iter_t it, libxl_bitmap *nodemap, int k)
+{
+ int i;
+
+ libxl_bitmap_set_none(nodemap);
+ for (i = 0; i < k; i++)
+ libxl_bitmap_set(nodemap, it[i]);
+}
+
+/* Retrieve the number of cpus that the nodes that are part of the nodemap
+ * span. */
+static int nodemap_to_nr_cpus(libxl_cputopology *tinfo, int nr_cpus,
+ const libxl_bitmap *nodemap)
+{
+ int i, nodes_cpus = 0;
+
+ for (i = 0; i < nr_cpus; i++) {
+ if (libxl_bitmap_test(nodemap, tinfo[i].node))
+ nodes_cpus++;
+ }
+ return nodes_cpus;
+}
+
+/* Retrieve the amount of free memory within the nodemap */
+static uint32_t nodemap_to_free_memkb(libxl_numainfo *ninfo,
+ libxl_bitmap *nodemap)
+{
+ uint32_t free_memkb = 0;
+ int i;
+
+ libxl_for_each_set_bit(i, *nodemap)
+ free_memkb += ninfo[i].free / 1024;
+
+ return free_memkb;
+}
+
+/* Retrieve the number of vcpus able to run on the cpus of the nodes
+ * that are part of the nodemap. */
+static int nodemap_to_nr_vcpus(libxl__gc *gc, libxl_cputopology *tinfo,
+ const libxl_bitmap *nodemap)
+{
+ libxl_dominfo *dinfo = NULL;
+ libxl_bitmap vcpu_nodemap;
+ int nr_doms, nr_cpus;
+ int nr_vcpus = 0;
+ int i, j, k;
+
+ dinfo = libxl_list_domain(CTX, &nr_doms);
+ if (dinfo == NULL)
+ return ERROR_FAIL;
+
+ if (libxl_node_bitmap_alloc(CTX, &vcpu_nodemap, 0) < 0) {
+ libxl_dominfo_list_free(dinfo, nr_doms);
+ return ERROR_FAIL;
+ }
+
+ for (i = 0; i < nr_doms; i++) {
+ libxl_vcpuinfo *vinfo;
+ int nr_dom_vcpus;
+
+ vinfo = libxl_list_vcpu(CTX, dinfo[i].domid, &nr_dom_vcpus, &nr_cpus);
+ if (vinfo == NULL)
+ continue;
+
+ /* For each vcpu of each domain ... */
+ for (j = 0; j < nr_dom_vcpus; j++) {
+
+ /* Build up a map telling on which nodes the vcpu is runnable on */
+ libxl_bitmap_set_none(&vcpu_nodemap);
+ libxl_for_each_set_bit(k, vinfo[j].cpumap)
+ libxl_bitmap_set(&vcpu_nodemap, tinfo[k].node);
+
+ /* And check if that map has any intersection with our nodemap */
+ libxl_for_each_set_bit(k, vcpu_nodemap) {
+ if (libxl_bitmap_test(nodemap, k)) {
+ nr_vcpus++;
+ break;
+ }
+ }
+ }
+
+ libxl_vcpuinfo_list_free(vinfo, nr_dom_vcpus);
+ }
+
+ libxl_bitmap_dispose(&vcpu_nodemap);
+ libxl_dominfo_list_free(dinfo, nr_doms);
+ return nr_vcpus;
+}
+
+/*
+ * This function tries to figure out if the host has a consistent number
+ * of cpus along all its NUMA nodes. In fact, if that is the case, we can
+ * calculate the minimum number of nodes needed for a domain by just
+ * dividing its total number of vcpus by this value computed here.
+ * However, we are not allowed to assume that all the nodes have the
+ * same number of cpus. Therefore, in case discrepancies among different
+ * nodes are found, this function just returns 0, for the caller to know
+ * it shouldn't rely on this 'optimization', and sort out things in some
+ * other way (by doing something basic, like starting trying with
+ * candidates with just one node).
+ */
+static int count_cpus_per_node(libxl_cputopology *tinfo, int nr_cpus,
+ libxl_numainfo *ninfo, int nr_nodes)
+{
+ int cpus_per_node = 0;
+ int j, i;
+
+ /* This makes sense iff # of PCPUs is the same for all nodes */
+ for (j = 0; j < nr_nodes; j++) {
+ int curr_cpus = 0;
+
+ for (i = 0; i < nr_cpus; i++) {
+ if (tinfo[i].node == j)
+ curr_cpus++;
+ }
+ /* So, if the above does not hold, turn the whole thing off! */
+ cpus_per_node = cpus_per_node == 0 ? curr_cpus : cpus_per_node;
+ if (cpus_per_node != curr_cpus)
+ return 0;
+ }
+ return cpus_per_node;
+}
+
+/*
+ * Looks for the placement candidate that satisfies some specific
+ * conditions and returns the best one according to the provided
+ * comparison function.
+ */
+int libxl__get_numa_candidate(libxl__gc *gc,
+ uint32_t min_free_memkb, int min_cpus,
+ int min_nodes, int max_nodes,
+ libxl__numa_candidate_cmpf numa_cmpf,
+ libxl__numa_candidate *cndt_out,
+ int *cndt_found)
+{
+ libxl__numa_candidate new_cndt;
+ libxl_cputopology *tinfo = NULL;
+ libxl_numainfo *ninfo = NULL;
+ int nr_nodes = 0, nr_cpus = 0;
+ libxl_bitmap nodemap;
+ int rc = 0;
+
+ libxl_bitmap_init(&nodemap);
+ libxl__numa_candidate_init(&new_cndt);
+
+ /* Get platform info and prepare the map for testing the combinations */
+ ninfo = libxl_get_numainfo(CTX, &nr_nodes);
+ if (ninfo == NULL)
+ return ERROR_FAIL;
+
+ /*
+ * The good thing about this solution is that it is based on heuristics
+ * (implemented in numa_cmpf() ), but we at least can evaluate it on
+ * all the possible placement candidates. That can happen because the
+ * number of nodes present in current NUMA systems is quite small.
+ * In fact, even if a sum of binomials is involved, if the system has
+ * up to 16 nodes it "only" takes 65535 steps. This is fine, as the
+ * number of nodes the biggest NUMA systems provide at the time of this
+ * writing is 8 (and it will probably continue to be so for a while).
+     * However, computational complexity would explode on systems bigger
+ * than that, and it's really important we avoid trying to run this
+ * on monsters with 32, 64 or more nodes (if they ever pop into being).
+     * Therefore, here comes a safety catch that disables the algorithm
+ * for the cases when it wouldn't work well.
+ */
+ if (nr_nodes > 16) {
+ /* Log we did nothing and return 0, as no real error occurred */
+ LOG(WARN, "System has %d NUMA nodes, which is too big for the "
+ "placement algorithm to work effectively: skipping it. "
+ "Consider manually pinning the vCPUs and/or looking at "
+ "cpupools for manually partitioning the system.",
+ nr_nodes);
+ *cndt_found = 0;
+ goto out;
+ }
+
+ tinfo = libxl_get_cpu_topology(CTX, &nr_cpus);
+ if (tinfo == NULL) {
+ rc = ERROR_FAIL;
+ goto out;
+ }
+
+ rc = libxl_node_bitmap_alloc(CTX, &nodemap, 0);
+ if (rc)
+ goto out;
+ rc = libxl__numa_candidate_alloc(gc, &new_cndt);
+ if (rc)
+ goto out;
+
+ /*
+ * If the minimum number of NUMA nodes is not explicitly specified
+ * (i.e., min_nodes == 0), we try to figure out a sensible number of nodes
+ * from where to start generating candidates, if possible (or just start
+ * from 1 otherwise). The maximum number of nodes should not exceed the
+ * number of existent NUMA nodes on the host, or the candidate generation
+ * won't work properly.
+ */
+ if (!min_nodes) {
+ int cpus_per_node;
+
+ cpus_per_node = count_cpus_per_node(tinfo, nr_cpus, ninfo, nr_nodes);
+ if (cpus_per_node == 0)
+ min_nodes = 1;
+ else
+ min_nodes = (min_cpus + cpus_per_node - 1) / cpus_per_node;
+ }
+ if (min_nodes > nr_nodes)
+ min_nodes = nr_nodes;
+ if (!max_nodes || max_nodes > nr_nodes)
+ max_nodes = nr_nodes;
+ if (min_nodes > max_nodes) {
+ LOG(ERROR, "Inconsistent minimum or maximum number of guest nodes");
+ rc = ERROR_INVAL;
+ goto out;
+ }
+
+ /* This is up to the caller to be disposed */
+ rc = libxl__numa_candidate_alloc(gc, cndt_out);
+ if (rc)
+ goto out;
+
+ /*
+ * Consider all the combinations with sizes in [min_nodes, max_nodes]
+ * (see comb_init() and comb_next()). Note that, since the fewer the
+ * number of nodes the better, it is guaranteed that any candidate
+ * found during the i-eth step will be better than any other one we
+ * could find during the (i+1)-eth and all the subsequent steps (they
+ * all will have more nodes). It's thus pointless to keep going if
+ * we already found something.
+ */
+ *cndt_found = 0;
+ while (min_nodes <= max_nodes && *cndt_found == 0) {
+ comb_iter_t comb_iter;
+ int comb_ok;
+
+ /*
+ * And here it is. Each step of this cycle generates a combination of
+ * nodes as big as min_nodes mandates. Each of these combinations is
+ * checked against the constraints provided by the caller (namely,
+ * amount of free memory and number of cpus) and it can concur to
+ * become our best placement iff it passes the check.
+ */
+ for (comb_ok = comb_init(gc, &comb_iter, nr_nodes, min_nodes); comb_ok;
+ comb_ok = comb_next(comb_iter, nr_nodes, min_nodes)) {
+ uint32_t nodes_free_memkb;
+ int nodes_cpus;
+
+ comb_get_nodemap(comb_iter, &nodemap, min_nodes);
+
+ /* If there is not enough memory in this combination, skip it
+ * and go generating the next one... */
+ nodes_free_memkb = nodemap_to_free_memkb(ninfo, &nodemap);
+ if (min_free_memkb && nodes_free_memkb < min_free_memkb)
+ continue;
+
+ /* And the same applies if this combination is short in cpus */
+ nodes_cpus = nodemap_to_nr_cpus(tinfo, nr_cpus, &nodemap);
+ if (min_cpus && nodes_cpus < min_cpus)
+ continue;
+
+ /*
+ * Conditions are met, we can compare this candidate with the
+ * current best one (if any).
+ */
+ libxl__numa_candidate_put_nodemap(gc, &new_cndt, &nodemap);
+ new_cndt.nr_vcpus = nodemap_to_nr_vcpus(gc, tinfo, &nodemap);
+ new_cndt.free_memkb = nodes_free_memkb;
+ new_cndt.nr_nodes = min_nodes;
+ new_cndt.nr_cpus = nodes_cpus;
+
+ /*
+             * Check if the new candidate is better than what we found up
+ * to now by means of the comparison function. If no comparison
+ * function is provided, just return as soon as we find our first
+ * candidate.
+ */
+ if (*cndt_found == 0 || numa_cmpf(&new_cndt, cndt_out) < 0) {
+ *cndt_found = 1;
+
+ LOG(DEBUG, "New best NUMA placement candidate found: "
+ "nr_nodes=%d, nr_cpus=%d, nr_vcpus=%d, "
+ "free_memkb=%"PRIu32"", min_nodes, new_cndt.nr_cpus,
+ new_cndt.nr_vcpus, new_cndt.free_memkb / 1024);
+
+ libxl__numa_candidate_put_nodemap(gc, cndt_out, &nodemap);
+ cndt_out->nr_vcpus = new_cndt.nr_vcpus;
+ cndt_out->free_memkb = new_cndt.free_memkb;
+ cndt_out->nr_nodes = new_cndt.nr_nodes;
+ cndt_out->nr_cpus = new_cndt.nr_cpus;
+
+ if (numa_cmpf == NULL)
+ break;
+ }
+ }
+ min_nodes++;
+ }
+
+ if (*cndt_found == 0)
+ LOG(NOTICE, "NUMA placement failed, performance might be affected");
+
+ out:
+ libxl_bitmap_dispose(&nodemap);
+ libxl__numa_candidate_dispose(&new_cndt);
+ libxl_numainfo_list_free(ninfo, nr_nodes);
+ libxl_cputopology_list_free(tinfo, nr_cpus);
+ return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index 574d8d2f6d..c1815c6c00 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -249,6 +249,7 @@ libxl_domain_build_info = Struct("domain_build_info",[
("max_vcpus", integer),
("avail_vcpus", libxl_bitmap),
("cpumap", libxl_bitmap),
+ ("numa_placement", libxl_defbool),
("tsc_mode", libxl_tsc_mode),
("max_memkb", MemKB),
("target_memkb", MemKB),
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index 8c890772ec..9912200def 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -696,6 +696,9 @@ static void parse_config_data(const char *config_source,
vcpu_to_pcpu[n_cpus] = i;
n_cpus++;
}
+
+ /* We have a cpumap, disable automatic placement */
+ libxl_defbool_set(&b_info->numa_placement, false);
}
else if (!xlu_cfg_get_string (config, "cpus", &buf, 0)) {
char *buf2 = strdup(buf);
@@ -709,6 +712,8 @@ static void parse_config_data(const char *config_source,
if (vcpupin_parse(buf2, &b_info->cpumap))
exit(1);
free(buf2);
+
+ libxl_defbool_set(&b_info->numa_placement, false);
}
if (!xlu_cfg_get_long (config, "memory", &l, 0)) {