diff options
-rw-r--r-- | docs/man/xl.cfg.pod.5 | 10 | ||||
-rw-r--r-- | tools/libxl/Makefile | 2 | ||||
-rw-r--r-- | tools/libxl/libxl_create.c | 2 | ||||
-rw-r--r-- | tools/libxl/libxl_dom.c | 112 | ||||
-rw-r--r-- | tools/libxl/libxl_internal.h | 117 | ||||
-rw-r--r-- | tools/libxl/libxl_numa.c | 428 | ||||
-rw-r--r-- | tools/libxl/libxl_types.idl | 1 | ||||
-rw-r--r-- | tools/libxl/xl_cmdimpl.c | 5 |
8 files changed, 674 insertions, 3 deletions
diff --git a/docs/man/xl.cfg.pod.5 b/docs/man/xl.cfg.pod.5 index 314a03e339..8aaff1fec9 100644 --- a/docs/man/xl.cfg.pod.5 +++ b/docs/man/xl.cfg.pod.5 @@ -111,8 +111,8 @@ created online and the remainder will be offline. =item B<cpus="CPU-LIST"> -List of which cpus the guest is allowed to use. Default behavior is -`all cpus`. A C<CPU-LIST> may be specified as follows: +List of which cpus the guest is allowed to use. By default xl will pick +some cpus on its own (see below). A C<CPU-LIST> may be specified as follows: =over 4 @@ -132,6 +132,12 @@ run on cpu #3 of the host. =back +If this option is not specified, libxl automatically tries to place the new +domain on the host's NUMA nodes (provided the host has more than one NUMA +node) by pinning it to the cpus of those nodes. A heuristic approach is +utilized with the goals of maximizing performance for the domain and, at +the same time, achieving efficient utilization of the host's CPUs and RAM. + =item B<cpu_weight=WEIGHT> A domain with a weight of 512 will get twice as much CPU as a domain diff --git a/tools/libxl/Makefile b/tools/libxl/Makefile index 48f352eb4e..424a7ee755 100644 --- a/tools/libxl/Makefile +++ b/tools/libxl/Makefile @@ -66,7 +66,7 @@ LIBXL_LIBS += -lyajl LIBXL_OBJS = flexarray.o libxl.o libxl_create.o libxl_dm.o libxl_pci.o \ libxl_dom.o libxl_exec.o libxl_xshelp.o libxl_device.o \ libxl_internal.o libxl_utils.o libxl_uuid.o \ - libxl_json.o libxl_aoutils.o \ + libxl_json.o libxl_aoutils.o libxl_numa.o \ libxl_save_callout.o _libxl_save_msgs_callout.o \ libxl_qmp.o libxl_event.o libxl_fork.o $(LIBXL_OBJS-y) LIBXL_OBJS += _libxl_types.o libxl_flask.o _libxl_types_internal.o diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c index 0c81a60972..3c950bab9c 100644 --- a/tools/libxl/libxl_create.c +++ b/tools/libxl/libxl_create.c @@ -215,6 +215,8 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc, libxl_bitmap_set_any(&b_info->cpumap); } + 
libxl_defbool_setdefault(&b_info->numa_placement, true); + if (b_info->max_memkb == LIBXL_MEMKB_DEFAULT) b_info->max_memkb = 32 * 1024; if (b_info->target_memkb == LIBXL_MEMKB_DEFAULT) diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c index bd4c9b448d..e13fb49f7f 100644 --- a/tools/libxl/libxl_dom.c +++ b/tools/libxl/libxl_dom.c @@ -98,6 +98,94 @@ out: return sched; } +/* + * Two NUMA placement candidates are compared by means of the following + * heuristics: + + * - the number of vcpus runnable on the candidates is considered, and + * candidates with fewer of them are preferred. If two candidates have + * the same number of runnable vcpus, + * - the amount of free memory in the candidates is considered, and the + * candidate with the greater amount of it is preferred. + * + * In fact, leaving larger memory holes maximizes the probability of being + * able to put other domains on the node. That hopefully means many domains + * will benefit from local memory accesses, but also introduces the risk of + * overloading large (from a memory POV) nodes. That is precisely the effect + * that counting the vcpus able to run on the nodes tries to prevent. + * + * Note that this completely ignores the number of nodes each candidate spans, + * as the fact that fewer nodes is better is already accounted for in the + * algorithm. 
+ */ +static int numa_cmpf(const libxl__numa_candidate *c1, + const libxl__numa_candidate *c2) +{ + if (c1->nr_vcpus != c2->nr_vcpus) + return c1->nr_vcpus - c2->nr_vcpus; + + return c2->free_memkb - c1->free_memkb; +} + +/* The actual automatic NUMA placement routine */ +static int numa_place_domain(libxl__gc *gc, libxl_domain_build_info *info) +{ + int found; + libxl__numa_candidate candidate; + libxl_bitmap candidate_nodemap; + libxl_cpupoolinfo *pinfo; + int nr_pools, rc = 0; + uint32_t memkb; + + libxl__numa_candidate_init(&candidate); + libxl_bitmap_init(&candidate_nodemap); + + /* First of all, if cpupools are in use, better not to mess with them */ + pinfo = libxl_list_cpupool(CTX, &nr_pools); + if (!pinfo) + return ERROR_FAIL; + if (nr_pools > 1) { + LOG(NOTICE, "Skipping NUMA placement as cpupools are in use"); + goto out; + } + + rc = libxl_domain_need_memory(CTX, info, &memkb); + if (rc) + goto out; + if (libxl_node_bitmap_alloc(CTX, &candidate_nodemap, 0)) { + rc = ERROR_FAIL; + goto out; + } + + /* Find the best candidate with enough free memory and at least + * as many pcpus as the domain has vcpus. */ + rc = libxl__get_numa_candidate(gc, memkb, info->max_vcpus, 0, 0, + numa_cmpf, &candidate, &found); + if (rc) + goto out; + + /* Not even a suitable placement candidate! Let's just not touch the + * domain's info->cpumap. It will have affinity with all nodes/cpus. 
*/ + if (found == 0) + goto out; + + /* Map the candidate's node map to the domain's info->cpumap */ + libxl__numa_candidate_get_nodemap(gc, &candidate, &candidate_nodemap); + rc = libxl_nodemap_to_cpumap(CTX, &candidate_nodemap, &info->cpumap); + if (rc) + goto out; + + LOG(DETAIL, "NUMA placement candidate with %d nodes, %d cpus and " + "%"PRIu32" KB free selected", candidate.nr_nodes, + candidate.nr_cpus, candidate.free_memkb / 1024); + + out: + libxl__numa_candidate_dispose(&candidate); + libxl_bitmap_dispose(&candidate_nodemap); + libxl_cpupoolinfo_list_free(pinfo, nr_pools); + return rc; +} + int libxl__build_pre(libxl__gc *gc, uint32_t domid, libxl_domain_build_info *info, libxl__domain_build_state *state) { @@ -107,7 +195,31 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid, uint32_t rtc_timeoffset; xc_domain_max_vcpus(ctx->xch, domid, info->max_vcpus); + + /* + * Check if the domain has any CPU affinity. If not, try to build + * up one. In case numa_place_domain() find at least a suitable + * candidate, it will affect info->cpumap accordingly; if it + * does not, it just leaves it as it is. This means (unless + * some weird error manifests) the subsequent call to + * libxl_set_vcpuaffinity_all() will do the actual placement, + * whatever that turns out to be. 
+ */ + if (libxl_defbool_val(info->numa_placement)) { + int rc; + + if (!libxl_bitmap_is_full(&info->cpumap)) { + LOG(ERROR, "Can run NUMA placement only if no vcpu " + "affinity is specified"); + return ERROR_INVAL; + } + + rc = numa_place_domain(gc, info); + if (rc) + return rc; + } libxl_set_vcpuaffinity_all(ctx, domid, info->max_vcpus, &info->cpumap); + xc_domain_setmaxmem(ctx->xch, domid, info->target_memkb + LIBXL_MAXMEM_CONSTANT); if (info->type == LIBXL_DOMAIN_TYPE_PV) xc_domain_set_memmap_limit(ctx->xch, domid, diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h index 1b9b417664..fd1b0cedfa 100644 --- a/tools/libxl/libxl_internal.h +++ b/tools/libxl/libxl_internal.h @@ -2497,6 +2497,123 @@ static inline void libxl__ctx_unlock(libxl_ctx *ctx) { #define CTX_LOCK (libxl__ctx_lock(CTX)) #define CTX_UNLOCK (libxl__ctx_unlock(CTX)) +/* + * Automatic NUMA placement + * + * These functions and data structures deal with the initial placement of a + * domain onto the host NUMA nodes. + * + * The key concept here is the one of "NUMA placement candidate", which is + * basically a set of nodes whose characteristics have been successfully + * checked against some specific requirements. More precisely, a candidate + * is the nodemap associated with one of the possible subset of the host + * NUMA nodes providing a certain amount of free memory, or a given number + * of cpus, or even both (depending in what the caller wants). For + * convenience of use, some of this information are stored within the + * candidate itself, instead of always being dynamically computed. A single + * node can be valid placement candidate, as well as it is possible for a + * candidate to contain all the nodes of the host. The fewer nodes there + * are in a candidate, the better performance a domain placed onto it + * should get (at least from a NUMA point of view). 
For instance, looking + * for numa candidates with 2GB of free memory means we want the subsets + * of the host NUMA nodes with, cumulatively, at least 2GB of free memory. + * This condition can be satisfied by just one particular node, or it may + * require more nodes, depending on the characteristics of the host, on how + * many domains have been created already, on how big they are, etc. + * + * The intended usage is as follows: + * 1. first of all, call libxl__get_numa_candidates(), and specify the + * proper constraints to it (e.g., the amount of memory a domain needs + * as the minimum amount of free memory for the candidates). If a + * candidate comparison function is provided, the candidate with fewer + * nodes that is found to be best according to what such function says + * is returned. If no comparison function is passed, the very first + * candidate is. + * 2. The chosen candidate's nodemap should be utilized for computing the + * actual affinity of the domain which, given the current NUMA support + * in the hypervisor, is what determines the placement of the domain's + * vcpus and memory. + */ + +typedef struct { + int nr_cpus, nr_nodes; + int nr_vcpus; + uint32_t free_memkb; + libxl_bitmap nodemap; +} libxl__numa_candidate; + +/* Signature for the comparison function between two candidates */ +typedef int (*libxl__numa_candidate_cmpf)(const libxl__numa_candidate *c1, + const libxl__numa_candidate *c2); + +/* + * This looks for the best NUMA placement candidate satisfying some + * specific conditions. If min_nodes and/or max_nodes are not 0, their + * value is used to determine the minimum and maximum number of nodes the + * candidate can have. If they are 0, it means the candidate can contain + * from 1 node (min_nodes=0) to the total number of nodes of the host + * (max_nodes=0). 
If min_free_memkb and/or min_cpus are not 0, the caller + * only wants candidates with at least the amount of free memory and the + * number of cpus they specify, respectively. If they are 0, the + * candidates' free memory and/or number of cpus won't be checked at all. + * + * Candidates are compared with each other by calling numa_cmpf(), which + * is where the heuristics for determining which candidate is the best + * one is actually implemented. The only bit of it that is hardcoded in + * this function is the fact that candidates with fewer nodes are always + * preferable. + * + * If at least one suitable candidate is found, it is returned in cndt_out, + * cndt_found is set to one, and the function returns successfully. On the + * other hand, if not even one single candidate can be found, the function + * still returns successfully but cndt_found will be zero. + * + * It is up to the function to properly allocate cndt_out (by calling + * libxl__numa_candidate_alloc()), while it is the caller that should init + * (libxl__numa_candidate_init()) and free (libxl__numa_candidate_dispose()) + * it. 
+ */ +_hidden int libxl__get_numa_candidate(libxl__gc *gc, + uint32_t min_free_memkb, int min_cpus, + int min_nodes, int max_nodes, + libxl__numa_candidate_cmpf numa_cmpf, + libxl__numa_candidate *cndt_out, + int *cndt_found); + +/* Initialization, allocation and deallocation for placement candidates */ +static inline void libxl__numa_candidate_init(libxl__numa_candidate *cndt) +{ + cndt->free_memkb = 0; + cndt->nr_cpus = cndt->nr_nodes = cndt->nr_vcpus = 0; + libxl_bitmap_init(&cndt->nodemap); +} + +static inline int libxl__numa_candidate_alloc(libxl__gc *gc, + libxl__numa_candidate *cndt) +{ + return libxl_node_bitmap_alloc(CTX, &cndt->nodemap, 0); +} +static inline void libxl__numa_candidate_dispose(libxl__numa_candidate *cndt) +{ + libxl_bitmap_dispose(&cndt->nodemap); +} + +/* Retrieve (in nodemap) the node map associated to placement candidate cndt */ +static inline +void libxl__numa_candidate_get_nodemap(libxl__gc *gc, + const libxl__numa_candidate *cndt, + libxl_bitmap *nodemap) +{ + libxl_bitmap_copy(CTX, nodemap, &cndt->nodemap); +} +/* Set the node map of placement candidate cndt to match nodemap */ +static inline +void libxl__numa_candidate_put_nodemap(libxl__gc *gc, + libxl__numa_candidate *cndt, + const libxl_bitmap *nodemap) +{ + libxl_bitmap_copy(CTX, &cndt->nodemap, nodemap); +} /* * Inserts "elm_new" into the sorted list "head". diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c new file mode 100644 index 0000000000..d95ed2af09 --- /dev/null +++ b/tools/libxl/libxl_numa.c @@ -0,0 +1,428 @@ +/* + * Copyright (C) 2012 Citrix Ltd. + * Author Dario Faggioli <dario.faggioli@citrix.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + */ + +#include "libxl_osdeps.h" /* must come before any other headers */ + +#include <glob.h> + +#include "libxl_internal.h" + +/* + * What follows are helpers for generating all the k-combinations + * without repetitions of a set S with n elements in it. Formally + * speaking, they are subsets of k distinct elements of S and, if + * S is n elements big, the number of k-combinations is equal to the + * binomial coefficient C(n k)=n!/(k! * (n - k)!). + * + * The various subset are generated one after the other by calling + * comb_init() first, and, after that, comb_next() + * C(n k)-1 times. An iterator is used to store the current status + * of the whole generation operation (i.e., basically, the last + * combination that has been generated). As soon as all combinations + * have been generated, comb_next() will start returning 0 instead of + * 1. The same instance of the iterator and the same values for + * n and k _must_ be used for each call (if that doesn't happen, the + * result is unspecified). + * + * The algorithm is a well known one (see, for example, D. Knuth's "The + * Art of Computer Programming - Volume 4, Fascicle 3" and it produces + * the combinations in such a way that they (well, more precisely, their + * indexes it the array/map representing the set) come with lexicographic + * ordering. + * + * For example, with n = 5 and k = 3, calling comb_init() + * will generate { 0, 1, 2 }, while subsequent valid calls to + * comb_next() will produce the following: + * { { 0, 1, 3 }, { 0, 1, 4 }, + * { 0, 2, 3 }, { 0, 2, 4 }, { 0, 3, 4 }, + * { 1, 2, 3 }, { 1, 2, 4 }, { 1, 3, 4 }, + * { 2, 3, 4 } } + * + * This is used by the automatic NUMA placement logic below. 
+ */ +typedef int* comb_iter_t; + +static int comb_init(libxl__gc *gc, comb_iter_t *it, int n, int k) +{ + comb_iter_t new_iter; + int i; + + if (n < k) + return 0; + + /* First set is always { 0, 1, 2, ..., k-1 } */ + GCNEW_ARRAY(new_iter, k); + for (i = 0; i < k; i++) + new_iter[i] = i; + + *it = new_iter; + return 1; +} + +static int comb_next(comb_iter_t it, int n, int k) +{ + int i; + + /* + * The idea here is to find the leftmost element from where + * we should start incrementing the indexes of the iterator. + * This means looking for the highest index that can be increased + * while still producing value smaller than n-1. In the example + * above, when dealing with { 0, 1, 4 }, such an element is the + * second one, as the third is already equal to 4 (which actually + * is n-1). + * Once we found from where to start, we increment that element + * and override the right-hand rest of the iterator with its + * successors, thus achieving lexicographic ordering. + * + * Regarding the termination of the generation process, when we + * manage in bringing n-k at the very first position of the iterator, + * we know that is the last valid combination ( { 2, 3, 4 }, with + * n - k = 5 - 2 = 2, in the example above), and thus we start + * returning 0 as soon as we cross that border. + */ + for (i = k - 1; it[i] == n - k + i; i--) { + if (i <= 0) + return 0; + } + for (it[i]++, i++; i < k; i++) + it[i] = it[i - 1] + 1; + return 1; +} + +/* NUMA automatic placement (see libxl_internal.h for details) */ + +/* + * This function turns a k-combination iterator into a node map. + * This means the bits in the node map corresponding to the indexes + * of the given combination are the ones that will be set. + * For example, if the iterator represents the combination { 0, 2, 4}, + * the node map will have bits #0, #2 and #4 set. 
+ */ +static void comb_get_nodemap(comb_iter_t it, libxl_bitmap *nodemap, int k) +{ + int i; + + libxl_bitmap_set_none(nodemap); + for (i = 0; i < k; i++) + libxl_bitmap_set(nodemap, it[i]); +} + +/* Retrieve the number of cpus that the nodes that are part of the nodemap + * span. */ +static int nodemap_to_nr_cpus(libxl_cputopology *tinfo, int nr_cpus, + const libxl_bitmap *nodemap) +{ + int i, nodes_cpus = 0; + + for (i = 0; i < nr_cpus; i++) { + if (libxl_bitmap_test(nodemap, tinfo[i].node)) + nodes_cpus++; + } + return nodes_cpus; +} + +/* Retrieve the amount of free memory within the nodemap */ +static uint32_t nodemap_to_free_memkb(libxl_numainfo *ninfo, + libxl_bitmap *nodemap) +{ + uint32_t free_memkb = 0; + int i; + + libxl_for_each_set_bit(i, *nodemap) + free_memkb += ninfo[i].free / 1024; + + return free_memkb; +} + +/* Retrieve the number of vcpus able to run on the cpus of the nodes + * that are part of the nodemap. */ +static int nodemap_to_nr_vcpus(libxl__gc *gc, libxl_cputopology *tinfo, + const libxl_bitmap *nodemap) +{ + libxl_dominfo *dinfo = NULL; + libxl_bitmap vcpu_nodemap; + int nr_doms, nr_cpus; + int nr_vcpus = 0; + int i, j, k; + + dinfo = libxl_list_domain(CTX, &nr_doms); + if (dinfo == NULL) + return ERROR_FAIL; + + if (libxl_node_bitmap_alloc(CTX, &vcpu_nodemap, 0) < 0) { + libxl_dominfo_list_free(dinfo, nr_doms); + return ERROR_FAIL; + } + + for (i = 0; i < nr_doms; i++) { + libxl_vcpuinfo *vinfo; + int nr_dom_vcpus; + + vinfo = libxl_list_vcpu(CTX, dinfo[i].domid, &nr_dom_vcpus, &nr_cpus); + if (vinfo == NULL) + continue; + + /* For each vcpu of each domain ... 
*/ + for (j = 0; j < nr_dom_vcpus; j++) { + + /* Build up a map telling on which nodes the vcpu is runnable on */ + libxl_bitmap_set_none(&vcpu_nodemap); + libxl_for_each_set_bit(k, vinfo[j].cpumap) + libxl_bitmap_set(&vcpu_nodemap, tinfo[k].node); + + /* And check if that map has any intersection with our nodemap */ + libxl_for_each_set_bit(k, vcpu_nodemap) { + if (libxl_bitmap_test(nodemap, k)) { + nr_vcpus++; + break; + } + } + } + + libxl_vcpuinfo_list_free(vinfo, nr_dom_vcpus); + } + + libxl_bitmap_dispose(&vcpu_nodemap); + libxl_dominfo_list_free(dinfo, nr_doms); + return nr_vcpus; +} + +/* + * This function tries to figure out if the host has a consistent number + * of cpus along all its NUMA nodes. In fact, if that is the case, we can + * calculate the minimum number of nodes needed for a domain by just + * dividing its total number of vcpus by this value computed here. + * However, we are not allowed to assume that all the nodes have the + * same number of cpus. Therefore, in case discrepancies among different + * nodes are found, this function just returns 0, for the caller to know + * it shouldn't rely on this 'optimization', and sort out things in some + * other way (by doing something basic, like starting trying with + * candidates with just one node). + */ +static int count_cpus_per_node(libxl_cputopology *tinfo, int nr_cpus, + libxl_numainfo *ninfo, int nr_nodes) +{ + int cpus_per_node = 0; + int j, i; + + /* This makes sense iff # of PCPUs is the same for all nodes */ + for (j = 0; j < nr_nodes; j++) { + int curr_cpus = 0; + + for (i = 0; i < nr_cpus; i++) { + if (tinfo[i].node == j) + curr_cpus++; + } + /* So, if the above does not hold, turn the whole thing off! */ + cpus_per_node = cpus_per_node == 0 ? 
curr_cpus : cpus_per_node; + if (cpus_per_node != curr_cpus) + return 0; + } + return cpus_per_node; +} + +/* + * Looks for the placement candidates that satisfy some specific + * conditions and returns the best one according to the provided + * comparison function. + */ +int libxl__get_numa_candidate(libxl__gc *gc, + uint32_t min_free_memkb, int min_cpus, + int min_nodes, int max_nodes, + libxl__numa_candidate_cmpf numa_cmpf, + libxl__numa_candidate *cndt_out, + int *cndt_found) +{ + libxl__numa_candidate new_cndt; + libxl_cputopology *tinfo = NULL; + libxl_numainfo *ninfo = NULL; + int nr_nodes = 0, nr_cpus = 0; + libxl_bitmap nodemap; + int rc = 0; + + libxl_bitmap_init(&nodemap); + libxl__numa_candidate_init(&new_cndt); + + /* Get platform info and prepare the map for testing the combinations */ + ninfo = libxl_get_numainfo(CTX, &nr_nodes); + if (ninfo == NULL) + return ERROR_FAIL; + + /* + * The good thing about this solution is that it is based on heuristics + * (implemented in numa_cmpf() ), but we at least can evaluate it on + * all the possible placement candidates. That can happen because the + * number of nodes present in current NUMA systems is quite small. + * In fact, even if a sum of binomials is involved, if the system has + * up to 16 nodes it "only" takes 65535 steps. This is fine, as the + * number of nodes the biggest NUMA systems provide at the time of this + * writing is 8 (and it will probably continue to be so for a while). + * However, computational complexity would explode on systems bigger + * than that, and it's really important we avoid trying to run this + * on monsters with 32, 64 or more nodes (if they ever pop into being). + * Therefore, here comes a safety catch that disables the algorithm + * for the cases when it wouldn't work well. 
+ */ + if (nr_nodes > 16) { + /* Log we did nothing and return 0, as no real error occurred */ + LOG(WARN, "System has %d NUMA nodes, which is too big for the " + "placement algorithm to work effectively: skipping it. " + "Consider manually pinning the vCPUs and/or looking at " + "cpupools for manually partitioning the system.", + nr_nodes); + *cndt_found = 0; + goto out; + } + + tinfo = libxl_get_cpu_topology(CTX, &nr_cpus); + if (tinfo == NULL) { + rc = ERROR_FAIL; + goto out; + } + + rc = libxl_node_bitmap_alloc(CTX, &nodemap, 0); + if (rc) + goto out; + rc = libxl__numa_candidate_alloc(gc, &new_cndt); + if (rc) + goto out; + + /* + * If the minimum number of NUMA nodes is not explicitly specified + * (i.e., min_nodes == 0), we try to figure out a sensible number of nodes + * from where to start generating candidates, if possible (or just start + * from 1 otherwise). The maximum number of nodes should not exceed the + * number of existent NUMA nodes on the host, or the candidate generation + * won't work properly. + */ + if (!min_nodes) { + int cpus_per_node; + + cpus_per_node = count_cpus_per_node(tinfo, nr_cpus, ninfo, nr_nodes); + if (cpus_per_node == 0) + min_nodes = 1; + else + min_nodes = (min_cpus + cpus_per_node - 1) / cpus_per_node; + } + if (min_nodes > nr_nodes) + min_nodes = nr_nodes; + if (!max_nodes || max_nodes > nr_nodes) + max_nodes = nr_nodes; + if (min_nodes > max_nodes) { + LOG(ERROR, "Inconsistent minimum or maximum number of guest nodes"); + rc = ERROR_INVAL; + goto out; + } + + /* This is up to the caller to be disposed */ + rc = libxl__numa_candidate_alloc(gc, cndt_out); + if (rc) + goto out; + + /* + * Consider all the combinations with sizes in [min_nodes, max_nodes] + * (see comb_init() and comb_next()). 
Note that, since the fewer the + * number of nodes the better, it is guaranteed that any candidate + * found during the i-eth step will be better than any other one we + * could find during the (i+1)-eth and all the subsequent steps (they + * all will have more nodes). It's thus pointless to keep going if + * we already found something. + */ + *cndt_found = 0; + while (min_nodes <= max_nodes && *cndt_found == 0) { + comb_iter_t comb_iter; + int comb_ok; + + /* + * And here it is. Each step of this cycle generates a combination of + * nodes as big as min_nodes mandates. Each of these combinations is + * checked against the constraints provided by the caller (namely, + * amount of free memory and number of cpus) and it can concur to + * become our best placement iff it passes the check. + */ + for (comb_ok = comb_init(gc, &comb_iter, nr_nodes, min_nodes); comb_ok; + comb_ok = comb_next(comb_iter, nr_nodes, min_nodes)) { + uint32_t nodes_free_memkb; + int nodes_cpus; + + comb_get_nodemap(comb_iter, &nodemap, min_nodes); + + /* If there is not enough memory in this combination, skip it + * and go generating the next one... */ + nodes_free_memkb = nodemap_to_free_memkb(ninfo, &nodemap); + if (min_free_memkb && nodes_free_memkb < min_free_memkb) + continue; + + /* And the same applies if this combination is short in cpus */ + nodes_cpus = nodemap_to_nr_cpus(tinfo, nr_cpus, &nodemap); + if (min_cpus && nodes_cpus < min_cpus) + continue; + + /* + * Conditions are met, we can compare this candidate with the + * current best one (if any). + */ + libxl__numa_candidate_put_nodemap(gc, &new_cndt, &nodemap); + new_cndt.nr_vcpus = nodemap_to_nr_vcpus(gc, tinfo, &nodemap); + new_cndt.free_memkb = nodes_free_memkb; + new_cndt.nr_nodes = min_nodes; + new_cndt.nr_cpus = nodes_cpus; + + /* + * Check if the new candidate we is better the what we found up + * to now by means of the comparison function. 
If no comparison + * function is provided, just return as soon as we find our first + * candidate. + */ + if (*cndt_found == 0 || numa_cmpf(&new_cndt, cndt_out) < 0) { + *cndt_found = 1; + + LOG(DEBUG, "New best NUMA placement candidate found: " + "nr_nodes=%d, nr_cpus=%d, nr_vcpus=%d, " + "free_memkb=%"PRIu32"", min_nodes, new_cndt.nr_cpus, + new_cndt.nr_vcpus, new_cndt.free_memkb / 1024); + + libxl__numa_candidate_put_nodemap(gc, cndt_out, &nodemap); + cndt_out->nr_vcpus = new_cndt.nr_vcpus; + cndt_out->free_memkb = new_cndt.free_memkb; + cndt_out->nr_nodes = new_cndt.nr_nodes; + cndt_out->nr_cpus = new_cndt.nr_cpus; + + if (numa_cmpf == NULL) + break; + } + } + min_nodes++; + } + + if (*cndt_found == 0) + LOG(NOTICE, "NUMA placement failed, performance might be affected"); + + out: + libxl_bitmap_dispose(&nodemap); + libxl__numa_candidate_dispose(&new_cndt); + libxl_numainfo_list_free(ninfo, nr_nodes); + libxl_cputopology_list_free(tinfo, nr_cpus); + return rc; +} + +/* + * Local variables: + * mode: C + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl index 574d8d2f6d..c1815c6c00 100644 --- a/tools/libxl/libxl_types.idl +++ b/tools/libxl/libxl_types.idl @@ -249,6 +249,7 @@ libxl_domain_build_info = Struct("domain_build_info",[ ("max_vcpus", integer), ("avail_vcpus", libxl_bitmap), ("cpumap", libxl_bitmap), + ("numa_placement", libxl_defbool), ("tsc_mode", libxl_tsc_mode), ("max_memkb", MemKB), ("target_memkb", MemKB), diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c index 8c890772ec..9912200def 100644 --- a/tools/libxl/xl_cmdimpl.c +++ b/tools/libxl/xl_cmdimpl.c @@ -696,6 +696,9 @@ static void parse_config_data(const char *config_source, vcpu_to_pcpu[n_cpus] = i; n_cpus++; } + + /* We have a cpumap, disable automatic placement */ + libxl_defbool_set(&b_info->numa_placement, false); } else if (!xlu_cfg_get_string (config, "cpus", &buf, 0)) { char *buf2 = 
strdup(buf); @@ -709,6 +712,8 @@ static void parse_config_data(const char *config_source, if (vcpupin_parse(buf2, &b_info->cpumap)) exit(1); free(buf2); + + libxl_defbool_set(&b_info->numa_placement, false); } if (!xlu_cfg_get_long (config, "memory", &l, 0)) { |