aboutsummaryrefslogtreecommitdiffstats
path: root/tools/libxl/libxl_numa.c
diff options
context:
space:
mode:
authorDario Faggioli <dario.faggioli@citrix.com>2012-07-26 15:41:54 +0100
committerDario Faggioli <dario.faggioli@citrix.com>2012-07-26 15:41:54 +0100
commite98fee8390936477d369b9832ff9bb06594510c7 (patch)
tree485737b4c7bd3a9933296f3f52e6593f7b4f7570 /tools/libxl/libxl_numa.c
parent47ce0e3550a2d1ce8407e3643ceb595657ac4d79 (diff)
downloadxen-e98fee8390936477d369b9832ff9bb06594510c7.tar.gz
xen-e98fee8390936477d369b9832ff9bb06594510c7.tar.bz2
xen-e98fee8390936477d369b9832ff9bb06594510c7.zip
libxl: enable automatic placement of guests on NUMA nodes
If a domain does not have a VCPU affinity, try to pin it automatically to some PCPUs. This is done taking into account the NUMA characteristics of the host. In fact, we look for a combination of host's NUMA nodes with enough free memory and number of PCPUs for the new domain, and pin it to the VCPUs of those nodes. Deciding which placement is the best happens by means of some heuristics. For instance, smaller candidates are better, both from a domain perspective (less memory spreading among nodes) and from the entire system perspective (smaller memory fragmentation). In case of candidates of equal sizes (i.e., with the same number of nodes), the amount of free memory and the number of domains' vCPUs already pinned to the candidates' nodes are both considered. Very often, candidates with greater amount of memory are the ones we want, as this is good for keeping memory fragmentation under control. However, we do not want to overcommit some node too much, just because it has a lot of memory, and that's why the number of vCPUs must be accounted for. This all happens internally to libxl, and no API for driving the mechanism is provided for now. This matches what xend already does. Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> Acked-by: Ian Jackson <ian.jackson@eu.citrix.com> Tested-by: Andre Przywara <andre.przywara@amd.com> Committed-by: Ian Campbell <ian.campbell@citrix.com>
Diffstat (limited to 'tools/libxl/libxl_numa.c')
-rw-r--r--tools/libxl/libxl_numa.c428
1 files changed, 428 insertions, 0 deletions
diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c
new file mode 100644
index 0000000000..d95ed2af09
--- /dev/null
+++ b/tools/libxl/libxl_numa.c
@@ -0,0 +1,428 @@
+/*
+ * Copyright (C) 2012 Citrix Ltd.
+ * Author Dario Faggioli <dario.faggioli@citrix.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#include <glob.h>
+
+#include "libxl_internal.h"
+
+/*
+ * What follows are helpers for generating all the k-combinations,
+ * without repetitions, of a set S with n elements in it. Formally
+ * speaking, each combination is a subset of k distinct elements of S
+ * and, if S is n elements big, the number of k-combinations is equal
+ * to the binomial coefficient C(n k)=n!/(k! * (n - k)!).
+ *
+ * The subsets are generated one after the other by calling
+ * comb_init() first, and, after that, comb_next()
+ * C(n k)-1 times. An iterator is used to store the current status
+ * of the whole generation operation (i.e., basically, the last
+ * combination that has been generated). As soon as all combinations
+ * have been generated, comb_next() will start returning 0 instead of
+ * 1. The same instance of the iterator and the same values for
+ * n and k _must_ be used for each call (if that doesn't happen, the
+ * result is unspecified).
+ *
+ * The algorithm is a well known one (see, for example, D. Knuth's "The
+ * Art of Computer Programming - Volume 4, Fascicle 3") and it produces
+ * the combinations in such a way that they (well, more precisely, their
+ * indexes in the array/map representing the set) come in lexicographic
+ * ordering.
+ *
+ * For example, with n = 5 and k = 3, calling comb_init()
+ * will generate { 0, 1, 2 }, while subsequent valid calls to
+ * comb_next() will produce the following:
+ * { { 0, 1, 3 }, { 0, 1, 4 },
+ *   { 0, 2, 3 }, { 0, 2, 4 }, { 0, 3, 4 },
+ *   { 1, 2, 3 }, { 1, 2, 4 }, { 1, 3, 4 },
+ *   { 2, 3, 4 } }
+ *
+ * This is used by the automatic NUMA placement logic below.
+ */
+typedef int* comb_iter_t;
+
+/* Allocate (from the gc) an iterator and load it with the first
+ * k-combination, { 0, 1, ..., k-1 }. Returns 1 on success, 0 if no
+ * k-combination of an n-set exists (i.e., n < k). */
+static int comb_init(libxl__gc *gc, comb_iter_t *it, int n, int k)
+{
+    comb_iter_t fresh;
+    int idx;
+
+    if (n < k)
+        return 0;
+
+    /* Lexicographically, the very first subset is the identity map */
+    GCNEW_ARRAY(fresh, k);
+    for (idx = 0; idx < k; idx++)
+        fresh[idx] = idx;
+
+    *it = fresh;
+    return 1;
+}
+
+/* Advance the iterator to the next k-combination of an n-set, in
+ * lexicographic order. Returns 1 if a new combination was produced,
+ * 0 once the sequence is exhausted (or on a degenerate k). */
+static int comb_next(comb_iter_t it, int n, int k)
+{
+    int i;
+
+    /* A 0-sized (or invalid) combination has no successor. This guard
+     * also keeps the backward scan below from reading it[-1]. */
+    if (k <= 0)
+        return 0;
+
+    /*
+     * The idea here is to find the leftmost element from where
+     * we should start incrementing the indexes of the iterator.
+     * This means looking for the highest index that can be increased
+     * while still producing a value smaller than n-1. In the example
+     * above, when dealing with { 0, 1, 4 }, such an element is the
+     * second one, as the third is already equal to 4 (which actually
+     * is n-1).
+     * Once we have found where to start, we increment that element
+     * and overwrite the right-hand rest of the iterator with its
+     * successors, thus achieving lexicographic ordering.
+     *
+     * Regarding the termination of the generation process, when we
+     * manage to bring n-k to the very first position of the iterator,
+     * we know that is the last valid combination ( { 2, 3, 4 }, with
+     * n - k = 5 - 3 = 2, in the example above), and thus we start
+     * returning 0 as soon as we cross that border.
+     */
+    for (i = k - 1; it[i] == n - k + i; i--) {
+        if (i <= 0)
+            return 0;
+    }
+    for (it[i]++, i++; i < k; i++)
+        it[i] = it[i - 1] + 1;
+    return 1;
+}
+
+/* NUMA automatic placement (see libxl_internal.h for details) */
+
+/*
+ * This function turns a k-combination iterator into a node map.
+ * This means the bits in the node map corresponding to the indexes
+ * of the given combination are the ones that will be set.
+ * For example, if the iterator represents the combination { 0, 2, 4 },
+ * the node map will have bits #0, #2 and #4 set.
+ */
+static void comb_get_nodemap(comb_iter_t it, libxl_bitmap *nodemap, int k)
+{
+    int pos;
+
+    /* Start from an all-clear map and flag exactly the k nodes whose
+     * indexes appear in the combination. */
+    libxl_bitmap_set_none(nodemap);
+    for (pos = 0; pos < k; pos++)
+        libxl_bitmap_set(nodemap, it[pos]);
+}
+
+/* Retrieve the number of cpus that the nodes that are part of the nodemap
+ * span. */
+static int nodemap_to_nr_cpus(libxl_cputopology *tinfo, int nr_cpus,
+                              const libxl_bitmap *nodemap)
+{
+    int cpu, spanned = 0;
+
+    /* A pcpu is spanned iff the node it sits on is in the map */
+    for (cpu = 0; cpu < nr_cpus; cpu++) {
+        if (libxl_bitmap_test(nodemap, tinfo[cpu].node))
+            spanned++;
+    }
+    return spanned;
+}
+
+/* Retrieve the amount of free memory (in Kb) within the nodemap.
+ *
+ * NOTE(review): the uint32_t accumulator caps out at 4 TiB of free
+ * memory across the selected nodes; fine for now, worth revisiting
+ * on very large hosts. */
+static uint32_t nodemap_to_free_memkb(libxl_numainfo *ninfo,
+                                      const libxl_bitmap *nodemap)
+{
+    uint32_t free_memkb = 0;
+    int i;
+
+    /* ninfo[i].free is in bytes; accumulate Kb (const nodemap, for
+     * consistency with the other nodemap_to_* helpers) */
+    libxl_for_each_set_bit(i, *nodemap)
+        free_memkb += ninfo[i].free / 1024;
+
+    return free_memkb;
+}
+
+/* Retrieve the number of vcpus able to run on the cpus of the nodes
+ * that are part of the nodemap.
+ *
+ * A vcpu counts if its cpumap intersects (via the pcpu->node topology
+ * in tinfo) at least one node of the nodemap; every domain on the host
+ * is scanned. This measures how loaded the candidate's nodes already
+ * are. Returns the count (>= 0), or ERROR_FAIL (negative) if listing
+ * the domains or allocating the scratch node map fails. */
+static int nodemap_to_nr_vcpus(libxl__gc *gc, libxl_cputopology *tinfo,
+                               const libxl_bitmap *nodemap)
+{
+    libxl_dominfo *dinfo = NULL;
+    libxl_bitmap vcpu_nodemap;
+    int nr_doms, nr_cpus;
+    int nr_vcpus = 0;
+    int i, j, k;
+
+    dinfo = libxl_list_domain(CTX, &nr_doms);
+    if (dinfo == NULL)
+        return ERROR_FAIL;
+
+    /* Scratch map, reused for each vcpu below */
+    if (libxl_node_bitmap_alloc(CTX, &vcpu_nodemap, 0) < 0) {
+        libxl_dominfo_list_free(dinfo, nr_doms);
+        return ERROR_FAIL;
+    }
+
+    for (i = 0; i < nr_doms; i++) {
+        libxl_vcpuinfo *vinfo;
+        int nr_dom_vcpus;
+
+        /* NOTE(review): domains whose vcpus can't be listed (e.g. they
+         * just died) are silently skipped -- this is best effort */
+        vinfo = libxl_list_vcpu(CTX, dinfo[i].domid, &nr_dom_vcpus, &nr_cpus);
+        if (vinfo == NULL)
+            continue;
+
+        /* For each vcpu of each domain ... */
+        for (j = 0; j < nr_dom_vcpus; j++) {
+
+            /* Build up a map telling on which nodes the vcpu is runnable on */
+            libxl_bitmap_set_none(&vcpu_nodemap);
+            libxl_for_each_set_bit(k, vinfo[j].cpumap)
+                libxl_bitmap_set(&vcpu_nodemap, tinfo[k].node);
+
+            /* And check if that map has any intersection with our nodemap;
+             * each vcpu is counted at most once (hence the break) */
+            libxl_for_each_set_bit(k, vcpu_nodemap) {
+                if (libxl_bitmap_test(nodemap, k)) {
+                    nr_vcpus++;
+                    break;
+                }
+            }
+        }
+
+        libxl_vcpuinfo_list_free(vinfo, nr_dom_vcpus);
+    }
+
+    libxl_bitmap_dispose(&vcpu_nodemap);
+    libxl_dominfo_list_free(dinfo, nr_doms);
+    return nr_vcpus;
+}
+
+/*
+ * This function tries to figure out if the host has a consistent number
+ * of cpus along all its NUMA nodes. In fact, if that is the case, we can
+ * calculate the minimum number of nodes needed for a domain by just
+ * dividing its total number of vcpus by this value computed here.
+ * However, we are not allowed to assume that all the nodes have the
+ * same number of cpus. Therefore, in case discrepancies among different
+ * nodes are found, this function just returns 0, for the caller to know
+ * it shouldn't rely on this 'optimization', and sort out things in some
+ * other way (by doing something basic, like starting trying with
+ * candidates with just one node).
+ */
+static int count_cpus_per_node(libxl_cputopology *tinfo, int nr_cpus,
+                               libxl_numainfo *ninfo, int nr_nodes)
+{
+    int expected = 0;
+    int node, cpu;
+
+    /* The result makes sense iff the per-node pcpu count is uniform.
+     * (ninfo is currently unreferenced; only topology counts matter.) */
+    for (node = 0; node < nr_nodes; node++) {
+        int here = 0;
+
+        for (cpu = 0; cpu < nr_cpus; cpu++) {
+            if (tinfo[cpu].node == node)
+                here++;
+        }
+        /* Remember the first count seen, then demand all nodes match it */
+        if (expected == 0)
+            expected = here;
+        if (here != expected)
+            return 0;
+    }
+    return expected;
+}
+
+/*
+ * Looks for the placement candidates that satisfy some specific
+ * conditions (at least min_free_memkb of free memory and min_cpus
+ * cpus, spanning between min_nodes and max_nodes nodes) and returns
+ * the best one according to the provided comparison function.
+ *
+ * Returns 0 both when a candidate is found and when none exists
+ * (the two cases are told apart via *cndt_found); an ERROR_* code
+ * is returned only on actual failures. On *cndt_found == 1 the
+ * winner is in *cndt_out, which the caller must dispose of.
+ */
+int libxl__get_numa_candidate(libxl__gc *gc,
+                              uint32_t min_free_memkb, int min_cpus,
+                              int min_nodes, int max_nodes,
+                              libxl__numa_candidate_cmpf numa_cmpf,
+                              libxl__numa_candidate *cndt_out,
+                              int *cndt_found)
+{
+    libxl__numa_candidate new_cndt;
+    libxl_cputopology *tinfo = NULL;
+    libxl_numainfo *ninfo = NULL;
+    int nr_nodes = 0, nr_cpus = 0;
+    libxl_bitmap nodemap;
+    int rc = 0;
+
+    libxl_bitmap_init(&nodemap);
+    libxl__numa_candidate_init(&new_cndt);
+
+    /* Get platform info and prepare the map for testing the combinations.
+     * (Nothing has been allocated yet, so a plain return is safe here.) */
+    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
+    if (ninfo == NULL)
+        return ERROR_FAIL;
+
+    /*
+     * The good thing about this solution is that it is based on heuristics
+     * (implemented in numa_cmpf() ), but we at least can evaluate it on
+     * all the possible placement candidates. That can happen because the
+     * number of nodes present in current NUMA systems is quite small.
+     * In fact, even if a sum of binomials is involved, if the system has
+     * up to 16 nodes it "only" takes 65535 steps. This is fine, as the
+     * number of nodes the biggest NUMA systems provide at the time of this
+     * writing is 8 (and it will probably continue to be so for a while).
+     * However, computational complexity would explode on systems bigger
+     * than that, and it's really important we avoid trying to run this
+     * on monsters with 32, 64 or more nodes (if they ever pop into being).
+     * Therefore, here it comes a safety catch that disables the algorithm
+     * for the cases when it wouldn't work well.
+     */
+    if (nr_nodes > 16) {
+        /* Log we did nothing and return 0, as no real error occurred */
+        LOG(WARN, "System has %d NUMA nodes, which is too big for the "
+                  "placement algorithm to work effectively: skipping it. "
+                  "Consider manually pinning the vCPUs and/or looking at "
+                  "cpupools for manually partitioning the system.",
+                  nr_nodes);
+        *cndt_found = 0;
+        goto out;
+    }
+
+    tinfo = libxl_get_cpu_topology(CTX, &nr_cpus);
+    if (tinfo == NULL) {
+        rc = ERROR_FAIL;
+        goto out;
+    }
+
+    rc = libxl_node_bitmap_alloc(CTX, &nodemap, 0);
+    if (rc)
+        goto out;
+    rc = libxl__numa_candidate_alloc(gc, &new_cndt);
+    if (rc)
+        goto out;
+
+    /*
+     * If the minimum number of NUMA nodes is not explicitly specified
+     * (i.e., min_nodes == 0), we try to figure out a sensible number of nodes
+     * from where to start generating candidates, if possible (or just start
+     * from 1 otherwise). The maximum number of nodes should not exceed the
+     * number of existent NUMA nodes on the host, or the candidate generation
+     * won't work properly.
+     */
+    if (!min_nodes) {
+        int cpus_per_node;
+
+        cpus_per_node = count_cpus_per_node(tinfo, nr_cpus, ninfo, nr_nodes);
+        if (cpus_per_node == 0)
+            min_nodes = 1;
+        else
+            min_nodes = (min_cpus + cpus_per_node - 1) / cpus_per_node;
+    }
+    /* Clamp both bounds to the host's node count */
+    if (min_nodes > nr_nodes)
+        min_nodes = nr_nodes;
+    if (!max_nodes || max_nodes > nr_nodes)
+        max_nodes = nr_nodes;
+    if (min_nodes > max_nodes) {
+        LOG(ERROR, "Inconsistent minimum or maximum number of guest nodes");
+        rc = ERROR_INVAL;
+        goto out;
+    }
+
+    /* This is up to the caller to be disposed */
+    rc = libxl__numa_candidate_alloc(gc, cndt_out);
+    if (rc)
+        goto out;
+
+    /*
+     * Consider all the combinations with sizes in [min_nodes, max_nodes]
+     * (see comb_init() and comb_next()). Note that, since the fewer the
+     * number of nodes the better, it is guaranteed that any candidate
+     * found during the i-eth step will be better than any other one we
+     * could find during the (i+1)-eth and all the subsequent steps (they
+     * all will have more nodes). It's thus pointless to keep going if
+     * we already found something.
+     */
+    *cndt_found = 0;
+    while (min_nodes <= max_nodes && *cndt_found == 0) {
+        comb_iter_t comb_iter;
+        int comb_ok;
+
+        /*
+         * And here it is. Each step of this cycle generates a combination of
+         * nodes as big as min_nodes mandates. Each of these combinations is
+         * checked against the constraints provided by the caller (namely,
+         * amount of free memory and number of cpus) and it can concur to
+         * become our best placement iff it passes the check.
+         */
+        for (comb_ok = comb_init(gc, &comb_iter, nr_nodes, min_nodes); comb_ok;
+             comb_ok = comb_next(comb_iter, nr_nodes, min_nodes)) {
+            uint32_t nodes_free_memkb;
+            int nodes_cpus;
+
+            comb_get_nodemap(comb_iter, &nodemap, min_nodes);
+
+            /* If there is not enough memory in this combination, skip it
+             * and go generating the next one... */
+            nodes_free_memkb = nodemap_to_free_memkb(ninfo, &nodemap);
+            if (min_free_memkb && nodes_free_memkb < min_free_memkb)
+                continue;
+
+            /* And the same applies if this combination is short in cpus */
+            nodes_cpus = nodemap_to_nr_cpus(tinfo, nr_cpus, &nodemap);
+            if (min_cpus && nodes_cpus < min_cpus)
+                continue;
+
+            /*
+             * Conditions are met, we can compare this candidate with the
+             * current best one (if any).
+             * NOTE(review): nodemap_to_nr_vcpus() may return ERROR_FAIL
+             * (negative), which would silently end up in nr_vcpus here --
+             * confirm whether that is acceptable for the heuristics.
+             */
+            libxl__numa_candidate_put_nodemap(gc, &new_cndt, &nodemap);
+            new_cndt.nr_vcpus = nodemap_to_nr_vcpus(gc, tinfo, &nodemap);
+            new_cndt.free_memkb = nodes_free_memkb;
+            new_cndt.nr_nodes = min_nodes;
+            new_cndt.nr_cpus = nodes_cpus;
+
+            /*
+             * Check if the new candidate is better than what we found up
+             * to now by means of the comparison function. If no comparison
+             * function is provided, just return as soon as we find our first
+             * candidate.
+             */
+            if (*cndt_found == 0 || numa_cmpf(&new_cndt, cndt_out) < 0) {
+                *cndt_found = 1;
+
+                /* NOTE(review): the value logged after "free_memkb=" is
+                 * free_memkb / 1024, i.e. actually MB -- label and value
+                 * disagree; worth confirming/fixing upstream */
+                LOG(DEBUG, "New best NUMA placement candidate found: "
+                           "nr_nodes=%d, nr_cpus=%d, nr_vcpus=%d, "
+                           "free_memkb=%"PRIu32"", min_nodes, new_cndt.nr_cpus,
+                           new_cndt.nr_vcpus, new_cndt.free_memkb / 1024);
+
+                libxl__numa_candidate_put_nodemap(gc, cndt_out, &nodemap);
+                cndt_out->nr_vcpus = new_cndt.nr_vcpus;
+                cndt_out->free_memkb = new_cndt.free_memkb;
+                cndt_out->nr_nodes = new_cndt.nr_nodes;
+                cndt_out->nr_cpus = new_cndt.nr_cpus;
+
+                /* First match wins when no heuristics were provided */
+                if (numa_cmpf == NULL)
+                    break;
+            }
+        }
+        min_nodes++;
+    }
+
+    if (*cndt_found == 0)
+        LOG(NOTICE, "NUMA placement failed, performance might be affected");
+
+ out:
+    libxl_bitmap_dispose(&nodemap);
+    libxl__numa_candidate_dispose(&new_cndt);
+    libxl_numainfo_list_free(ninfo, nr_nodes);
+    libxl_cputopology_list_free(tinfo, nr_cpus);
+    return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */