/*
 * Copyright (C) 2012      Citrix Ltd.
 * Author Dario Faggioli <dario.faggioli@citrix.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation; version 2.1 only. with the special
 * exception on linking described in file LICENSE.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 */

#include "libxl_osdeps.h" /* must come before any other headers */

#include <glob.h>

#include "libxl_internal.h"

/*
 * What follows are helpers for generating all the k-combinations
 * without repetitions of a set S with n elements in it. Formally
 * speaking, they are subsets of k distinct elements of S and, if
 * S has n elements, the number of k-combinations is equal to the
 * binomial coefficient C(n k)=n!/(k! * (n - k)!).
 *
 * The various subsets are generated one after the other by calling
 * comb_init() first, and, after that, comb_next()
 * C(n k)-1 times. An iterator is used to store the current status
 * of the whole generation operation (i.e., basically, the last
 * combination that has been generated). As soon as all combinations
 * have been generated, comb_next() will start returning 0 instead of
 * 1. The same instance of the iterator and the same values for
 * n and k _must_ be used for each call (if that doesn't happen, the
 * result is unspecified).
 *
 * The algorithm is a well known one (see, for example, D. Knuth's "The
 * Art of Computer Programming - Volume 4, Fascicle 3") and it produces
 * the combinations in such a way that they (well, more precisely, their
 * indexes in the array/map representing the set) come in lexicographic
 * ordering.
 *
 * For example, with n = 5 and k = 3, calling comb_init()
 * will generate { 0, 1, 2 }, while subsequent valid calls to
 * comb_next() will produce the following:
 * { { 0, 1, 3 }, { 0, 1, 4 },
 *   { 0, 2, 3 }, { 0, 2, 4 }, { 0, 3, 4 },
 *   { 1, 2, 3 }, { 1, 2, 4 }, { 1, 3, 4 },
 *   { 2, 3, 4 } }
 *
 * This is used by the automatic NUMA placement logic below.
 */
typedef int* comb_iter_t;

static int comb_init(libxl__gc *gc, comb_iter_t *it, int n, int k)
{
    comb_iter_t new_iter;
    int i;

    if (n < k)
        return 0;

    /* First set is always { 0, 1, 2, ..., k-1 } */
    GCNEW_ARRAY(new_iter, k);
    for (i = 0; i < k; i++)
        new_iter[i] = i;

    *it = new_iter;
    return 1;
}
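/*
 * For reference, a minimal sketch of how the two helpers are meant to
 * be driven (assuming gc, n and k are already at hand); it is the same
 * init-once/next-until-0 pattern that libxl__get_numa_candidate()
 * below actually uses:
 *
 *     comb_iter_t it;
 *     int ok;
 *
 *     for (ok = comb_init(gc, &it, n, k); ok;
 *          ok = comb_next(it, n, k)) {
 *         // it[0] ... it[k-1] hold the current combination, and
 *         // successive iterations see them in lexicographic order
 *     }
 */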
static int comb_next(comb_iter_t it, int n, int k)
{
    int i;

    /*
     * The idea here is to find the leftmost element from where
     * we should start incrementing the indexes of the iterator.
     * This means looking for the highest index that can still be
     * increased, i.e., whose value is below its upper bound n-k+i
     * (which, for the last position, is n-1). In the example above,
     * when dealing with { 0, 1, 4 }, such an element is the
     * second one, as the third is already equal to 4 (which actually
     * is n-1).
     * Once we have found where to start, we increment that element
     * and override the rest of the iterator to its right with its
     * successors, thus achieving lexicographic ordering.
     *
     * Regarding the termination of the generation process, when we
     * manage to bring n-k to the very first position of the iterator,
     * we know that is the last valid combination ({ 2, 3, 4 }, with
     * n - k = 5 - 3 = 2, in the example above), and thus we start
     * returning 0 as soon as we cross that border.
     */
    for (i = k - 1; it[i] == n - k + i; i--) {
        if (i <= 0)
            return 0;
    }
    for (it[i]++, i++; i < k; i++)
        it[i] = it[i - 1] + 1;
    return 1;
}

/* NUMA automatic placement (see libxl_internal.h for details) */

/*
 * This function turns a k-combination iterator into a node map,
 * given another map telling us which nodes should be considered.
 *
 * This means the bits that are set in suitable_nodemap and that
 * correspond to the indexes of the given combination are the ones
 * that will be set in nodemap.
 *
 * For example, given a fully set suitable_nodemap, if the iterator
 * represents the combination { 0, 2, 4 }, nodemap will have bits #0,
 * #2 and #4 set.
 * On the other hand, if, say, suitable_nodemap=01011011, the same
 * iterator will cause bits #1, #4 and #7 of nodemap to be set.
 */
static void comb_get_nodemap(comb_iter_t it, libxl_bitmap *suitable_nodemap,
                             libxl_bitmap *nodemap, int k)
{
    int i, m = 0, n = 0;

    libxl_bitmap_set_none(nodemap);
    libxl_for_each_set_bit(i, *suitable_nodemap) {
        /* Check whether the n-th set bit of suitable_nodemap
         * matches with the m-th element of the iterator (and,
         * only if it does, advance to the next one) */
        if (m < k && n == it[m]) {
            libxl_bitmap_set(nodemap, i);
            m++;
        }
        n++;
    }
}

/* Retrieve the number of cpus that the nodes in the nodemap span,
 * counting only the ones that are also set in suitable_cpumap. */
static int nodemap_to_nr_cpus(libxl_cputopology *tinfo, int nr_cpus,
                              const libxl_bitmap *suitable_cpumap,
                              const libxl_bitmap *nodemap)
{
    int i, nodes_cpus = 0;

    for (i = 0; i < nr_cpus; i++) {
        if (libxl_bitmap_test(suitable_cpumap, i) &&
            libxl_bitmap_test(nodemap, tinfo[i].node))
            nodes_cpus++;
    }
    return nodes_cpus;
}

/* Retrieve the amount of free memory within the nodemap */
static uint32_t nodemap_to_free_memkb(libxl_numainfo *ninfo,
                                      libxl_bitmap *nodemap)
{
    uint32_t free_memkb = 0;
    int i;

    libxl_for_each_set_bit(i, *nodemap)
        free_memkb += ninfo[i].free / 1024;

    return free_memkb;
}

/* Retrieve the number of vcpus able to run on the nodes in nodemap */
static int nodemap_to_nr_vcpus(libxl__gc *gc, int vcpus_on_node[],
                               const libxl_bitmap *nodemap)
{
    int i, nr_vcpus = 0;

    libxl_for_each_set_bit(i, *nodemap)
        nr_vcpus += vcpus_on_node[i];

    return nr_vcpus;
}
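/*
 * The three nodemap_to_*() helpers above are what the placement loop
 * below uses to score one candidate. Schematically (with the variable
 * names as they appear in libxl__get_numa_candidate()):
 *
 *     comb_get_nodemap(comb_iter, &suitable_nodemap, &nodemap, min_nodes);
 *     nodes_free_memkb = nodemap_to_free_memkb(ninfo, &nodemap);
 *     nodes_cpus = nodemap_to_nr_cpus(tinfo, nr_cpus, suitable_cpumap,
 *                                     &nodemap);
 *     nr_vcpus = nodemap_to_nr_vcpus(gc, vcpus_on_node, &nodemap);
 *
 * i.e., each combination is first expanded into a nodemap and then
 * characterized in terms of free memory, cpus and competing vcpus.
 */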
/* Number of vcpus able to run on the cpus of the various nodes
 * (reported by filling the array vcpus_on_node[]). */
static int nr_vcpus_on_nodes(libxl__gc *gc, libxl_cputopology *tinfo,
                             const libxl_bitmap *suitable_cpumap,
                             int vcpus_on_node[])
{
    libxl_dominfo *dinfo = NULL;
    libxl_bitmap dom_nodemap, nodes_counted;
    int nr_doms, nr_cpus;
    int i, j, k;

    dinfo = libxl_list_domain(CTX, &nr_doms);
    if (dinfo == NULL)
        return ERROR_FAIL;

    if (libxl_node_bitmap_alloc(CTX, &nodes_counted, 0) < 0) {
        libxl_dominfo_list_free(dinfo, nr_doms);
        return ERROR_FAIL;
    }

    if (libxl_node_bitmap_alloc(CTX, &dom_nodemap, 0) < 0) {
        libxl_bitmap_dispose(&nodes_counted);
        libxl_dominfo_list_free(dinfo, nr_doms);
        return ERROR_FAIL;
    }

    for (i = 0; i < nr_doms; i++) {
        libxl_vcpuinfo *vinfo;
        int nr_dom_vcpus;

        vinfo = libxl_list_vcpu(CTX, dinfo[i].domid, &nr_dom_vcpus, &nr_cpus);
        if (vinfo == NULL)
            continue;

        /* Retrieve the domain's node-affinity map */
        libxl_domain_get_nodeaffinity(CTX, dinfo[i].domid, &dom_nodemap);

        for (j = 0; j < nr_dom_vcpus; j++) {
            /*
             * For each vcpu of each domain, it must have both vcpu-affinity
             * and node-affinity to (a pcpu belonging to) a certain node to
             * cause an increment in the corresponding element of the array
             * (and nodes_counted makes sure each vcpu increments each
             * node's element at most once).
             */
            libxl_bitmap_set_none(&nodes_counted);
            libxl_for_each_set_bit(k, vinfo[j].cpumap) {
                int node = tinfo[k].node;

                if (libxl_bitmap_test(suitable_cpumap, k) &&
                    libxl_bitmap_test(&dom_nodemap, node) &&
                    !libxl_bitmap_test(&nodes_counted, node)) {
                    libxl_bitmap_set(&nodes_counted, node);
                    vcpus_on_node[node]++;
                }
            }
        }

        libxl_vcpuinfo_list_free(vinfo, nr_dom_vcpus);
    }

    libxl_bitmap_dispose(&dom_nodemap);
    libxl_bitmap_dispose(&nodes_counted);
    libxl_dominfo_list_free(dinfo, nr_doms);
    return 0;
}

/*
 * This function tries to figure out if the host has a consistent number
 * of cpus along all its NUMA nodes. In fact, if that is the case, we can
 * calculate the minimum number of nodes needed for a domain by just
 * dividing its total number of vcpus by this value computed here.
 * However, we are not allowed to assume that all the nodes have the
 * same number of cpus. Therefore, in case discrepancies among different
 * nodes are found, this function just returns 0, for the caller to know
 * it shouldn't rely on this 'optimization', and sort things out in some
 * other way (by doing something basic, like starting by trying
 * candidates with just one node).
 */
static int count_cpus_per_node(libxl_cputopology *tinfo, int nr_cpus,
                               int nr_nodes)
{
    int cpus_per_node = 0;
    int j, i;

    /* This makes sense iff # of PCPUs is the same for all nodes */
    for (j = 0; j < nr_nodes; j++) {
        int curr_cpus = 0;

        for (i = 0; i < nr_cpus; i++) {
            if (tinfo[i].node == j)
                curr_cpus++;
        }
        /* So, if the above does not hold, turn the whole thing off! */
        cpus_per_node = cpus_per_node == 0 ? curr_cpus : cpus_per_node;
        if (cpus_per_node != curr_cpus)
            return 0;
    }
    return cpus_per_node;
}
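/*
 * To make the use of count_cpus_per_node() concrete: on a host with,
 * say, 4 nodes of 8 pcpus each (figures picked purely for illustration),
 * it returns 8, so a domain asking for min_cpus = 20 gets
 * min_nodes = (20 + 8 - 1) / 8 = 3, i.e., ceil(20/8), which is exactly
 * the rounding-up division performed by the caller below. If the nodes
 * had, say, 8 and 6 pcpus, it would return 0 and the caller would fall
 * back to starting from single node candidates.
 */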
/*
 * Looks for placement candidates that satisfy some specific
 * conditions and returns the best one according to the provided
 * comparison function.
 */
int libxl__get_numa_candidate(libxl__gc *gc,
                              uint32_t min_free_memkb, int min_cpus,
                              int min_nodes, int max_nodes,
                              const libxl_bitmap *suitable_cpumap,
                              libxl__numa_candidate_cmpf numa_cmpf,
                              libxl__numa_candidate *cndt_out,
                              int *cndt_found)
{
    libxl__numa_candidate new_cndt;
    libxl_cputopology *tinfo = NULL;
    libxl_numainfo *ninfo = NULL;
    int nr_nodes = 0, nr_suit_nodes, nr_cpus = 0;
    libxl_bitmap suitable_nodemap, nodemap;
    int *vcpus_on_node, rc = 0;

    libxl_bitmap_init(&nodemap);
    libxl_bitmap_init(&suitable_nodemap);
    libxl__numa_candidate_init(&new_cndt);

    /* Get platform info and prepare the map for testing the combinations */
    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
    if (ninfo == NULL)
        return ERROR_FAIL;

    GCNEW_ARRAY(vcpus_on_node, nr_nodes);

    /*
     * The good thing about this solution is that it is based on heuristics
     * (implemented in numa_cmpf() ), but we at least can evaluate it on
     * all the possible placement candidates. That can happen because the
     * number of nodes present in current NUMA systems is quite small.
     * In fact, even if a sum of binomials is involved, if the system has
     * up to 16 nodes it "only" takes 65535 steps. This is fine, as the
     * number of nodes the biggest NUMA systems provide at the time of this
     * writing is 8 (and it will probably continue to be so for a while).
     * However, computational complexity would explode on systems bigger
     * than that, and it's really important we avoid trying to run this
     * on monsters with 32, 64 or more nodes (if they ever pop into being).
     * Therefore, here comes a safety catch that disables the algorithm
     * for the cases when it wouldn't work well.
     */
    if (nr_nodes > 16) {
        /* Log that we did nothing and return 0, as no real error occurred */
        LOG(WARN, "System has %d NUMA nodes, which is too big for the "
                  "placement algorithm to work effectively: skipping it. "
                  "Consider manually pinning the vCPUs and/or looking at "
                  "cpupools for manually partitioning the system.",
                  nr_nodes);
        *cndt_found = 0;
        goto out;
    }

    tinfo = libxl_get_cpu_topology(CTX, &nr_cpus);
    if (tinfo == NULL) {
        rc = ERROR_FAIL;
        goto out;
    }

    rc = libxl_node_bitmap_alloc(CTX, &nodemap, 0);
    if (rc)
        goto out;
    rc = libxl__numa_candidate_alloc(gc, &new_cndt);
    if (rc)
        goto out;

    /* Allocate and prepare the map of the nodes that can be utilized for
     * placement, based on the map of suitable cpus. */
    rc = libxl_node_bitmap_alloc(CTX, &suitable_nodemap, 0);
    if (rc)
        goto out;
    rc = libxl_cpumap_to_nodemap(CTX, suitable_cpumap, &suitable_nodemap);
    if (rc)
        goto out;

    /*
     * Later on, we will try to figure out how many vcpus are runnable on
     * each candidate (as a part of choosing the best one of them). That
     * requires going through all the vcpus of all the domains and checking
     * their affinities. So, instead of doing that for each candidate,
     * let's count here the number of vcpus runnable on each node, so that
     * all we have to do later is sum up the right elements of the
     * vcpus_on_node array.
     */
    rc = nr_vcpus_on_nodes(gc, tinfo, suitable_cpumap, vcpus_on_node);
    if (rc)
        goto out;

    /*
     * If the minimum number of NUMA nodes is not explicitly specified
     * (i.e., min_nodes == 0), we try to figure out a sensible number of nodes
     * from where to start generating candidates, if possible (or just start
     * from 1 otherwise). The maximum number of nodes should not exceed the
     * number of existing NUMA nodes on the host, or the candidate generation
     * won't work properly.
     */
    if (!min_nodes) {
        int cpus_per_node;

        cpus_per_node = count_cpus_per_node(tinfo, nr_cpus, nr_nodes);
        if (cpus_per_node == 0)
            min_nodes = 1;
        else
            min_nodes = (min_cpus + cpus_per_node - 1) / cpus_per_node;
    }
    /* We also need to be sure we do not exceed the number of
     * nodes we are allowed to use. */
    nr_suit_nodes = libxl_bitmap_count_set(&suitable_nodemap);

    if (min_nodes > nr_suit_nodes)
        min_nodes = nr_suit_nodes;
    if (!max_nodes || max_nodes > nr_suit_nodes)
        max_nodes = nr_suit_nodes;
    if (min_nodes > max_nodes) {
        LOG(ERROR, "Inconsistent minimum or maximum number of guest nodes");
        rc = ERROR_INVAL;
        goto out;
    }

    /* It is up to the caller to dispose of this */
    rc = libxl__numa_candidate_alloc(gc, cndt_out);
    if (rc)
        goto out;

    /*
     * Consider all the combinations with sizes in [min_nodes, max_nodes]
     * (see comb_init() and comb_next()). Note that, since the fewer the
     * number of nodes the better, it is guaranteed that any candidate
     * found during the i-th step will be better than any other one we
     * could find during the (i+1)-th and all the subsequent steps (they
     * all will have more nodes). It's thus pointless to keep going if
     * we already found something.
     */
    *cndt_found = 0;
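    /*
     * In terms of cost, the search below is the sum of binomials mentioned
     * earlier: in the worst case (min_nodes = 1, max_nodes = n) it visits
     * C(n,1) + C(n,2) + ... + C(n,n) = 2^n - 1 combinations, which is
     * where the 65535 (= 2^16 - 1) figure for a 16 node system comes from.
     */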
    while (min_nodes <= max_nodes && *cndt_found == 0) {
        comb_iter_t comb_iter;
        int comb_ok;

        /*
         * And here it is. Each step of this cycle generates a combination of
         * nodes as big as min_nodes mandates. Each of these combinations is
         * checked against the constraints provided by the caller (namely,
         * amount of free memory and number of cpus) and it is eligible to
         * become our best placement iff it passes the check.
         */
        for (comb_ok = comb_init(gc, &comb_iter, nr_suit_nodes, min_nodes);
             comb_ok;
             comb_ok = comb_next(comb_iter, nr_suit_nodes, min_nodes)) {
            uint32_t nodes_free_memkb;
            int nodes_cpus;

            /* Get the nodemap for the combination, only considering
             * suitable nodes. */
            comb_get_nodemap(comb_iter, &suitable_nodemap,
                             &nodemap, min_nodes);

            /* If there is not enough memory in this combination, skip it
             * and go generating the next one... */
            nodes_free_memkb = nodemap_to_free_memkb(ninfo, &nodemap);
            if (min_free_memkb && nodes_free_memkb < min_free_memkb)
                continue;

            /* And the same applies if this combination is short in cpus */
            nodes_cpus = nodemap_to_nr_cpus(tinfo, nr_cpus, suitable_cpumap,
                                            &nodemap);
            if (min_cpus && nodes_cpus < min_cpus)
                continue;

            /*
             * Conditions are met, we can compare this candidate with the
             * current best one (if any).
             */
            libxl__numa_candidate_put_nodemap(gc, &new_cndt, &nodemap);
            new_cndt.nr_vcpus = nodemap_to_nr_vcpus(gc, vcpus_on_node,
                                                    &nodemap);
            new_cndt.free_memkb = nodes_free_memkb;
            new_cndt.nr_nodes = libxl_bitmap_count_set(&nodemap);
            new_cndt.nr_cpus = nodes_cpus;

            /*
             * Check if the new candidate is better than what we found up
             * to now, by means of the comparison function. If no comparison
             * function is provided, just return as soon as we find our first
             * candidate.
             */
            if (*cndt_found == 0 || numa_cmpf(&new_cndt, cndt_out) < 0) {
                *cndt_found = 1;

                LOG(DEBUG, "New best NUMA placement candidate found: "
                           "nr_nodes=%d, nr_cpus=%d, nr_vcpus=%d, "
                           "free_memkb=%"PRIu32"", new_cndt.nr_nodes,
                           new_cndt.nr_cpus, new_cndt.nr_vcpus,
                           new_cndt.free_memkb);

                libxl__numa_candidate_put_nodemap(gc, cndt_out, &nodemap);
                cndt_out->nr_vcpus = new_cndt.nr_vcpus;
                cndt_out->free_memkb = new_cndt.free_memkb;
                cndt_out->nr_nodes = new_cndt.nr_nodes;
                cndt_out->nr_cpus = new_cndt.nr_cpus;

                if (numa_cmpf == NULL)
                    break;
            }
        }
        min_nodes++;
    }

    if (*cndt_found == 0)
        LOG(NOTICE, "NUMA placement failed, performance might be affected");

 out:
    libxl_bitmap_dispose(&nodemap);
    libxl_bitmap_dispose(&suitable_nodemap);
    libxl__numa_candidate_dispose(&new_cndt);
    libxl_numainfo_list_free(ninfo, nr_nodes);
    libxl_cputopology_list_free(tinfo, nr_cpus);
    return rc;
}

/*
 * Local variables:
 * mode: C
 * c-basic-offset: 4
 * indent-tabs-mode: nil
 * End:
 */