diff options
Diffstat (limited to 'tools/libxl/libxl_dom.c')
-rw-r--r-- | tools/libxl/libxl_dom.c | 112 |
1 files changed, 112 insertions, 0 deletions
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c index bd4c9b448d..e13fb49f7f 100644 --- a/tools/libxl/libxl_dom.c +++ b/tools/libxl/libxl_dom.c @@ -98,6 +98,94 @@ out: return sched; } +/* + * Two NUMA placement candidates are compared by means of the following + * heuristics: + + * - the number of vcpus runnable on the candidates is considered, and + * candidates with fewer of them are preferred. If two candidate have + * the same number of runnable vcpus, + * - the amount of free memory in the candidates is considered, and the + * candidate with greater amount of it is preferred. + * + * In fact, leaving larger memory holes, maximizes the probability of being + * able to put other domains on the node. That hopefully means many domains + * will benefit from local memory accesses, but also introduces the risk of + * overloading large (from a memory POV) nodes. That's right the effect + * that counting the vcpus able to run on the nodes tries to prevent. + * + * Note that this completely ignore the number of nodes each candidate span, + * as the fact that fewer nodes is better is already accounted for in the + * algorithm. + */ +static int numa_cmpf(const libxl__numa_candidate *c1, + const libxl__numa_candidate *c2) +{ + if (c1->nr_vcpus != c2->nr_vcpus) + return c1->nr_vcpus - c2->nr_vcpus; + + return c2->free_memkb - c1->free_memkb; +} + +/* The actual automatic NUMA placement routine */ +static int numa_place_domain(libxl__gc *gc, libxl_domain_build_info *info) +{ + int found; + libxl__numa_candidate candidate; + libxl_bitmap candidate_nodemap; + libxl_cpupoolinfo *pinfo; + int nr_pools, rc = 0; + uint32_t memkb; + + libxl__numa_candidate_init(&candidate); + libxl_bitmap_init(&candidate_nodemap); + + /* First of all, if cpupools are in use, better not to mess with them */ + pinfo = libxl_list_cpupool(CTX, &nr_pools); + if (!pinfo) + return ERROR_FAIL; + if (nr_pools > 1) { + LOG(NOTICE, "Skipping NUMA placement as cpupools are in use"); + goto out; + } + + rc = libxl_domain_need_memory(CTX, info, &memkb); + if (rc) + goto out; + if (libxl_node_bitmap_alloc(CTX, &candidate_nodemap, 0)) { + rc = ERROR_FAIL; + goto out; + } + + /* Find the best candidate with enough free memory and at least + * as much pcpus as the domain has vcpus. */ + rc = libxl__get_numa_candidate(gc, memkb, info->max_vcpus, 0, 0, + numa_cmpf, &candidate, &found); + if (rc) + goto out; + + /* Not even a suitable placement candidate! Let's just don't touch the + * domain's info->cpumap. It will have affinity with all nodes/cpus. */ + if (found == 0) + goto out; + + /* Map the candidate's node map to the domain's info->cpumap */ + libxl__numa_candidate_get_nodemap(gc, &candidate, &candidate_nodemap); + rc = libxl_nodemap_to_cpumap(CTX, &candidate_nodemap, &info->cpumap); + if (rc) + goto out; + + LOG(DETAIL, "NUMA placement candidate with %d nodes, %d cpus and " + "%"PRIu32" KB free selected", candidate.nr_nodes, + candidate.nr_cpus, candidate.free_memkb / 1024); + + out: + libxl__numa_candidate_dispose(&candidate); + libxl_bitmap_dispose(&candidate_nodemap); + libxl_cpupoolinfo_list_free(pinfo, nr_pools); + return rc; +} + int libxl__build_pre(libxl__gc *gc, uint32_t domid, libxl_domain_build_info *info, libxl__domain_build_state *state) { @@ -107,7 +195,31 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid, uint32_t rtc_timeoffset; xc_domain_max_vcpus(ctx->xch, domid, info->max_vcpus); + + /* + * Check if the domain has any CPU affinity. If not, try to build + * up one. In case numa_place_domain() find at least a suitable + * candidate, it will affect info->cpumap accordingly; if it + * does not, it just leaves it as it is. This means (unless + * some weird error manifests) the subsequent call to + * libxl_set_vcpuaffinity_all() will do the actual placement, + * whatever that turns out to be. + */ + if (libxl_defbool_val(info->numa_placement)) { + int rc; + + if (!libxl_bitmap_is_full(&info->cpumap)) { + LOG(ERROR, "Can run NUMA placement only if no vcpu " + "affinity is specified"); + return ERROR_INVAL; + } + + rc = numa_place_domain(gc, info); + if (rc) + return rc; + } libxl_set_vcpuaffinity_all(ctx, domid, info->max_vcpus, &info->cpumap); + xc_domain_setmaxmem(ctx->xch, domid, info->target_memkb + LIBXL_MAXMEM_CONSTANT); if (info->type == LIBXL_DOMAIN_TYPE_PV) xc_domain_set_memmap_limit(ctx->xch, domid, |