diff options
-rw-r--r-- | tools/flask/policy/policy/mls | 4 | ||||
-rw-r--r-- | tools/flask/policy/policy/modules/xen/xen.if | 6 | ||||
-rw-r--r-- | tools/flask/policy/policy/modules/xen/xen.te | 2 | ||||
-rw-r--r-- | xen/common/domain.c | 60 | ||||
-rw-r--r-- | xen/common/domctl.c | 17 | ||||
-rw-r--r-- | xen/common/keyhandler.c | 11 | ||||
-rw-r--r-- | xen/common/sched_credit.c | 48 | ||||
-rw-r--r-- | xen/common/schedule.c | 5 | ||||
-rw-r--r-- | xen/include/public/domctl.h | 13 | ||||
-rw-r--r-- | xen/include/xen/nodemask.h | 14 | ||||
-rw-r--r-- | xen/include/xen/sched-if.h | 2 | ||||
-rw-r--r-- | xen/include/xen/sched.h | 9 | ||||
-rw-r--r-- | xen/xsm/flask/hooks.c | 6 | ||||
-rw-r--r-- | xen/xsm/flask/policy/access_vectors | 6 |
14 files changed, 182 insertions, 21 deletions
diff --git a/tools/flask/policy/policy/mls b/tools/flask/policy/policy/mls index a3dde706b4..9290a76ae9 100644 --- a/tools/flask/policy/policy/mls +++ b/tools/flask/policy/policy/mls @@ -70,11 +70,11 @@ mlsconstrain domain transition (( h1 dom h2 ) and (( l1 eq l2 ) or (t1 == mls_priv))); # all the domain "read" ops -mlsconstrain domain { getvcpuaffinity getdomaininfo getvcpuinfo getvcpucontext getaddrsize getextvcpucontext } +mlsconstrain domain { getaffinity getdomaininfo getvcpuinfo getvcpucontext getaddrsize getextvcpucontext } ((l1 dom l2) or (t1 == mls_priv)); # all the domain "write" ops -mlsconstrain domain { setvcpucontext pause unpause resume create max_vcpus destroy setvcpuaffinity scheduler setdomainmaxmem setdomainhandle setdebugging hypercall settime set_target shutdown setaddrsize trigger setextvcpucontext } +mlsconstrain domain { setvcpucontext pause unpause resume create max_vcpus destroy setaffinity scheduler setdomainmaxmem setdomainhandle setdebugging hypercall settime set_target shutdown setaddrsize trigger setextvcpucontext } ((l1 eq l2) or (t1 == mls_priv)); # This is incomplete - similar constraints must be written for all classes diff --git a/tools/flask/policy/policy/modules/xen/xen.if b/tools/flask/policy/policy/modules/xen/xen.if index 2ce22125c1..18647c9327 100644 --- a/tools/flask/policy/policy/modules/xen/xen.if +++ b/tools/flask/policy/policy/modules/xen/xen.if @@ -48,7 +48,7 @@ define(`create_domain_common', ` allow $1 $2:domain { create max_vcpus setdomainmaxmem setaddrsize getdomaininfo hypercall setvcpucontext setextvcpucontext getscheduler getvcpuinfo getvcpuextstate getaddrsize - getvcpuaffinity setvcpuaffinity }; + getaffinity setaffinity }; allow $1 $2:domain2 { set_cpuid settsc setscheduler }; allow $1 $2:security check_context; allow $1 $2:shadow enable; @@ -77,9 +77,9 @@ define(`create_domain_build_label', ` # manage_domain(priv, target) # Allow managing a running domain define(`manage_domain', ` - allow $1 $2:domain { 
getdomaininfo getvcpuinfo getvcpuaffinity + allow $1 $2:domain { getdomaininfo getvcpuinfo getaffinity getaddrsize pause unpause trigger shutdown destroy - setvcpuaffinity setdomainmaxmem getscheduler }; + setaffinity setdomainmaxmem getscheduler }; ') # migrate_domain_out(priv, target) diff --git a/tools/flask/policy/policy/modules/xen/xen.te b/tools/flask/policy/policy/modules/xen/xen.te index 454e27ec71..c89ce28765 100644 --- a/tools/flask/policy/policy/modules/xen/xen.te +++ b/tools/flask/policy/policy/modules/xen/xen.te @@ -69,7 +69,7 @@ allow dom0_t xen_t:mmu memorymap; # Allow dom0 to use these domctls on itself. For domctls acting on other # domains, see the definitions of create_domain and manage_domain. allow dom0_t dom0_t:domain { - setvcpucontext max_vcpus setvcpuaffinity getvcpuaffinity getscheduler + setvcpucontext max_vcpus setaffinity getaffinity getscheduler getdomaininfo getvcpuinfo getvcpucontext setdomainmaxmem setdomainhandle setdebugging hypercall settime setaddrsize getaddrsize trigger getextvcpucontext setextvcpucontext getvcpuextstate setvcpuextstate diff --git a/xen/common/domain.c b/xen/common/domain.c index 590548e101..ce45d66b45 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -224,6 +224,7 @@ struct domain *domain_create( spin_lock_init(&d->node_affinity_lock); d->node_affinity = NODE_MASK_ALL; + d->auto_node_affinity = 1; spin_lock_init(&d->shutdown_lock); d->shutdown_code = -1; @@ -364,11 +365,32 @@ void domain_update_node_affinity(struct domain *d) cpumask_or(cpumask, cpumask, online_affinity); } - for_each_online_node ( node ) - if ( cpumask_intersects(&node_to_cpumask(node), cpumask) ) - node_set(node, nodemask); + if ( d->auto_node_affinity ) + { + /* Node-affinity is automatically computed from all vcpu-affinities */ + for_each_online_node ( node ) + if ( cpumask_intersects(&node_to_cpumask(node), cpumask) ) + node_set(node, nodemask); + + d->node_affinity = nodemask; + } + else + { + /* Node-affinity is provided by 
someone else, just filter out cpus + * that are either offline or not in the affinity of any vcpus. */ + nodemask = d->node_affinity; + for_each_node_mask ( node, d->node_affinity ) + if ( !cpumask_intersects(&node_to_cpumask(node), cpumask) ) + node_clear(node, nodemask); + + /* Avoid losing track of node-affinity because a bad + * vcpu-affinity has been specified. */ + if ( !nodes_empty(nodemask) ) + d->node_affinity = nodemask; + } + + sched_set_node_affinity(d, &d->node_affinity); - d->node_affinity = nodemask; spin_unlock(&d->node_affinity_lock); free_cpumask_var(online_affinity); @@ -376,6 +398,36 @@ void domain_update_node_affinity(struct domain *d) } +int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity) +{ + /* Being affine with no nodes is just wrong */ + if ( nodes_empty(*affinity) ) + return -EINVAL; + + spin_lock(&d->node_affinity_lock); + + /* + * Being/becoming explicitly affine to all nodes is not particularly + * useful. Let's take it as the `reset node affinity` command. 
+ */ + if ( nodes_full(*affinity) ) + { + d->auto_node_affinity = 1; + goto out; + } + + d->auto_node_affinity = 0; + d->node_affinity = *affinity; + +out: + spin_unlock(&d->node_affinity_lock); + + domain_update_node_affinity(d); + + return 0; +} + + struct domain *get_domain_by_id(domid_t dom) { struct domain *d; diff --git a/xen/common/domctl.c b/xen/common/domctl.c index 37532c25c6..6bd8efdf06 100644 --- a/xen/common/domctl.c +++ b/xen/common/domctl.c @@ -560,6 +560,23 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) } break; + case XEN_DOMCTL_setnodeaffinity: + { + nodemask_t new_affinity; + + ret = xenctl_bitmap_to_nodemask(&new_affinity, + &op->u.nodeaffinity.nodemap); + if ( !ret ) + ret = domain_set_node_affinity(d, &new_affinity); + } + break; + case XEN_DOMCTL_getnodeaffinity: + { + ret = nodemask_to_xenctl_bitmap(&op->u.nodeaffinity.nodemap, + &d->node_affinity); + } + break; + case XEN_DOMCTL_setvcpuaffinity: case XEN_DOMCTL_getvcpuaffinity: { diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c index e9ef45f462..507213360c 100644 --- a/xen/common/keyhandler.c +++ b/xen/common/keyhandler.c @@ -218,6 +218,14 @@ static void cpuset_print(char *set, int size, const cpumask_t *mask) *set++ = '\0'; } +static void nodeset_print(char *set, int size, const nodemask_t *mask) +{ + *set++ = '['; + set += nodelist_scnprintf(set, size-2, mask); + *set++ = ']'; + *set++ = '\0'; +} + static void periodic_timer_print(char *str, int size, uint64_t period) { if ( period == 0 ) @@ -273,6 +281,9 @@ static void dump_domains(unsigned char key) dump_pageframe_info(d); + nodeset_print(tmpstr, sizeof(tmpstr), &d->node_affinity); + printk("NODE affinity for domain %d: %s\n", d->domain_id, tmpstr); + printk("VCPU information and callbacks for domain %u:\n", d->domain_id); for_each_vcpu ( d, v ) diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c index be0b4b0c6c..318831054d 100644 --- a/xen/common/sched_credit.c +++ 
b/xen/common/sched_credit.c @@ -261,17 +261,50 @@ __runq_remove(struct csched_vcpu *svc) list_del_init(&svc->runq_elem); } +/* + * Translates node-affinity mask into a cpumask, so that we can use it during + * actual scheduling. That of course will contain all the cpus from all the + * set nodes in the original node-affinity mask. + * + * Note that any serialization needed to access mask safely is complete + * responsibility of the caller of this function/hook. + */ +static void csched_set_node_affinity( + const struct scheduler *ops, + struct domain *d, + nodemask_t *mask) +{ + struct csched_dom *sdom; + int node; + + /* Skip idle domain since it doesn't even have a node_affinity_cpumask */ + if ( unlikely(is_idle_domain(d)) ) + return; + + sdom = CSCHED_DOM(d); + cpumask_clear(sdom->node_affinity_cpumask); + for_each_node_mask( node, *mask ) + cpumask_or(sdom->node_affinity_cpumask, sdom->node_affinity_cpumask, + &node_to_cpumask(node)); +} + #define for_each_csched_balance_step(step) \ for ( (step) = 0; (step) <= CSCHED_BALANCE_CPU_AFFINITY; (step)++ ) /* * vcpu-affinity balancing is always necessary and must never be skipped. - * OTOH, if a domain's node-affinity spans all the nodes, we can safely - * avoid dealing with node-affinity entirely. + * OTOH, if a domain's node-affinity is said to be automatically computed + * (or if it just spans all the nodes), we can safely avoid dealing with + * node-affinity entirely. Ah, node-affinity is also deemed meaningless + * in case it has empty intersection with the vcpu's vcpu-affinity, as it + * would mean trying to schedule it on _no_ pcpu! 
*/ -#define __vcpu_has_node_affinity(vc) \ - ( !cpumask_full(CSCHED_DOM(vc->domain)->node_affinity_cpumask) ) +#define __vcpu_has_node_affinity(vc) \ + ( !(cpumask_full(CSCHED_DOM(vc->domain)->node_affinity_cpumask) \ + || !cpumask_intersects(vc->cpu_affinity, \ + CSCHED_DOM(vc->domain)->node_affinity_cpumask) \ + || vc->domain->auto_node_affinity == 1) ) /* * Each csched-balance step uses its own cpumask. This function determines @@ -284,8 +317,13 @@ static void csched_balance_cpumask(const struct vcpu *vc, int step, cpumask_t *mask) { if ( step == CSCHED_BALANCE_NODE_AFFINITY ) + { cpumask_and(mask, CSCHED_DOM(vc->domain)->node_affinity_cpumask, vc->cpu_affinity); + + if ( unlikely(cpumask_empty(mask)) ) + cpumask_copy(mask, vc->cpu_affinity); + } else /* step == CSCHED_BALANCE_CPU_AFFINITY */ cpumask_copy(mask, vc->cpu_affinity); } @@ -1898,6 +1936,8 @@ const struct scheduler sched_credit_def = { .adjust = csched_dom_cntl, .adjust_global = csched_sys_cntl, + .set_node_affinity = csched_set_node_affinity, + .pick_cpu = csched_cpu_pick, .do_schedule = csched_schedule, diff --git a/xen/common/schedule.c b/xen/common/schedule.c index 7364ff8a7d..c1cd3d0f15 100644 --- a/xen/common/schedule.c +++ b/xen/common/schedule.c @@ -638,6 +638,11 @@ int cpu_disable_scheduler(unsigned int cpu) return ret; } +void sched_set_node_affinity(struct domain *d, nodemask_t *mask) +{ + SCHED_OP(DOM2OP(d), set_node_affinity, d, mask); +} + int vcpu_set_affinity(struct vcpu *v, const cpumask_t *affinity) { cpumask_t online_affinity; diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h index 9f5ed48a0e..4c5b2bbbbd 100644 --- a/xen/include/public/domctl.h +++ b/xen/include/public/domctl.h @@ -280,6 +280,16 @@ typedef struct xen_domctl_getvcpuinfo xen_domctl_getvcpuinfo_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvcpuinfo_t); +/* Get/set the NUMA node(s) with which the guest has affinity with. 
*/ +/* XEN_DOMCTL_setnodeaffinity */ +/* XEN_DOMCTL_getnodeaffinity */ +struct xen_domctl_nodeaffinity { + struct xenctl_bitmap nodemap;/* IN */ +}; +typedef struct xen_domctl_nodeaffinity xen_domctl_nodeaffinity_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_nodeaffinity_t); + + /* Get/set which physical cpus a vcpu can execute on. */ /* XEN_DOMCTL_setvcpuaffinity */ /* XEN_DOMCTL_getvcpuaffinity */ @@ -908,6 +918,8 @@ struct xen_domctl { #define XEN_DOMCTL_audit_p2m 65 #define XEN_DOMCTL_set_virq_handler 66 #define XEN_DOMCTL_set_broken_page_p2m 67 +#define XEN_DOMCTL_setnodeaffinity 68 +#define XEN_DOMCTL_getnodeaffinity 69 #define XEN_DOMCTL_gdbsx_guestmemio 1000 #define XEN_DOMCTL_gdbsx_pausevcpu 1001 #define XEN_DOMCTL_gdbsx_unpausevcpu 1002 @@ -921,6 +933,7 @@ struct xen_domctl { struct xen_domctl_getpageframeinfo getpageframeinfo; struct xen_domctl_getpageframeinfo2 getpageframeinfo2; struct xen_domctl_getpageframeinfo3 getpageframeinfo3; + struct xen_domctl_nodeaffinity nodeaffinity; struct xen_domctl_vcpuaffinity vcpuaffinity; struct xen_domctl_shadow_op shadow_op; struct xen_domctl_max_mem max_mem; diff --git a/xen/include/xen/nodemask.h b/xen/include/xen/nodemask.h index 1b860165ce..2a90dc1aef 100644 --- a/xen/include/xen/nodemask.h +++ b/xen/include/xen/nodemask.h @@ -8,8 +8,9 @@ * See detailed comments in the file linux/bitmap.h describing the * data type on which these nodemasks are based. * - * For details of nodemask_scnprintf() and nodemask_parse(), - * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c. + * For details of nodemask_scnprintf(), nodelist_scnprintf() and + * nodemask_parse(), see bitmap_scnprintf() and bitmap_parse() + * in lib/bitmap.c. 
* * The available nodemask operations are: * @@ -50,6 +51,7 @@ * unsigned long *nodes_addr(mask) Array of unsigned long's in mask * * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing + * int nodelist_scnprintf(buf, len, mask) Format nodemask as a list for printing * int nodemask_parse(ubuf, ulen, mask) Parse ascii string as nodemask * * for_each_node_mask(node, mask) for-loop node over mask @@ -292,6 +294,14 @@ static inline int __cycle_node(int n, const nodemask_t *maskp, int nbits) #define nodes_addr(src) ((src).bits) +#define nodelist_scnprintf(buf, len, src) \ + __nodelist_scnprintf((buf), (len), (src), MAX_NUMNODES) +static inline int __nodelist_scnprintf(char *buf, int len, + const nodemask_t *srcp, int nbits) +{ + return bitmap_scnlistprintf(buf, len, srcp->bits, nbits); +} + #if 0 #define nodemask_scnprintf(buf, len, src) \ __nodemask_scnprintf((buf), (len), &(src), MAX_NUMNODES) diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h index 9ace22cb66..2023ea393b 100644 --- a/xen/include/xen/sched-if.h +++ b/xen/include/xen/sched-if.h @@ -184,6 +184,8 @@ struct scheduler { struct xen_domctl_scheduler_op *); int (*adjust_global) (const struct scheduler *, struct xen_sysctl_scheduler_op *); + void (*set_node_affinity) (const struct scheduler *, + struct domain *, nodemask_t *); void (*dump_settings) (const struct scheduler *); void (*dump_cpu_state) (const struct scheduler *, int); diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h index d15d567c9b..ad971d22fe 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -288,6 +288,8 @@ struct domain /* Does this guest need iommu mappings? */ bool_t need_iommu; #endif + /* is node-affinity automatically computed? */ + bool_t auto_node_affinity; /* Is this guest fully privileged (aka dom0)? 
*/ bool_t is_privileged; /* Which guest this guest has privileges on */ @@ -365,7 +367,10 @@ struct domain /* Various mem_events */ struct mem_event_per_domain *mem_event; - /* Currently computed from union of all vcpu cpu-affinity masks. */ + /* + * Can be specified by the user. If that is not the case, it is + * computed from the union of all the vcpu cpu-affinity masks. + */ nodemask_t node_affinity; unsigned int last_alloc_node; spinlock_t node_affinity_lock; @@ -435,6 +440,7 @@ static inline void get_knownalive_domain(struct domain *d) ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED)); } +int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity); void domain_update_node_affinity(struct domain *d); struct domain *domain_create( @@ -555,6 +561,7 @@ void sched_destroy_domain(struct domain *d); int sched_move_domain(struct domain *d, struct cpupool *c); long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *); long sched_adjust_global(struct xen_sysctl_scheduler_op *); +void sched_set_node_affinity(struct domain *, nodemask_t *); int sched_id(void); void sched_tick_suspend(void); void sched_tick_resume(void); diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c index 29a78dd06e..247c8a393b 100644 --- a/xen/xsm/flask/hooks.c +++ b/xen/xsm/flask/hooks.c @@ -611,10 +611,12 @@ static int flask_domctl(struct domain *d, int cmd) return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__UNPAUSE); case XEN_DOMCTL_setvcpuaffinity: - return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETVCPUAFFINITY); + case XEN_DOMCTL_setnodeaffinity: + return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETAFFINITY); case XEN_DOMCTL_getvcpuaffinity: - return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETVCPUAFFINITY); + case XEN_DOMCTL_getnodeaffinity: + return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETAFFINITY); case XEN_DOMCTL_resumedomain: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__RESUME); diff --git 
a/xen/xsm/flask/policy/access_vectors b/xen/xsm/flask/policy/access_vectors index 36cbacfa13..fdfc50245a 100644 --- a/xen/xsm/flask/policy/access_vectors +++ b/xen/xsm/flask/policy/access_vectors @@ -104,9 +104,11 @@ class domain # XEN_DOMCTL_destroydomain destroy # XEN_DOMCTL_setvcpuaffinity - setvcpuaffinity +# XEN_DOMCTL_setnodeaffinity + setaffinity # XEN_DOMCTL_getvcpuaffinity - getvcpuaffinity +# XEN_DOMCTL_getnodeaffinity + getaffinity # XEN_DOMCTL_scheduler_op with XEN_DOMCTL_SCHEDOP_getinfo getscheduler # XEN_DOMCTL_getdomaininfo, XEN_SYSCTL_getdomaininfolist |