aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--xen/common/domain.c1
-rw-r--r--xen/common/domctl.c1
-rw-r--r--xen/common/memory.c33
-rw-r--r--xen/common/page_alloc.c116
-rw-r--r--xen/include/public/domctl.h3
-rw-r--r--xen/include/public/memory.h35
-rw-r--r--xen/include/xen/mm.h3
-rw-r--r--xen/include/xen/sched.h1
8 files changed, 188 insertions, 5 deletions
diff --git a/xen/common/domain.c b/xen/common/domain.c
index b360de1fdc..64ee29d62b 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -507,6 +507,7 @@ int domain_kill(struct domain *d)
evtchn_destroy(d);
gnttab_release_mappings(d);
tmem_destroy(d->tmem);
+ domain_set_outstanding_pages(d, 0);
d->tmem = NULL;
/* fallthrough */
case DOMDYING_dying:
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index b7f6619d27..c98e99c9e7 100644
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -154,6 +154,7 @@ void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info)
info->tot_pages = d->tot_pages;
info->max_pages = d->max_pages;
+ info->outstanding_pages = d->outstanding_pages;
info->shr_pages = atomic_read(&d->shr_pages);
info->paged_pages = atomic_read(&d->paged_pages);
info->shared_info_frame = mfn_to_gmfn(d, virt_to_mfn(d->shared_info));
diff --git a/xen/common/memory.c b/xen/common/memory.c
index 3cf9e6c92c..68501d1746 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -711,6 +711,39 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
break;
}
+ case XENMEM_claim_pages:
+ if ( !IS_PRIV(current->domain) )
+ return -EPERM;
+
+ if ( copy_from_guest(&reservation, arg, 1) )
+ return -EFAULT;
+
+ if ( !guest_handle_is_null(reservation.extent_start) )
+ return -EINVAL;
+
+ if ( reservation.extent_order != 0 )
+ return -EINVAL;
+
+ if ( reservation.mem_flags != 0 )
+ return -EINVAL;
+
+ d = rcu_lock_domain_by_id(reservation.domid);
+ if ( d == NULL )
+ return -EINVAL;
+
+ rc = domain_set_outstanding_pages(d, reservation.nr_extents);
+
+ rcu_unlock_domain(d);
+
+ break;
+
+ case XENMEM_get_outstanding_pages:
+ if ( !IS_PRIV(current->domain) )
+ return -EPERM;
+
+ rc = get_outstanding_claims();
+ break;
+
default:
rc = arch_memory_op(op, arg);
break;
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 9e9fb159ca..aefef29790 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -252,11 +252,114 @@ static long midsize_alloc_zone_pages;
#define MIDSIZE_ALLOC_FRAC 128
static DEFINE_SPINLOCK(heap_lock);
+static long outstanding_claims; /* total outstanding claims by all domains */
unsigned long domain_adjust_tot_pages(struct domain *d, long pages)
{
+ long dom_before, dom_after, dom_claimed, sys_before, sys_after;
+
ASSERT(spin_is_locked(&d->page_alloc_lock));
- return d->tot_pages += pages;
+ d->tot_pages += pages;
+
+ /*
+ * can test d->claimed_pages race-free because it can only change
+ * if d->page_alloc_lock and heap_lock are both held, see also
+ * domain_set_outstanding_pages below
+ */
+ if ( !d->outstanding_pages )
+ goto out;
+
+ spin_lock(&heap_lock);
+ /* adjust domain outstanding pages; may not go negative */
+ dom_before = d->outstanding_pages;
+ dom_after = dom_before - pages;
+ BUG_ON(dom_before < 0);
+ dom_claimed = dom_after < 0 ? 0 : dom_after;
+ d->outstanding_pages = dom_claimed;
+ /* flag accounting bug if system outstanding_claims would go negative */
+ sys_before = outstanding_claims;
+ sys_after = sys_before - (dom_before - dom_claimed);
+ BUG_ON(sys_after < 0);
+ outstanding_claims = sys_after;
+ spin_unlock(&heap_lock);
+
+out:
+ return d->tot_pages;
+}
+
+int domain_set_outstanding_pages(struct domain *d, unsigned long pages)
+{
+ int ret = -ENOMEM;
+ unsigned long claim, avail_pages;
+
+ /*
+ * take the domain's page_alloc_lock, else all d->tot_page adjustments
+ * must always take the global heap_lock rather than only in the much
+ * rarer case that d->outstanding_pages is non-zero
+ */
+ spin_lock(&d->page_alloc_lock);
+ spin_lock(&heap_lock);
+
+ /* pages==0 means "unset" the claim. */
+ if ( pages == 0 )
+ {
+ outstanding_claims -= d->outstanding_pages;
+ d->outstanding_pages = 0;
+ ret = 0;
+ goto out;
+ }
+
+ /* only one active claim per domain please */
+ if ( d->outstanding_pages )
+ {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* disallow a claim not exceeding current tot_pages or above max_pages */
+ if ( (pages <= d->tot_pages) || (pages > d->max_pages) )
+ {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* how much memory is available? */
+ avail_pages = total_avail_pages;
+
+ /* Note: The usage of claim means that allocation from a guest *might*
+ * have to come from freeable memory. Using free memory is always better, if
+ * it is available, than using freeable memory.
+ *
+ * But that is OK as once the claim has been made, it still can take minutes
+ * before the claim is fully satisfied. Tmem can make use of the unclaimed
+ * pages during this time (to store ephemeral/freeable pages only,
+ * not persistent pages).
+ */
+ avail_pages += tmem_freeable_pages();
+ avail_pages -= outstanding_claims;
+
+ /*
+ * Note, if domain has already allocated memory before making a claim
+ * then the claim must take tot_pages into account
+ */
+ claim = pages - d->tot_pages;
+ if ( claim > avail_pages )
+ goto out;
+
+ /* yay, claim fits in available memory, stake the claim, success! */
+ d->outstanding_pages = claim;
+ outstanding_claims += d->outstanding_pages;
+ ret = 0;
+
+out:
+ spin_unlock(&heap_lock);
+ spin_unlock(&d->page_alloc_lock);
+ return ret;
+}
+
+long get_outstanding_claims(void)
+{
+ return outstanding_claims;
}
static unsigned long init_node_heap(int node, unsigned long mfn,
@@ -397,7 +500,7 @@ static void __init setup_low_mem_virq(void)
static void check_low_mem_virq(void)
{
unsigned long avail_pages = total_avail_pages +
- (opt_tmem ? tmem_freeable_pages() : 0);
+ (opt_tmem ? tmem_freeable_pages() : 0) - outstanding_claims;
if ( unlikely(avail_pages <= low_mem_virq_th) )
{
@@ -466,6 +569,15 @@ static struct page_info *alloc_heap_pages(
spin_lock(&heap_lock);
/*
+ * Claimed memory is considered unavailable unless the request
+ * is made by a domain with sufficient unclaimed pages.
+ */
+ if ( (outstanding_claims + request >
+ total_avail_pages + tmem_freeable_pages()) &&
+ (d == NULL || d->outstanding_pages < request) )
+ goto not_found;
+
+ /*
* TMEM: When available memory is scarce due to tmem absorbing it, allow
* only mid-size allocations to avoid worst of fragmentation issues.
* Others try tmem pools then fail. This is a workaround until all
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index deb19db6e1..113b8dc498 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -36,7 +36,7 @@
#include "grant_table.h"
#include "hvm/save.h"
-#define XEN_DOMCTL_INTERFACE_VERSION 0x00000008
+#define XEN_DOMCTL_INTERFACE_VERSION 0x00000009
/*
* NB. xen_domctl.domain is an IN/OUT parameter for this operation.
@@ -95,6 +95,7 @@ struct xen_domctl_getdomaininfo {
uint32_t flags; /* XEN_DOMINF_* */
uint64_aligned_t tot_pages;
uint64_aligned_t max_pages;
+ uint64_aligned_t outstanding_pages;
uint64_aligned_t shr_pages;
uint64_aligned_t paged_pages;
uint64_aligned_t shared_info_frame; /* GMFN of shared_info struct */
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index 1c5ca19da8..51d5254933 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -68,6 +68,8 @@ struct xen_memory_reservation {
* IN: GPFN bases of extents to populate with memory
* OUT: GMFN bases of extents that were allocated
* (NB. This command also updates the mach_to_phys translation table)
+ * XENMEM_claim_pages:
+ * IN: must be zero
*/
XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
@@ -430,10 +432,39 @@ typedef struct xen_mem_sharing_op xen_mem_sharing_op_t;
DEFINE_XEN_GUEST_HANDLE(xen_mem_sharing_op_t);
/*
- * Reserve ops for future/out-of-tree "claim" patches (Oracle)
+ * Attempt to stake a claim for a domain on a quantity of pages
+ * of system RAM, but _not_ assign specific pageframes. Only
+ * arithmetic is performed so the hypercall is very fast and need
+ * not be preemptible, thus sidestepping time-of-check-time-of-use
+ * races for memory allocation. Returns 0 if the hypervisor page
+ * allocator has atomically and successfully claimed the requested
+ * number of pages, else non-zero.
+ *
+ * Any domain may have only one active claim. When sufficient memory
+ * has been allocated to resolve the claim, the claim silently expires.
+ * Claiming zero pages effectively resets any outstanding claim and
+ * is always successful.
+ *
+ * Note that a valid claim may be staked even after memory has been
+ * allocated for a domain. In this case, the claim is not incremental,
+ * i.e. if the domain's tot_pages is 3, and a claim is staked for 10,
+ * only 7 additional pages are claimed.
+ *
+ * Caller must be privileged or the hypercall fails.
*/
#define XENMEM_claim_pages 24
-#define XENMEM_get_unclaimed_pages 25
+
+/*
+ * XENMEM_claim_pages flags - the are no flags at this time.
+ * The zero value is appropiate.
+ */
+
+/*
+ * Get the number of pages currently claimed (but not yet "possessed")
+ * across all domains. The caller must be privileged but otherwise
+ * the call never fails.
+ */
+#define XENMEM_get_outstanding_pages 25
#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index 2f701f5d08..28512fb8bb 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -49,7 +49,10 @@ void free_xenheap_pages(void *v, unsigned int order);
#define alloc_xenheap_page() (alloc_xenheap_pages(0,0))
#define free_xenheap_page(v) (free_xenheap_pages(v,0))
+/* Claim handling */
unsigned long domain_adjust_tot_pages(struct domain *d, long pages);
+int domain_set_outstanding_pages(struct domain *d, unsigned long pages);
+long get_outstanding_claims(void);
/* Domain suballocator. These functions are *not* interrupt-safe.*/
void init_domheap_pages(paddr_t ps, paddr_t pe);
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index e108436d38..569e76eac9 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -242,6 +242,7 @@ struct domain
struct page_list_head page_list; /* linked list */
struct page_list_head xenpage_list; /* linked list (size xenheap_pages) */
unsigned int tot_pages; /* number of pages currently possesed */
+ unsigned int outstanding_pages; /* pages claimed but not possessed */
unsigned int max_pages; /* maximum value for tot_pages */
atomic_t shr_pages; /* number of shared pages */
atomic_t paged_pages; /* number of paged-out pages */