/****************************************************************************** * arch/x86/mm/mem_sharing.c * * Memory sharing support. * * Copyright (c) 2011 GridCentric, Inc. (Adin Scannell & Andres Lagar-Cavilla) * Copyright (c) 2009 Citrix Systems, Inc. (Grzegorz Milos) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mm-locks.h" static shr_handle_t next_handle = 1; typedef struct pg_lock_data { int mm_unlock_level; unsigned short recurse_count; } pg_lock_data_t; DEFINE_PER_CPU(pg_lock_data_t, __pld); #define MEM_SHARING_DEBUG(_f, _a...) \ debugtrace_printk("mem_sharing_debug: %s(): " _f, __func__, ##_a) /* Reverse map defines */ #define RMAP_HASHTAB_ORDER 0 #define RMAP_HASHTAB_SIZE \ ((PAGE_SIZE << RMAP_HASHTAB_ORDER) / sizeof(struct list_head)) #define RMAP_USES_HASHTAB(page) \ ((page)->sharing->hash_table.flag == NULL) #define RMAP_HEAVY_SHARED_PAGE RMAP_HASHTAB_SIZE /* A bit of hysteresis. We don't want to be mutating between list and hash * table constantly. */ #define RMAP_LIGHT_SHARED_PAGE (RMAP_HEAVY_SHARED_PAGE >> 2) #if MEM_SHARING_AUDIT static struct list_head shr_audit_list; static spinlock_t shr_audit_lock; DEFINE_RCU_READ_LOCK(shr_audit_read_lock); /* RCU delayed free of audit list entry */ static void _free_pg_shared_info(struct rcu_head *head) { xfree(container_of(head, struct page_sharing_info, rcu_head)); } static inline void audit_add_list(struct page_info *page) { INIT_LIST_HEAD(&page->sharing->entry); spin_lock(&shr_audit_lock); list_add_rcu(&page->sharing->entry, &shr_audit_list); spin_unlock(&shr_audit_lock); } /* Removes from the audit list and cleans up the page sharing metadata. */ static inline void page_sharing_dispose(struct page_info *page) { /* Unlikely given our thresholds, but we should be careful. */ if ( unlikely(RMAP_USES_HASHTAB(page)) ) free_xenheap_pages(page->sharing->hash_table.bucket, RMAP_HASHTAB_ORDER); spin_lock(&shr_audit_lock); list_del_rcu(&page->sharing->entry); spin_unlock(&shr_audit_lock); INIT_RCU_HEAD(&page->sharing->rcu_head); call_rcu(&page->sharing->rcu_head, _free_pg_shared_info); } #else int mem_sharing_audit(void) { return -ENOSYS; } #define audit_add_list(p) ((void)0) static inline void page_sharing_dispose(struct page_info *page) { /* Unlikely given our thresholds, but we should be careful. */ if ( unlikely(RMAP_USES_HASHTAB(page)) ) free_xenheap_pages(page->sharing->hash_table.bucket, RMAP_HASHTAB_ORDER); xfree(page->sharing); } #endif /* MEM_SHARING_AUDIT */ static inline int mem_sharing_page_lock(struct page_info *pg) { int rc; pg_lock_data_t *pld = &(this_cpu(__pld)); page_sharing_mm_pre_lock(); rc = page_lock(pg); if ( rc ) { preempt_disable(); page_sharing_mm_post_lock(&pld->mm_unlock_level, &pld->recurse_count); } return rc; } static inline void mem_sharing_page_unlock(struct page_info *pg) { pg_lock_data_t *pld = &(this_cpu(__pld)); page_sharing_mm_unlock(pld->mm_unlock_level, &pld->recurse_count); preempt_enable(); page_unlock(pg); } static inline shr_handle_t get_next_handle(void) { /* Get the next handle get_page style */ uint64_t x, y = next_handle; do { x = y; } while ( (y = cmpxchg(&next_handle, x, x + 1)) != x ); return x + 1; } #define mem_sharing_enabled(d) \ (is_hvm_domain(d) && (d)->arch.hvm_domain.mem_sharing_enabled) #undef mfn_to_page #define mfn_to_page(_m) __mfn_to_page(mfn_x(_m)) #undef mfn_valid #define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn)) #undef page_to_mfn #define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg)) static atomic_t nr_saved_mfns = ATOMIC_INIT(0); static atomic_t nr_shared_mfns = ATOMIC_INIT(0); /** Reverse map **/ /* Every shared frame keeps a reverse map (rmap) of tuples that * this shared frame backs. For pages with a low degree of sharing, a O(n) * search linked list is good enough. For pages with higher degree of sharing, * we use a hash table instead. */ typedef struct gfn_info { unsigned long gfn; domid_t domain; struct list_head list; } gfn_info_t; static inline void rmap_init(struct page_info *page) { /* We always start off as a doubly linked list. */ INIT_LIST_HEAD(&page->sharing->gfns); } /* Exceedingly simple "hash function" */ #define HASH(domain, gfn) \ (((gfn) + (domain)) % RMAP_HASHTAB_SIZE) /* Conversions. Tuned by the thresholds. Should only happen twice * (once each) during the lifetime of a shared page */ static inline int rmap_list_to_hash_table(struct page_info *page) { unsigned int i; struct list_head *pos, *tmp, *b = alloc_xenheap_pages(RMAP_HASHTAB_ORDER, 0); if ( b == NULL ) return -ENOMEM; for ( i = 0; i < RMAP_HASHTAB_SIZE; i++ ) INIT_LIST_HEAD(b + i); list_for_each_safe(pos, tmp, &page->sharing->gfns) { gfn_info_t *gfn_info = list_entry(pos, gfn_info_t, list); struct list_head *bucket = b + HASH(gfn_info->domain, gfn_info->gfn); list_del(pos); list_add(pos, bucket); } page->sharing->hash_table.bucket = b; page->sharing->hash_table.flag = NULL; return 0; } static inline void rmap_hash_table_to_list(struct page_info *page) { unsigned int i; struct list_head *bucket = page->sharing->hash_table.bucket; INIT_LIST_HEAD(&page->sharing->gfns); for ( i = 0; i < RMAP_HASHTAB_SIZE; i++ ) { struct list_head *pos, *tmp, *head = bucket + i; list_for_each_safe(pos, tmp, head) { list_del(pos); list_add(pos, &page->sharing->gfns); } } free_xenheap_pages(bucket, RMAP_HASHTAB_ORDER); } /* Generic accessors to the rmap */ static inline unsigned long rmap_count(struct page_info *pg) { unsigned long count; unsigned long t = read_atomic(&pg->u.inuse.type_info); count = t & PGT_count_mask; if ( t & PGT_locked ) count--; return count; } /* The page type count is always decreased after removing from the rmap. * Use a convert flag to avoid mutating the rmap if in the middle of an * iterator, or if the page will be soon destroyed anyways. */ static inline void rmap_del(gfn_info_t *gfn_info, struct page_info *page, int convert) { if ( RMAP_USES_HASHTAB(page) && convert && (rmap_count(page) <= RMAP_LIGHT_SHARED_PAGE) ) rmap_hash_table_to_list(page); /* Regardless of rmap type, same removal operation */ list_del(&gfn_info->list); } /* The page type count is always increased before adding to the rmap. */ static inline void rmap_add(gfn_info_t *gfn_info, struct page_info *page) { struct list_head *head; if ( !RMAP_USES_HASHTAB(page) && (rmap_count(page) >= RMAP_HEAVY_SHARED_PAGE) ) /* The conversion may fail with ENOMEM. We'll be less efficient, * but no reason to panic. */ (void)rmap_list_to_hash_table(page); head = (RMAP_USES_HASHTAB(page)) ? page->sharing->hash_table.bucket + HASH(gfn_info->domain, gfn_info->gfn) : &page->sharing->gfns; INIT_LIST_HEAD(&gfn_info->list); list_add(&gfn_info->list, head); } static inline gfn_info_t * rmap_retrieve(uint16_t domain_id, unsigned long gfn, struct page_info *page) { gfn_info_t *gfn_info; struct list_head *le, *head; head = (RMAP_USES_HASHTAB(page)) ? page->sharing->hash_table.bucket + HASH(domain_id, gfn) : &page->sharing->gfns; list_for_each(le, head) { gfn_info = list_entry(le, gfn_info_t, list); if ( (gfn_info->gfn == gfn) && (gfn_info->domain == domain_id) ) return gfn_info; } /* Nothing was found */ return NULL; } /* Returns true if the rmap has only one entry. O(1) complexity. */ static inline int rmap_has_one_entry(struct page_info *page) { return (rmap_count(page) == 1); } /* Returns true if the rmap has any entries. O(1) complexity. */ static inline int rmap_has_entries(struct page_info *page) { return (rmap_count(page) != 0); } /* The iterator hides the details of how the rmap is implemented. This * involves splitting the list_for_each_safe macro into two steps. */ struct rmap_iterator { struct list_head *curr; struct list_head *next; unsigned int bucket; }; static inline void rmap_seed_iterator(struct page_info *page, struct rmap_iterator *ri) { ri->curr = (RMAP_USES_HASHTAB(page)) ? page->sharing->hash_table.bucket : &page->sharing->gfns; ri->next = ri->curr->next; ri->bucket = 0; } static inline gfn_info_t * rmap_iterate(struct page_info *page, struct rmap_iterator *ri) { struct list_head *head = (RMAP_USES_HASHTAB(page)) ? page->sharing->hash_table.bucket + ri->bucket : &page->sharing->gfns; retry: if ( ri->next == head) { if ( RMAP_USES_HASHTAB(page) ) { ri->bucket++; if ( ri->bucket >= RMAP_HASHTAB_SIZE ) /* No more hash table buckets */ return NULL; head = page->sharing->hash_table.bucket + ri->bucket; ri->curr = head; ri->next = ri->curr->next; goto retry; } else /* List exhausted */ return NULL; } ri->curr = ri->next; ri->next = ri->curr->next; return list_entry(ri->curr, gfn_info_t, list); } static inline gfn_info_t *mem_sharing_gfn_alloc(struct page_info *page, struct domain *d, unsigned long gfn) { gfn_info_t *gfn_info = xmalloc(gfn_info_t); if ( gfn_info == NULL ) return NULL; gfn_info->gfn = gfn; gfn_info->domain = d->domain_id; rmap_add(gfn_info, page); /* Increment our number of shared pges. */ atomic_inc(&d->shr_pages); return gfn_info; } static inline void mem_sharing_gfn_destroy(struct page_info *page, struct domain *d, gfn_info_t *gfn_info) { /* Decrement the number of pages. */ atomic_dec(&d->shr_pages); /* Free the gfn_info structure. */ rmap_del(gfn_info, page, 1); xfree(gfn_info); } static struct page_info* mem_sharing_lookup(unsigned long mfn) { if ( mfn_valid(_mfn(mfn)) ) { struct page_info* page = mfn_to_page(_mfn(mfn)); if ( page_get_owner(page) == dom_cow ) { /* Count has to be at least two, because we're called * with the mfn locked (1) and this is supposed to be * a shared page (1). */ unsigned long t = read_atomic(&page->u.inuse.type_info); ASSERT((t & PGT_type_mask) == PGT_shared_page); ASSERT((t & PGT_count_mask) >= 2); ASSERT(get_gpfn_from_mfn(mfn) == SHARED_M2P_ENTRY); return page; } } return NULL; } #if MEM_SHARING_AUDIT int mem_sharing_audit(void) { int errors = 0; unsigned long count_expected; unsigned long count_found = 0; struct list_head *ae; count_expected = atomic_read(&nr_shared_mfns); rcu_read_lock(&shr_audit_read_lock); list_for_each_rcu(ae, &shr_audit_list) { struct page_sharing_info *pg_shared_info; unsigned long nr_gfns = 0; struct page_info *pg; mfn_t mfn; gfn_info_t *g; struct rmap_iterator ri; pg_shared_info = list_entry(ae, struct page_sharing_info, entry); pg = pg_shared_info->pg; mfn = page_to_mfn(pg); /* If we can't lock it, it's definitely not a shared page */ if ( !mem_sharing_page_lock(pg) ) { MEM_SHARING_DEBUG("mfn %lx in audit list, but cannot be locked (%lx)!\n", mfn_x(mfn), pg->u.inuse.type_info); errors++; continue; } /* Check if the MFN has correct type, owner and handle. */ if ( !(pg->u.inuse.type_info & PGT_shared_page) ) { MEM_SHARING_DEBUG("mfn %lx in audit list, but not PGT_shared_page (%lx)!\n", mfn_x(mfn), pg->u.inuse.type_info & PGT_type_mask); errors++; continue; } /* Check the page owner. */ if ( page_get_owner(pg) != dom_cow ) { MEM_SHARING_DEBUG("mfn %lx shared, but wrong owner (%hu)!\n", mfn_x(mfn), page_get_owner(pg)->domain_id); errors++; } /* Check the m2p entry */ if ( get_gpfn_from_mfn(mfn_x(mfn)) != SHARED_M2P_ENTRY ) { MEM_SHARING_DEBUG("mfn %lx shared, but wrong m2p entry (%lx)!\n", mfn_x(mfn), get_gpfn_from_mfn(mfn_x(mfn))); errors++; } /* Check we have a list */ if ( (!pg->sharing) || !rmap_has_entries(pg) ) { MEM_SHARING_DEBUG("mfn %lx shared, but empty gfn list!\n", mfn_x(mfn)); errors++; continue; } /* We've found a page that is shared */ count_found++; /* Check if all GFNs map to the MFN, and the p2m types */ rmap_seed_iterator(pg, &ri); while ( (g = rmap_iterate(pg, &ri)) != NULL ) { struct domain *d; p2m_type_t t; mfn_t o_mfn; d = get_domain_by_id(g->domain); if ( d == NULL ) { MEM_SHARING_DEBUG("Unknown dom: %hu, for PFN=%lx, MFN=%lx\n", g->domain, g->gfn, mfn_x(mfn)); errors++; continue; } o_mfn = get_gfn_query_unlocked(d, g->gfn, &t); if ( mfn_x(o_mfn) != mfn_x(mfn) ) { MEM_SHARING_DEBUG("Incorrect P2M for d=%hu, PFN=%lx." "Expecting MFN=%lx, got %lx\n", g->domain, g->gfn, mfn_x(mfn), mfn_x(o_mfn)); errors++; } if ( t != p2m_ram_shared ) { MEM_SHARING_DEBUG("Incorrect P2M type for d=%hu, PFN=%lx MFN=%lx." "Expecting t=%d, got %d\n", g->domain, g->gfn, mfn_x(mfn), p2m_ram_shared, t); errors++; } put_domain(d); nr_gfns++; } /* The type count has an extra ref because we have locked the page */ if ( (nr_gfns + 1) != (pg->u.inuse.type_info & PGT_count_mask) ) { MEM_SHARING_DEBUG("Mismatched counts for MFN=%lx." "nr_gfns in list %lu, in type_info %lx\n", mfn_x(mfn), nr_gfns, (pg->u.inuse.type_info & PGT_count_mask)); errors++; } mem_sharing_page_unlock(pg); } rcu_read_unlock(&shr_audit_read_lock); if ( count_found != count_expected ) { MEM_SHARING_DEBUG("Expected %ld shared mfns, found %ld.", count_expected, count_found); errors++; } return errors; } #endif int mem_sharing_notify_enomem(struct domain *d, unsigned long gfn, bool_t allow_sleep) { struct vcpu *v = current; int rc; mem_event_request_t req = { .gfn = gfn }; if ( (rc = __mem_event_claim_slot(d, &d->mem_event->share, allow_sleep)) < 0 ) return rc; if ( v->domain == d ) { req.flags = MEM_EVENT_FLAG_VCPU_PAUSED; vcpu_pause_nosync(v); } req.p2mt = p2m_ram_shared; req.vcpu_id = v->vcpu_id; mem_event_put_request(d, &d->mem_event->share, &req); return 0; } unsigned int mem_sharing_get_nr_saved_mfns(void) { return ((unsigned int)atomic_read(&nr_saved_mfns)); } unsigned int mem_sharing_get_nr_shared_mfns(void) { return (unsigned int)atomic_read(&nr_shared_mfns); } int mem_sharing_sharing_resume(struct domain *d) { mem_event_response_t rsp; /* Get all requests off the ring */ while ( mem_event_get_response(d, &d->mem_event->share, &rsp) ) { if ( rsp.flags & MEM_EVENT_FLAG_DUMMY ) continue; /* Unpause domain/vcpu */ if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED ) vcpu_unpause(d->vcpu[rsp.vcpu_id]); } return 0; } /* Functions that change a page's type and ownership */ static int page_make_sharable(struct domain *d, struct page_info *page, int expected_refcnt) { int drop_dom_ref; spin_lock(&d->page_alloc_lock); /* Change page type and count atomically */ if ( !get_page_and_type(page, d, PGT_shared_page) ) { spin_unlock(&d->page_alloc_lock); return -EINVAL; } /* Check it wasn't already sharable and undo if it was */ if ( (page->u.inuse.type_info & PGT_count_mask) != 1 ) { put_page_and_type(page); spin_unlock(&d->page_alloc_lock); return -EEXIST; } /* Check if the ref count is 2. The first from PGC_allocated, and * the second from get_page_and_type at the top of this function */ if ( page->count_info != (PGC_allocated | (2 + expected_refcnt)) ) { /* Return type count back to zero */ put_page_and_type(page); spin_unlock(&d->page_alloc_lock); return -E2BIG; } page_set_owner(page, dom_cow); domain_adjust_tot_pages(d, -1); drop_dom_ref = (d->tot_pages == 0); page_list_del(page, &d->page_list); spin_unlock(&d->page_alloc_lock); if ( drop_dom_ref ) put_domain(d); return 0; } static int page_make_private(struct domain *d, struct page_info *page) { unsigned long expected_type; if ( !get_page(page, dom_cow) ) return -EINVAL; spin_lock(&d->page_alloc_lock); /* We can only change the type if count is one */ /* Because we are locking pages individually, we need to drop * the lock here, while the page is typed. We cannot risk the * race of page_unlock and then put_page_type. */ expected_type = (PGT_shared_page | PGT_validated | PGT_locked | 2); if ( page->u.inuse.type_info != expected_type ) { put_page(page); spin_unlock(&d->page_alloc_lock); return -EEXIST; } /* Drop the final typecount */ put_page_and_type(page); /* Now that we've dropped the type, we can unlock */ mem_sharing_page_unlock(page); /* Change the owner */ ASSERT(page_get_owner(page) == dom_cow); page_set_owner(page, d); if ( domain_adjust_tot_pages(d, 1) == 1 ) get_domain(d); page_list_add_tail(page, &d->page_list); spin_unlock(&d->page_alloc_lock); put_page(page); return 0; } static inline struct page_info *__grab_shared_page(mfn_t mfn) { struct page_info *pg = NULL; if ( !mfn_valid(mfn) ) return NULL; pg = mfn_to_page(mfn); /* If the page is not validated we can't lock it, and if it's * not validated it's obviously not shared. */ if ( !mem_sharing_page_lock(pg) ) return NULL; if ( mem_sharing_lookup(mfn_x(mfn)) == NULL ) { mem_sharing_page_unlock(pg); return NULL; } return pg; } int mem_sharing_debug_mfn(mfn_t mfn) { struct page_info *page; int num_refs; if ( (page = __grab_shared_page(mfn)) == NULL) { gdprintk(XENLOG_ERR, "Invalid MFN=%lx\n", mfn_x(mfn)); return -EINVAL; } MEM_SHARING_DEBUG( "Debug page: MFN=%lx is ci=%lx, ti=%lx, owner_id=%d\n", mfn_x(page_to_mfn(page)), page->count_info, page->u.inuse.type_info, page_get_owner(page)->domain_id); /* -1 because the page is locked and that's an additional type ref */ num_refs = ((int) (page->u.inuse.type_info & PGT_count_mask)) - 1; mem_sharing_page_unlock(page); return num_refs; } int mem_sharing_debug_gfn(struct domain *d, unsigned long gfn) { p2m_type_t p2mt; mfn_t mfn; int num_refs; mfn = get_gfn_query(d, gfn, &p2mt); MEM_SHARING_DEBUG("Debug for domain=%d, gfn=%lx, ", d->domain_id, gfn); num_refs = mem_sharing_debug_mfn(mfn); put_gfn(d, gfn); return num_refs; } #define SHGNT_PER_PAGE_V1 (PAGE_SIZE / sizeof(grant_entry_v1_t)) #define shared_entry_v1(t, e) \ ((t)->shared_v1[(e)/SHGNT_PER_PAGE_V1][(e)%SHGNT_PER_PAGE_V1]) #define SHGNT_PER_PAGE_V2 (PAGE_SIZE / sizeof(grant_entry_v2_t)) #define shared_entry_v2(t, e) \ ((t)->shared_v2[(e)/SHGNT_PER_PAGE_V2][(e)%SHGNT_PER_PAGE_V2]) #define STGNT_PER_PAGE (PAGE_SIZE / sizeof(grant_status_t)) #define status_entry(t, e) \ ((t)->status[(e)/STGNT_PER_PAGE][(e)%STGNT_PER_PAGE]) static grant_entry_header_t * shared_entry_header(struct grant_table *t, grant_ref_t ref) { ASSERT (t->gt_version != 0); if ( t->gt_version == 1 ) return (grant_entry_header_t*)&shared_entry_v1(t, ref); else return &shared_entry_v2(t, ref).hdr; } static int mem_sharing_gref_to_gfn(struct domain *d, grant_ref_t ref, unsigned long *gfn) { if ( d->grant_table->gt_version < 1 ) return -1; if ( d->grant_table->gt_version == 1 ) { grant_entry_v1_t *sha1; sha1 = &shared_entry_v1(d->grant_table, ref); *gfn = sha1->frame; } else { grant_entry_v2_t *sha2; sha2 = &shared_entry_v2(d->grant_table, ref); *gfn = sha2->full_page.frame; } return 0; } int mem_sharing_debug_gref(struct domain *d, grant_ref_t ref) { grant_entry_header_t *shah; uint16_t status; unsigned long gfn; if ( d->grant_table->gt_version < 1 ) { MEM_SHARING_DEBUG( "Asked to debug [dom=%d,gref=%d], but not yet inited.\n", d->domain_id, ref); return -EINVAL; } (void)mem_sharing_gref_to_gfn(d, ref, &gfn); shah = shared_entry_header(d->grant_table, ref); if ( d->grant_table->gt_version == 1 ) status = shah->flags; else status = status_entry(d->grant_table, ref); MEM_SHARING_DEBUG( "==> Grant [dom=%d,ref=%d], status=%x. ", d->domain_id, ref, status); return mem_sharing_debug_gfn(d, gfn); } int mem_sharing_nominate_page(struct domain *d, unsigned long gfn, int expected_refcnt, shr_handle_t *phandle) { p2m_type_t p2mt; mfn_t mfn; struct page_info *page = NULL; /* gcc... */ int ret; struct gfn_info *gfn_info; *phandle = 0UL; mfn = get_gfn(d, gfn, &p2mt); /* Check if mfn is valid */ ret = -EINVAL; if ( !mfn_valid(mfn) ) goto out; /* Return the handle if the page is already shared */ if ( p2m_is_shared(p2mt) ) { struct page_info *pg = __grab_shared_page(mfn); if ( !pg ) { gdprintk(XENLOG_ERR, "Shared p2m entry gfn %lx, but could not " "grab page %lx dom %d\n", gfn, mfn_x(mfn), d->domain_id); BUG(); } *phandle = pg->sharing->handle; ret = 0; mem_sharing_page_unlock(pg); goto out; } /* Check p2m type */ if ( !p2m_is_sharable(p2mt) ) goto out; /* Try to convert the mfn to the sharable type */ page = mfn_to_page(mfn); ret = page_make_sharable(d, page, expected_refcnt); if ( ret ) goto out; /* Now that the page is validated, we can lock it. There is no * race because we're holding the p2m entry, so no one else * could be nominating this gfn */ ret = -ENOENT; if ( !mem_sharing_page_lock(page) ) goto out; /* Initialize the shared state */ ret = -ENOMEM; if ( (page->sharing = xmalloc(struct page_sharing_info)) == NULL ) { /* Making a page private atomically unlocks it */ BUG_ON(page_make_private(d, page) != 0); goto out; } page->sharing->pg = page; rmap_init(page); /* Create the handle */ page->sharing->handle = get_next_handle(); /* Create the local gfn info */ if ( (gfn_info = mem_sharing_gfn_alloc(page, d, gfn)) == NULL ) { xfree(page->sharing); page->sharing = NULL; BUG_ON(page_make_private(d, page) != 0); goto out; } /* Change the p2m type, should never fail with p2m locked. */ BUG_ON(p2m_change_type(d, gfn, p2mt, p2m_ram_shared) != p2mt); /* Account for this page. */ atomic_inc(&nr_shared_mfns); /* Update m2p entry to SHARED_M2P_ENTRY */ set_gpfn_from_mfn(mfn_x(mfn), SHARED_M2P_ENTRY); *phandle = page->sharing->handle; audit_add_list(page); mem_sharing_page_unlock(page); ret = 0; out: put_gfn(d, gfn); return ret; } int mem_sharing_share_pages(struct domain *sd, unsigned long sgfn, shr_handle_t sh, struct domain *cd, unsigned long cgfn, shr_handle_t ch) { struct page_info *spage, *cpage, *firstpg, *secondpg; gfn_info_t *gfn; struct domain *d; int ret = -EINVAL; mfn_t smfn, cmfn; p2m_type_t smfn_type, cmfn_type; struct two_gfns tg; struct rmap_iterator ri; get_two_gfns(sd, sgfn, &smfn_type, NULL, &smfn, cd, cgfn, &cmfn_type, NULL, &cmfn, 0, &tg); /* This tricky business is to avoid two callers deadlocking if * grabbing pages in opposite client/source order */ if( mfn_x(smfn) == mfn_x(cmfn) ) { /* The pages are already the same. We could return some * kind of error here, but no matter how you look at it, * the pages are already 'shared'. It possibly represents * a big problem somewhere else, but as far as sharing is * concerned: great success! */ ret = 0; goto err_out; } else if ( mfn_x(smfn) < mfn_x(cmfn) ) { ret = XENMEM_SHARING_OP_S_HANDLE_INVALID; spage = firstpg = __grab_shared_page(smfn); if ( spage == NULL ) goto err_out; ret = XENMEM_SHARING_OP_C_HANDLE_INVALID; cpage = secondpg = __grab_shared_page(cmfn); if ( cpage == NULL ) { mem_sharing_page_unlock(spage); goto err_out; } } else { ret = XENMEM_SHARING_OP_C_HANDLE_INVALID; cpage = firstpg = __grab_shared_page(cmfn); if ( cpage == NULL ) goto err_out; ret = XENMEM_SHARING_OP_S_HANDLE_INVALID; spage = secondpg = __grab_shared_page(smfn); if ( spage == NULL ) { mem_sharing_page_unlock(cpage); goto err_out; } } ASSERT(smfn_type == p2m_ram_shared); ASSERT(cmfn_type == p2m_ram_shared); /* Check that the handles match */ if ( spage->sharing->handle != sh ) { ret = XENMEM_SHARING_OP_S_HANDLE_INVALID; mem_sharing_page_unlock(secondpg); mem_sharing_page_unlock(firstpg); goto err_out; } if ( cpage->sharing->handle != ch ) { ret = XENMEM_SHARING_OP_C_HANDLE_INVALID; mem_sharing_page_unlock(secondpg); mem_sharing_page_unlock(firstpg); goto err_out; } /* Merge the lists together */ rmap_seed_iterator(cpage, &ri); while ( (gfn = rmap_iterate(cpage, &ri)) != NULL) { /* Get the source page and type, this should never fail: * we are under shr lock, and got a successful lookup */ BUG_ON(!get_page_and_type(spage, dom_cow, PGT_shared_page)); /* Move the gfn_info from client list to source list. * Don't change the type of rmap for the client page. */ rmap_del(gfn, cpage, 0); rmap_add(gfn, spage); put_page_and_type(cpage); d = get_domain_by_id(gfn->domain); BUG_ON(!d); BUG_ON(set_shared_p2m_entry(d, gfn->gfn, smfn) == 0); put_domain(d); } ASSERT(list_empty(&cpage->sharing->gfns)); /* Clear the rest of the shared state */ page_sharing_dispose(cpage); cpage->sharing = NULL; mem_sharing_page_unlock(secondpg); mem_sharing_page_unlock(firstpg); /* Free the client page */ if(test_and_clear_bit(_PGC_allocated, &cpage->count_info)) put_page(cpage); /* We managed to free a domain page. */ atomic_dec(&nr_shared_mfns); atomic_inc(&nr_saved_mfns); ret = 0; err_out: put_two_gfns(&tg); return ret; } int mem_sharing_add_to_physmap(struct domain *sd, unsigned long sgfn, shr_handle_t sh, struct domain *cd, unsigned long cgfn) { struct page_info *spage; int ret = -EINVAL; mfn_t smfn, cmfn; p2m_type_t smfn_type, cmfn_type; struct gfn_info *gfn_info; struct p2m_domain *p2m = p2m_get_hostp2m(cd); p2m_access_t a; struct two_gfns tg; get_two_gfns(sd, sgfn, &smfn_type, NULL, &smfn, cd, cgfn, &cmfn_type, &a, &cmfn, 0, &tg); /* Get the source shared page, check and lock */ ret = XENMEM_SHARING_OP_S_HANDLE_INVALID; spage = __grab_shared_page(smfn); if ( spage == NULL ) goto err_out; ASSERT(smfn_type == p2m_ram_shared); /* Check that the handles match */ if ( spage->sharing->handle != sh ) goto err_unlock; /* Make sure the target page is a hole in the physmap. These are typically * p2m_mmio_dm, but also accept p2m_invalid and paged out pages. See the * definition of p2m_is_hole in p2m.h. */ if ( !p2m_is_hole(cmfn_type) ) { ret = XENMEM_SHARING_OP_C_HANDLE_INVALID; goto err_unlock; } /* This is simpler than regular sharing */ BUG_ON(!get_page_and_type(spage, dom_cow, PGT_shared_page)); if ( (gfn_info = mem_sharing_gfn_alloc(spage, cd, cgfn)) == NULL ) { put_page_and_type(spage); ret = -ENOMEM; goto err_unlock; } ret = set_p2m_entry(p2m, cgfn, smfn, PAGE_ORDER_4K, p2m_ram_shared, a); /* Tempted to turn this into an assert */ if ( !ret ) { ret = -ENOENT; mem_sharing_gfn_destroy(spage, cd, gfn_info); put_page_and_type(spage); } else { ret = 0; /* There is a chance we're plugging a hole where a paged out page was */ if ( p2m_is_paging(cmfn_type) && (cmfn_type != p2m_ram_paging_out) ) { atomic_dec(&cd->paged_pages); /* Further, there is a chance this was a valid page. Don't leak it. */ if ( mfn_valid(cmfn) ) { struct page_info *cpage = mfn_to_page(cmfn); ASSERT(cpage != NULL); if ( test_and_clear_bit(_PGC_allocated, &cpage->count_info) ) put_page(cpage); } } } atomic_inc(&nr_saved_mfns); err_unlock: mem_sharing_page_unlock(spage); err_out: put_two_gfns(&tg); return ret; } /* A note on the rationale for unshare error handling: * 1. Unshare can only fail with ENOMEM. Any other error conditions BUG_ON()'s * 2. We notify a potential dom0 helper through a mem_event ring. But we * allow the notification to not go to sleep. If the event ring is full * of ENOMEM warnings, then it's on the ball. * 3. We cannot go to sleep until the unshare is resolved, because we might * be buried deep into locks (e.g. something -> copy_to_user -> __hvm_copy) * 4. So, we make sure we: * 4.1. return an error * 4.2. do not corrupt shared memory * 4.3. do not corrupt guest memory * 4.4. let the guest deal with it if the error propagation will reach it */ int __mem_sharing_unshare_page(struct domain *d, unsigned long gfn, uint16_t flags) { p2m_type_t p2mt; mfn_t mfn; struct page_info *page, *old_page; void *s, *t; int last_gfn; gfn_info_t *gfn_info = NULL; mfn = get_gfn(d, gfn, &p2mt); /* Has someone already unshared it? */ if ( !p2m_is_shared(p2mt) ) { put_gfn(d, gfn); return 0; } page = __grab_shared_page(mfn); if ( page == NULL ) { gdprintk(XENLOG_ERR, "Domain p2m is shared, but page is not: " "%lx\n", gfn); BUG(); } gfn_info = rmap_retrieve(d->domain_id, gfn, page); if ( unlikely(gfn_info == NULL) ) { gdprintk(XENLOG_ERR, "Could not find gfn_info for shared gfn: " "%lx\n", gfn); BUG(); } /* Do the accounting first. If anything fails below, we have bigger * bigger fish to fry. First, remove the gfn from the list. */ last_gfn = rmap_has_one_entry(page); if ( last_gfn ) { /* Clean up shared state. Get rid of the tuple * before destroying the rmap. */ mem_sharing_gfn_destroy(page, d, gfn_info); page_sharing_dispose(page); page->sharing = NULL; atomic_dec(&nr_shared_mfns); } else atomic_dec(&nr_saved_mfns); /* If the GFN is getting destroyed drop the references to MFN * (possibly freeing the page), and exit early */ if ( flags & MEM_SHARING_DESTROY_GFN ) { if ( !last_gfn ) mem_sharing_gfn_destroy(page, d, gfn_info); put_page_and_type(page); mem_sharing_page_unlock(page); if ( last_gfn && test_and_clear_bit(_PGC_allocated, &page->count_info) ) put_page(page); put_gfn(d, gfn); return 0; } if ( last_gfn ) { /* Making a page private atomically unlocks it */ BUG_ON(page_make_private(d, page) != 0); goto private_page_found; } old_page = page; page = alloc_domheap_page(d, 0); if ( !page ) { /* Undo dec of nr_saved_mfns, as the retry will decrease again. */ atomic_inc(&nr_saved_mfns); mem_sharing_page_unlock(old_page); put_gfn(d, gfn); /* Caller is responsible for placing an event * in the ring */ return -ENOMEM; } s = map_domain_page(__page_to_mfn(old_page)); t = map_domain_page(__page_to_mfn(page)); memcpy(t, s, PAGE_SIZE); unmap_domain_page(s); unmap_domain_page(t); BUG_ON(set_shared_p2m_entry(d, gfn, page_to_mfn(page)) == 0); mem_sharing_gfn_destroy(old_page, d, gfn_info); mem_sharing_page_unlock(old_page); put_page_and_type(old_page); private_page_found: if ( p2m_change_type(d, gfn, p2m_ram_shared, p2m_ram_rw) != p2m_ram_shared ) { gdprintk(XENLOG_ERR, "Could not change p2m type d %hu gfn %lx.\n", d->domain_id, gfn); BUG(); } /* Update m2p entry */ set_gpfn_from_mfn(mfn_x(page_to_mfn(page)), gfn); /* Now that the gfn<->mfn map is properly established, * marking dirty is feasible */ paging_mark_dirty(d, mfn_x(page_to_mfn(page))); /* We do not need to unlock a private page */ put_gfn(d, gfn); return 0; } int relinquish_shared_pages(struct domain *d) { int rc = 0; struct p2m_domain *p2m = p2m_get_hostp2m(d); unsigned long gfn, count = 0; if ( p2m == NULL ) return 0; p2m_lock(p2m); for (gfn = p2m->next_shared_gfn_to_relinquish; gfn < p2m->max_mapped_pfn; gfn++ ) { p2m_access_t a; p2m_type_t t; mfn_t mfn; int set_rc; if ( atomic_read(&d->shr_pages) == 0 ) break; mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL); if ( mfn_valid(mfn) && (t == p2m_ram_shared) ) { /* Does not fail with ENOMEM given the DESTROY flag */ BUG_ON(__mem_sharing_unshare_page(d, gfn, MEM_SHARING_DESTROY_GFN)); /* Clear out the p2m entry so no one else may try to * unshare. Must succeed: we just read the old entry and * we hold the p2m lock. */ set_rc = p2m->set_entry(p2m, gfn, _mfn(0), PAGE_ORDER_4K, p2m_invalid, p2m_access_rwx); ASSERT(set_rc != 0); count++; } /* Preempt every 2MiB. Arbitrary */ if ( (count == 512) && hypercall_preempt_check() ) { p2m->next_shared_gfn_to_relinquish = gfn + 1; rc = -EAGAIN; break; } } p2m_unlock(p2m); return rc; } int mem_sharing_memop(struct domain *d, xen_mem_sharing_op_t *mec) { int rc = 0; /* Only HAP is supported */ if ( !hap_enabled(d) || !d->arch.hvm_domain.mem_sharing_enabled ) return -ENODEV; switch(mec->op) { case XENMEM_sharing_op_nominate_gfn: { unsigned long gfn = mec->u.nominate.u.gfn; shr_handle_t handle; if ( !mem_sharing_enabled(d) ) return -EINVAL; rc = mem_sharing_nominate_page(d, gfn, 0, &handle); mec->u.nominate.handle = handle; } break; case XENMEM_sharing_op_nominate_gref: { grant_ref_t gref = mec->u.nominate.u.grant_ref; unsigned long gfn; shr_handle_t handle; if ( !mem_sharing_enabled(d) ) return -EINVAL; if ( mem_sharing_gref_to_gfn(d, gref, &gfn) < 0 ) return -EINVAL; rc = mem_sharing_nominate_page(d, gfn, 3, &handle); mec->u.nominate.handle = handle; } break; case XENMEM_sharing_op_share: { unsigned long sgfn, cgfn; struct domain *cd; shr_handle_t sh, ch; if ( !mem_sharing_enabled(d) ) return -EINVAL; rc = rcu_lock_live_remote_domain_by_id(mec->u.share.client_domain, &cd); if ( rc ) return rc; rc = xsm_mem_sharing_op(XSM_TARGET, d, cd, mec->op); if ( rc ) { rcu_unlock_domain(cd); return rc; } if ( !mem_sharing_enabled(cd) ) { rcu_unlock_domain(cd); return -EINVAL; } if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mec->u.share.source_gfn) ) { grant_ref_t gref = (grant_ref_t) (XENMEM_SHARING_OP_FIELD_GET_GREF( mec->u.share.source_gfn)); if ( mem_sharing_gref_to_gfn(d, gref, &sgfn) < 0 ) { rcu_unlock_domain(cd); return -EINVAL; } } else { sgfn = mec->u.share.source_gfn; } if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mec->u.share.client_gfn) ) { grant_ref_t gref = (grant_ref_t) (XENMEM_SHARING_OP_FIELD_GET_GREF( mec->u.share.client_gfn)); if ( mem_sharing_gref_to_gfn(cd, gref, &cgfn) < 0 ) { rcu_unlock_domain(cd); return -EINVAL; } } else { cgfn = mec->u.share.client_gfn; } sh = mec->u.share.source_handle; ch = mec->u.share.client_handle; rc = mem_sharing_share_pages(d, sgfn, sh, cd, cgfn, ch); rcu_unlock_domain(cd); } break; case XENMEM_sharing_op_add_physmap: { unsigned long sgfn, cgfn; struct domain *cd; shr_handle_t sh; if ( !mem_sharing_enabled(d) ) return -EINVAL; rc = rcu_lock_live_remote_domain_by_id(mec->u.share.client_domain, &cd); if ( rc ) return rc; rc = xsm_mem_sharing_op(XSM_TARGET, d, cd, mec->op); if ( rc ) { rcu_unlock_domain(cd); return rc; } if ( !mem_sharing_enabled(cd) ) { rcu_unlock_domain(cd); return -EINVAL; } if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mec->u.share.source_gfn) ) { /* Cannot add a gref to the physmap */ rcu_unlock_domain(cd); return -EINVAL; } sgfn = mec->u.share.source_gfn; sh = mec->u.share.source_handle; cgfn = mec->u.share.client_gfn; rc = mem_sharing_add_to_physmap(d, sgfn, sh, cd, cgfn); rcu_unlock_domain(cd); } break; case XENMEM_sharing_op_resume: { if ( !mem_sharing_enabled(d) ) return -EINVAL; rc = mem_sharing_sharing_resume(d); } break; case XENMEM_sharing_op_debug_gfn: { unsigned long gfn = mec->u.debug.u.gfn; rc = mem_sharing_debug_gfn(d, gfn); } break; case XENMEM_sharing_op_debug_gref: { grant_ref_t gref = mec->u.debug.u.gref; rc = mem_sharing_debug_gref(d, gref); } break; default: rc = -ENOSYS; break; } return rc; } int mem_sharing_domctl(struct domain *d, xen_domctl_mem_sharing_op_t *mec) { int rc; /* Only HAP is supported */ if ( !hap_enabled(d) ) return -ENODEV; switch(mec->op) { case XEN_DOMCTL_MEM_SHARING_CONTROL: { rc = 0; if ( unlikely(need_iommu(d) && mec->u.enable) ) rc = -EXDEV; else d->arch.hvm_domain.mem_sharing_enabled = mec->u.enable; } break; default: rc = -ENOSYS; } return rc; } void __init mem_sharing_init(void) { printk("Initing memory sharing.\n"); #if MEM_SHARING_AUDIT spin_lock_init(&shr_audit_lock); INIT_LIST_HEAD(&shr_audit_list); #endif }