/******************************************************************************
 * arch/x86/mm/mem_event.c
 *
 * Memory event support.
 *
 * Copyright (c) 2009 Citrix Systems, Inc. (Patrick Colp)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <asm/domain.h>
#include <xen/event.h>
#include <xen/wait.h>
#include <asm/p2m.h>
#include <asm/mem_event.h>
#include <asm/mem_paging.h>
#include <asm/mem_access.h>
#include <asm/mem_sharing.h>
#include <xsm/xsm.h>

/* for public/io/ring.h macros */
#define xen_mb()   mb()
#define xen_rmb()  rmb()
#define xen_wmb()  wmb()

#define mem_event_ring_lock_init(_med)  spin_lock_init(&(_med)->ring_lock)
#define mem_event_ring_lock(_med)       spin_lock(&(_med)->ring_lock)
#define mem_event_ring_unlock(_med)     spin_unlock(&(_med)->ring_lock)

static int mem_event_enable(
    struct domain *d,
    xen_domctl_mem_event_op_t *mec,
    struct mem_event_domain *med,
    int pause_flag,
    int param,
    xen_event_channel_notification_t notification_fn)
{
    int rc;
    unsigned long ring_gfn = d->arch.hvm_domain.params[param];

    /* Only one helper at a time. If the helper crashed,
     * the ring is in an undefined state and so is the guest.
     */
    if ( med->ring_page )
        return -EBUSY;

    /* The parameter defaults to zero, and it should be
     * set to something. */
    if ( ring_gfn == 0 )
        return -ENOSYS;

    mem_event_ring_lock_init(med);
    mem_event_ring_lock(med);

    rc = prepare_ring_for_helper(d, ring_gfn, &med->ring_pg_struct,
                                 &med->ring_page);
    if ( rc < 0 )
        goto err;

    /* Set the number of currently blocked vCPUs to 0. */
    med->blocked = 0;

    /* Allocate event channel */
    rc = alloc_unbound_xen_event_channel(d->vcpu[0],
                                         current->domain->domain_id,
                                         notification_fn);
    if ( rc < 0 )
        goto err;

    med->xen_port = mec->port = rc;

    /* Prepare ring buffer */
    FRONT_RING_INIT(&med->front_ring,
                    (mem_event_sring_t *)med->ring_page,
                    PAGE_SIZE);

    /* Save the pause flag for this particular ring. */
    med->pause_flag = pause_flag;

    /* Initialize the last-chance wait queue. */
    init_waitqueue_head(&med->wq);

    mem_event_ring_unlock(med);
    return 0;

 err:
    destroy_ring_for_helper(&med->ring_page,
                            med->ring_pg_struct);
    mem_event_ring_unlock(med);

    return rc;
}

static unsigned int mem_event_ring_available(struct mem_event_domain *med)
{
    int avail_req = RING_FREE_REQUESTS(&med->front_ring);

    avail_req -= med->target_producers;
    avail_req -= med->foreign_producers;

    BUG_ON(avail_req < 0);

    return avail_req;
}
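/*
 * Slot accounting, in brief: a producer first reserves a slot (via the
 * claim_slot() wrappers around __mem_event_claim_slot() referenced in the
 * comments below), then either writes its request with
 * mem_event_put_request() or gives the reservation back with
 * mem_event_cancel_slot().  target_producers counts outstanding claims made
 * by the target domain's own vCPUs, foreign_producers counts claims made by
 * other domains (e.g. a dom0 helper).  The number of slots still claimable
 * is therefore
 *
 *     RING_FREE_REQUESTS() - target_producers - foreign_producers
 *
 * e.g. with 6 free ring entries, 2 outstanding target-domain claims and 1
 * foreign claim, only 3 further claims can be granted.
 */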
/*
 * mem_event_wake_blocked() will wake up vCPUs waiting for room in the
 * ring. These vCPUs were paused on their way out after placing an event,
 * but need to be resumed once the ring is capable of processing at least
 * one event from them.
 */
static void mem_event_wake_blocked(struct domain *d, struct mem_event_domain *med)
{
    struct vcpu *v;
    int online = d->max_vcpus;
    unsigned int avail_req = mem_event_ring_available(med);

    if ( avail_req == 0 || med->blocked == 0 )
        return;

    /*
     * We ensure that we only have vCPUs online if there are enough free slots
     * for their memory events to be processed.  This will ensure that no
     * memory events are lost (due to the fact that certain types of events
     * cannot be replayed, we need to ensure that there is space in the ring
     * for when they are hit).
     * See comment below in mem_event_put_request().
     */
    for_each_vcpu ( d, v )
        if ( test_bit(med->pause_flag, &v->pause_flags) )
            online--;

    ASSERT(online == (d->max_vcpus - med->blocked));

    /* We remember which vcpu last woke up to avoid scanning always linearly
     * from zero and starving higher-numbered vcpus under high load */
    if ( d->vcpu )
    {
        int i, j, k;

        for ( i = med->last_vcpu_wake_up + 1, j = 0; j < d->max_vcpus; i++, j++ )
        {
            k = i % d->max_vcpus;
            v = d->vcpu[k];
            if ( !v )
                continue;

            if ( !med->blocked || online >= avail_req )
                break;

            if ( test_and_clear_bit(med->pause_flag, &v->pause_flags) )
            {
                vcpu_unpause(v);
                online++;
                med->blocked--;
                med->last_vcpu_wake_up = k;
            }
        }
    }
}

/*
 * In the event that a vCPU attempted to place an event in the ring and
 * was unable to do so, it is queued on a wait queue.  These are woken as
 * needed, and take precedence over the blocked vCPUs.
 */
static void mem_event_wake_queued(struct domain *d, struct mem_event_domain *med)
{
    unsigned int avail_req = mem_event_ring_available(med);

    if ( avail_req > 0 )
        wake_up_nr(&med->wq, avail_req);
}

/*
 * mem_event_wake() will wake up all vCPUs waiting for the ring to
 * become available.  If we have queued vCPUs, they get top priority. We
 * are guaranteed that they will go through code paths that will eventually
 * call mem_event_wake() again, ensuring that any blocked vCPUs will get
 * unpaused once all the queued vCPUs have made it through.
 */
void mem_event_wake(struct domain *d, struct mem_event_domain *med)
{
    if ( !list_empty(&med->wq.list) )
        mem_event_wake_queued(d, med);
    else
        mem_event_wake_blocked(d, med);
}

static int mem_event_disable(struct domain *d, struct mem_event_domain *med)
{
    if ( med->ring_page )
    {
        struct vcpu *v;

        mem_event_ring_lock(med);

        if ( !list_empty(&med->wq.list) )
        {
            mem_event_ring_unlock(med);
            return -EBUSY;
        }

        /* Free domU's event channel and leave the other one unbound */
        free_xen_event_channel(d->vcpu[0], med->xen_port);

        /* Unblock all vCPUs */
        for_each_vcpu ( d, v )
        {
            if ( test_and_clear_bit(med->pause_flag, &v->pause_flags) )
            {
                vcpu_unpause(v);
                med->blocked--;
            }
        }

        destroy_ring_for_helper(&med->ring_page,
                                med->ring_pg_struct);
        mem_event_ring_unlock(med);
    }

    return 0;
}

static inline void mem_event_release_slot(struct domain *d,
                                          struct mem_event_domain *med)
{
    /* Update the accounting */
    if ( current->domain == d )
        med->target_producers--;
    else
        med->foreign_producers--;

    /* Kick any waiters */
    mem_event_wake(d, med);
}

/*
 * mem_event_mark_and_pause() tags a vCPU and puts it to sleep.
 * The vCPU will resume execution when mem_event_wake_blocked() unpauses it.
 */
void mem_event_mark_and_pause(struct vcpu *v, struct mem_event_domain *med)
{
    if ( !test_and_set_bit(med->pause_flag, &v->pause_flags) )
    {
        vcpu_pause_nosync(v);
        med->blocked++;
    }
}
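/*
 * Producer-side flow, shown as a sketch only: the real call sites live in
 * the p2m paging/access/sharing code and normally go through claim_slot()
 * wrappers (assumed to be thin inline wrappers around
 * __mem_event_claim_slot() below); gfn is a placeholder for the frame the
 * event concerns.
 *
 *     mem_event_request_t req = { .gfn = gfn };
 *
 *     if ( __mem_event_claim_slot(d, med, 1) < 0 )
 *         return;                          (no ring, or no space to reserve)
 *
 *     mem_event_put_request(d, med, &req);
 *
 * or, if the event turns out to be unnecessary after the claim,
 *
 *     mem_event_cancel_slot(d, med);
 */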
/*
 * This must be preceded by a call to claim_slot(), and is guaranteed to
 * succeed.  As a side-effect however, the vCPU may be paused if the ring is
 * overly full and its continued execution would cause stalling and excessive
 * waiting.  The vCPU will be automatically unpaused when the ring clears.
 */
void mem_event_put_request(struct domain *d,
                           struct mem_event_domain *med,
                           mem_event_request_t *req)
{
    mem_event_front_ring_t *front_ring;
    int free_req;
    unsigned int avail_req;
    RING_IDX req_prod;

    if ( current->domain != d )
    {
        req->flags |= MEM_EVENT_FLAG_FOREIGN;
        ASSERT( !(req->flags & MEM_EVENT_FLAG_VCPU_PAUSED) );
    }

    mem_event_ring_lock(med);

    /* Due to the reservations, this step must succeed. */
    front_ring = &med->front_ring;
    free_req = RING_FREE_REQUESTS(front_ring);
    ASSERT(free_req > 0);

    /* Copy request */
    req_prod = front_ring->req_prod_pvt;
    memcpy(RING_GET_REQUEST(front_ring, req_prod), req, sizeof(*req));
    req_prod++;

    /* Update ring */
    front_ring->req_prod_pvt = req_prod;
    RING_PUSH_REQUESTS(front_ring);

    /* We've actually *used* our reservation, so release the slot. */
    mem_event_release_slot(d, med);

    /* Give this vCPU a black eye if necessary, on the way out.
     * See the comments above mem_event_wake_blocked() for more information
     * on how this mechanism works to avoid waiting. */
    avail_req = mem_event_ring_available(med);
    if ( current->domain == d &&
         avail_req < d->max_vcpus )
        mem_event_mark_and_pause(current, med);

    mem_event_ring_unlock(med);

    notify_via_xen_event_channel(d, med->xen_port);
}

int mem_event_get_response(struct domain *d, struct mem_event_domain *med,
                           mem_event_response_t *rsp)
{
    mem_event_front_ring_t *front_ring;
    RING_IDX rsp_cons;

    mem_event_ring_lock(med);

    front_ring = &med->front_ring;
    rsp_cons = front_ring->rsp_cons;

    if ( !RING_HAS_UNCONSUMED_RESPONSES(front_ring) )
    {
        mem_event_ring_unlock(med);
        return 0;
    }

    /* Copy response */
    memcpy(rsp, RING_GET_RESPONSE(front_ring, rsp_cons), sizeof(*rsp));
    rsp_cons++;

    /* Update ring */
    front_ring->rsp_cons = rsp_cons;
    front_ring->sring->rsp_event = rsp_cons + 1;

    /* Kick any waiters -- since we've just consumed an event,
     * there may be additional space available in the ring. */
    mem_event_wake(d, med);

    mem_event_ring_unlock(med);

    return 1;
}
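/*
 * Consumer side, as a sketch only: when the helper signals the event
 * channel, the notification handlers further below hand off to the
 * respective resume routines, which are assumed to drain the ring roughly
 * as follows:
 *
 *     mem_event_response_t rsp;
 *
 *     while ( mem_event_get_response(d, med, &rsp) )
 *     {
 *         ... act on rsp ...
 *         if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED )
 *             vcpu_unpause(d->vcpu[rsp.vcpu_id]);
 *     }
 */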
void mem_event_cancel_slot(struct domain *d, struct mem_event_domain *med)
{
    mem_event_ring_lock(med);
    mem_event_release_slot(d, med);
    mem_event_ring_unlock(med);
}

static int mem_event_grab_slot(struct mem_event_domain *med, int foreign)
{
    unsigned int avail_req;

    if ( !med->ring_page )
        return -ENOSYS;

    mem_event_ring_lock(med);

    avail_req = mem_event_ring_available(med);
    if ( avail_req == 0 )
    {
        mem_event_ring_unlock(med);
        return -EBUSY;
    }

    if ( !foreign )
        med->target_producers++;
    else
        med->foreign_producers++;

    mem_event_ring_unlock(med);

    return 0;
}

/* Simple try_grab wrapper for use in the wait_event() macro. */
static int mem_event_wait_try_grab(struct mem_event_domain *med, int *rc)
{
    *rc = mem_event_grab_slot(med, 0);
    return *rc;
}

/* Loop on mem_event_grab_slot() until it stops returning -EBUSY, i.e. until
 * the ring has space again or no longer exists. */
static int mem_event_wait_slot(struct mem_event_domain *med)
{
    int rc = -EBUSY;
    wait_event(med->wq, mem_event_wait_try_grab(med, &rc) != -EBUSY);
    return rc;
}

bool_t mem_event_check_ring(struct mem_event_domain *med)
{
    return (med->ring_page != NULL);
}

/*
 * Determines whether or not the current vCPU belongs to the target domain,
 * and calls the appropriate wait function.  If it is a guest vCPU, then we
 * use mem_event_wait_slot() to reserve a slot.  As long as there is a ring,
 * this function will always return 0 for a guest.  For a non-guest, we check
 * for space and return -EBUSY if the ring is not available.
 *
 * Return codes: -ENOSYS: the ring is not yet configured
 *               -EBUSY:  the ring is busy
 *               0:       a spot has been reserved
 */
int __mem_event_claim_slot(struct domain *d, struct mem_event_domain *med,
                           bool_t allow_sleep)
{
    if ( (current->domain == d) && allow_sleep )
        return mem_event_wait_slot(med);
    else
        return mem_event_grab_slot(med, (current->domain != d));
}

/* Registered with Xen-bound event channel for incoming notifications. */
static void mem_paging_notification(struct vcpu *v, unsigned int port)
{
    if ( likely(v->domain->mem_event->paging.ring_page != NULL) )
        p2m_mem_paging_resume(v->domain);
}

/* Registered with Xen-bound event channel for incoming notifications. */
static void mem_access_notification(struct vcpu *v, unsigned int port)
{
    if ( likely(v->domain->mem_event->access.ring_page != NULL) )
        p2m_mem_access_resume(v->domain);
}

/* Registered with Xen-bound event channel for incoming notifications. */
static void mem_sharing_notification(struct vcpu *v, unsigned int port)
{
    if ( likely(v->domain->mem_event->share.ring_page != NULL) )
        mem_sharing_sharing_resume(v->domain);
}

int do_mem_event_op(int op, uint32_t domain, void *arg)
{
    int ret;
    struct domain *d;

    ret = rcu_lock_live_remote_domain_by_id(domain, &d);
    if ( ret )
        return ret;

    ret = xsm_mem_event_op(XSM_TARGET, d, op);
    if ( ret )
        goto out;

    switch ( op )
    {
    case XENMEM_paging_op:
        ret = mem_paging_memop(d, (xen_mem_event_op_t *) arg);
        break;
    case XENMEM_access_op:
        ret = mem_access_memop(d, (xen_mem_event_op_t *) arg);
        break;
    case XENMEM_sharing_op:
        ret = mem_sharing_memop(d, (xen_mem_sharing_op_t *) arg);
        break;
    default:
        ret = -ENOSYS;
    }

 out:
    rcu_unlock_domain(d);
    return ret;
}

/* Clean up on domain destruction */
void mem_event_cleanup(struct domain *d)
{
    if ( d->mem_event->paging.ring_page ) {
        /* Destroying the wait queue head means waking up all
         * queued vcpus. This will drain the list, allowing
         * the disable routine to complete. It will also drop
         * all domain refs the wait-queued vcpus are holding.
         * Finally, because this code path involves previously
         * pausing the domain (domain_kill), unpausing the
         * vcpus causes no harm. */
        destroy_waitqueue_head(&d->mem_event->paging.wq);
        (void)mem_event_disable(d, &d->mem_event->paging);
    }
    if ( d->mem_event->access.ring_page ) {
        destroy_waitqueue_head(&d->mem_event->access.wq);
        (void)mem_event_disable(d, &d->mem_event->access);
    }
    if ( d->mem_event->share.ring_page ) {
        destroy_waitqueue_head(&d->mem_event->share.wq);
        (void)mem_event_disable(d, &d->mem_event->share);
    }
}
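/*
 * Enable flow as seen from this file, sketched for the access ring (the
 * paging and sharing rings work the same way); the helper-side mapping and
 * event-channel binding happen in the toolstack and are only assumed here:
 *
 *   1. The helper stores the ring gfn in HVM_PARAM_ACCESS_RING_PFN and
 *      issues XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE.
 *   2. mem_event_domctl() calls mem_event_enable(), which maps that gfn,
 *      initialises the front ring, and returns the allocated event channel
 *      port in mec->port.
 *   3. The helper maps the same gfn, binds the returned port, and from then
 *      on consumes requests and posts responses on the shared ring.
 */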
int mem_event_domctl(struct domain *d, xen_domctl_mem_event_op_t *mec,
                     XEN_GUEST_HANDLE_PARAM(void) u_domctl)
{
    int rc;

    rc = xsm_mem_event_control(XSM_PRIV, d, mec->mode, mec->op);
    if ( rc )
        return rc;

    if ( unlikely(d == current->domain) )
    {
        gdprintk(XENLOG_INFO, "Tried to do a memory event op on itself.\n");
        return -EINVAL;
    }

    if ( unlikely(d->is_dying) )
    {
        gdprintk(XENLOG_INFO, "Ignoring memory event op on dying domain %u\n",
                 d->domain_id);
        return 0;
    }

    if ( unlikely(d->vcpu == NULL) || unlikely(d->vcpu[0] == NULL) )
    {
        gdprintk(XENLOG_INFO,
                 "Memory event op on a domain (%u) with no vcpus\n",
                 d->domain_id);
        return -EINVAL;
    }

    rc = -ENOSYS;

    switch ( mec->mode )
    {
    case XEN_DOMCTL_MEM_EVENT_OP_PAGING:
    {
        struct mem_event_domain *med = &d->mem_event->paging;
        rc = -EINVAL;

        switch ( mec->op )
        {
        case XEN_DOMCTL_MEM_EVENT_OP_PAGING_ENABLE:
        {
            struct p2m_domain *p2m = p2m_get_hostp2m(d);

            rc = -ENODEV;
            /* Only HAP is supported */
            if ( !hap_enabled(d) )
                break;

            /* No paging if iommu is used */
            rc = -EMLINK;
            if ( unlikely(need_iommu(d)) )
                break;

            rc = -EXDEV;
            /* Disallow paging in a PoD guest */
            if ( p2m->pod.entry_count )
                break;

            rc = mem_event_enable(d, mec, med, _VPF_mem_paging,
                                  HVM_PARAM_PAGING_RING_PFN,
                                  mem_paging_notification);
        }
        break;

        case XEN_DOMCTL_MEM_EVENT_OP_PAGING_DISABLE:
        {
            if ( med->ring_page )
                rc = mem_event_disable(d, med);
        }
        break;

        default:
            rc = -ENOSYS;
            break;
        }
    }
    break;

    case XEN_DOMCTL_MEM_EVENT_OP_ACCESS:
    {
        struct mem_event_domain *med = &d->mem_event->access;
        rc = -EINVAL;

        switch ( mec->op )
        {
        case XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE:
        {
            rc = -ENODEV;
            /* Only HAP is supported */
            if ( !hap_enabled(d) )
                break;

            /* Currently only EPT is supported */
            if ( !cpu_has_vmx )
                break;

            rc = mem_event_enable(d, mec, med, _VPF_mem_access,
                                  HVM_PARAM_ACCESS_RING_PFN,
                                  mem_access_notification);
        }
        break;

        case XEN_DOMCTL_MEM_EVENT_OP_ACCESS_DISABLE:
        {
            if ( med->ring_page )
                rc = mem_event_disable(d, med);
        }
        break;

        default:
            rc = -ENOSYS;
            break;
        }
    }
    break;

    case XEN_DOMCTL_MEM_EVENT_OP_SHARING:
    {
        struct mem_event_domain *med = &d->mem_event->share;
        rc = -EINVAL;

        switch ( mec->op )
        {
        case XEN_DOMCTL_MEM_EVENT_OP_SHARING_ENABLE:
        {
            rc = -ENODEV;
            /* Only HAP is supported */
            if ( !hap_enabled(d) )
                break;

            rc = mem_event_enable(d, mec, med, _VPF_mem_sharing,
                                  HVM_PARAM_SHARING_RING_PFN,
                                  mem_sharing_notification);
        }
        break;

        case XEN_DOMCTL_MEM_EVENT_OP_SHARING_DISABLE:
        {
            if ( med->ring_page )
                rc = mem_event_disable(d, med);
        }
        break;

        default:
            rc = -ENOSYS;
            break;
        }
    }
    break;

    default:
        rc = -ENOSYS;
    }

    return rc;
}


/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * indent-tabs-mode: nil
 * End:
 */