initial_commit

author: root <root@artemis.panaceas.org> 2015-12-25 04:40:36 +0000
committer: root <root@artemis.panaceas.org> 2015-12-25 04:40:36 +0000
commit: 849369d6c66d3054688672f97d31fceb8e8230fb (patch)
tree: 6135abc790ca67dedbe07c39806591e70eda81ce /virt
download: linux-3.0.35-kobo-849369d6c66d3054688672f97d31fceb8e8230fb.tar.gz
linux-3.0.35-kobo-849369d6c66d3054688672f97d31fceb8e8230fb.tar.bz2
linux-3.0.35-kobo-849369d6c66d3054688672f97d31fceb8e8230fb.zip
13 files changed, 5978 insertions, 0 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
new file mode 100644
index 00000000..f63ccb0a
--- /dev/null
+++ b/virt/kvm/Kconfig
@@ -0,0 +1,20 @@
+# KVM common configuration items and defaults
+
+config HAVE_KVM
+       bool
+
+config HAVE_KVM_IRQCHIP
+       bool
+
+config HAVE_KVM_EVENTFD
+       bool
+       select EVENTFD
+
+config KVM_APIC_ARCHITECTURE
+       bool
+
+config KVM_MMIO
+       bool
+
+config KVM_ASYNC_PF
+       bool
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
new file mode 100644
index 00000000..6cc4b97e
--- /dev/null
+++ b/virt/kvm/assigned-dev.c
@@ -0,0 +1,797 @@
+/*
+ * Kernel-based Virtual Machine - device assignment support
+ *
+ * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include "irq.h"
+
+/*
+ * Look up an assigned device by id.
+ *
+ * Walks the list anchored at @head and returns the first entry whose
+ * assigned_dev_id equals @assigned_dev_id, or NULL when none matches.
+ */
+static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
+						      int assigned_dev_id)
+{
+	struct kvm_assigned_dev_kernel *adev;
+
+	list_for_each_entry(adev, head, list)
+		if (adev->assigned_dev_id == assigned_dev_id)
+			return adev;
+
+	return NULL;
+}
+
+/*
+ * Map a host MSI-X interrupt number to its slot in host_msix_entries[].
+ *
+ * Returns the index of the entry whose vector equals @irq, or -1 (after
+ * logging a warning) when no entry matches.  A miss must not be reported
+ * as index 0: the caller tests "index >= 0" before injecting, and slot 0
+ * is a valid entry that would otherwise be signalled by mistake.
+ */
+static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
+				    *assigned_dev, int irq)
+{
+	struct msix_entry *host_msix_entries = assigned_dev->host_msix_entries;
+	int i;
+
+	for (i = 0; i < assigned_dev->entries_nr; i++)
+		if (irq == host_msix_entries[i].vector)
+			return i;
+
+	printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
+	return -1;
+}
+
+/*
+ * Threaded interrupt handler for an assigned device's host interrupt.
+ *
+ * @irq:    host interrupt number that fired
+ * @dev_id: the struct kvm_assigned_dev_kernel passed at request time
+ *
+ * Forwards the interrupt to the guest with kvm_set_irq() and always
+ * returns IRQ_HANDLED.
+ */
+static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+	u32 vector;
+	int index;
+
+	/*
+	 * Host INTx: disable the line and remember that we did so;
+	 * kvm_assigned_dev_ack_irq() re-enables it when host_irq_disabled
+	 * is set.
+	 */
+	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
+		spin_lock(&assigned_dev->intx_lock);
+		disable_irq_nosync(irq);
+		assigned_dev->host_irq_disabled = true;
+		spin_unlock(&assigned_dev->intx_lock);
+	}
+
+	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+		/* Translate the host vector into the matching guest vector. */
+		index = find_index_from_host_irq(assigned_dev, irq);
+		/* Inject only when the lookup reports a non-negative index. */
+		if (index >= 0) {
+			vector = assigned_dev->
+					guest_msix_entries[index].vector;
+			kvm_set_irq(assigned_dev->kvm,
+				    assigned_dev->irq_source_id, vector, 1);
+		}
+	} else
+		/* INTx and MSI both raise the single configured guest irq. */
+		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+			    assigned_dev->guest_irq, 1);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Ack the irq line for an assigned device.
+ *
+ * @kian: the ack notifier embedded in a struct kvm_assigned_dev_kernel
+ *
+ * Lowers the guest irq and, if the host line was disabled by the
+ * interrupt thread, enables it again.  A notifier whose gsi is -1 is
+ * ignored.
+ */
+static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+	struct kvm_assigned_dev_kernel *dev;
+
+	/* gsi == -1 is what the MSI/MSI-X setup paths and deassign store. */
+	if (kian->gsi == -1)
+		return;
+
+	dev = container_of(kian, struct kvm_assigned_dev_kernel,
+			   ack_notifier);
+
+	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
+
+	/* The guest irq may be shared so this ack may be
+	 * from another device.
+	 */
+	spin_lock(&dev->intx_lock);
+	/* Only undo a disable that the interrupt thread actually performed. */
+	if (dev->host_irq_disabled) {
+		enable_irq(dev->host_irq);
+		dev->host_irq_disabled = false;
+	}
+	spin_unlock(&dev->intx_lock);
+}
+
+/*
+ * Tear down the guest side of an interrupt assignment.
+ *
+ * Unregisters the ack notifier, lowers the guest irq, releases the irq
+ * source id (when one is held) and clears the guest bits of
+ * irq_requested_type.
+ */
+static void deassign_guest_irq(struct kvm *kvm,
+			       struct kvm_assigned_dev_kernel *assigned_dev)
+{
+	kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
+	/* -1 makes kvm_assigned_dev_ack_irq() ignore this notifier. */
+	assigned_dev->ack_notifier.gsi = -1;
+
+	/*
+	 * NOTE(review): this runs before the irq_source_id != -1 check
+	 * below, so it relies on the caller only invoking us while a
+	 * source id is held -- confirm.
+	 */
+	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+		    assigned_dev->guest_irq, 0);
+
+	/* -1 marks "no source id allocated". */
+	if (assigned_dev->irq_source_id != -1)
+		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
+	assigned_dev->irq_source_id = -1;
+	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
+}
+
+/*
+ * Tear down the host side of an interrupt assignment.
+ *
+ * Disables and frees every host interrupt requested for @assigned_dev,
+ * turns MSI/MSI-X back off on the PCI device and clears the host bits of
+ * irq_requested_type.  The ioctl paths in this file call this with
+ * kvm->lock held.
+ */
+static void deassign_host_irq(struct kvm *kvm,
+			      struct kvm_assigned_dev_kernel *assigned_dev)
+{
+	/*
+	 * We disable the irq here to prevent further events.
+	 *
+	 * Notice this may result in a nested disable if the interrupt type
+	 * is INTx, but that is OK because we are about to free it.
+	 *
+	 * If this function runs as part of VM destruction, make sure the
+	 * kvm state is still valid at this point: we may have to wait for
+	 * a currently running IRQ handler.
+	 */
+	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+		int i;
+		for (i = 0; i < assigned_dev->entries_nr; i++)
+			disable_irq(assigned_dev->host_msix_entries[i].vector);
+
+		for (i = 0; i < assigned_dev->entries_nr; i++)
+			free_irq(assigned_dev->host_msix_entries[i].vector,
+				 (void *)assigned_dev);
+
+		assigned_dev->entries_nr = 0;
+		kfree(assigned_dev->host_msix_entries);
+		kfree(assigned_dev->guest_msix_entries);
+		/*
+		 * entries_nr == 0 lets userspace configure MSI-X again, so
+		 * do not leave pointers to the freed arrays behind.
+		 */
+		assigned_dev->host_msix_entries = NULL;
+		assigned_dev->guest_msix_entries = NULL;
+		pci_disable_msix(assigned_dev->dev);
+	} else {
+		/* Deal with MSI and INTx */
+		disable_irq(assigned_dev->host_irq);
+
+		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+
+		if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
+			pci_disable_msi(assigned_dev->dev);
+	}
+
+	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
+}
+
+/*
+ * Remove the host and/or guest interrupt assignment of a device.
+ *
+ * @irq_requested_type selects which side(s) to tear down through its
+ * KVM_DEV_IRQ_HOST_MASK / KVM_DEV_IRQ_GUEST_MASK bits.  A side is only
+ * torn down when it is currently assigned; otherwise the teardown helpers
+ * would free an interrupt that was never requested or lower an irq using
+ * the "no source id" value of -1.
+ *
+ * Returns 0 on success, -EINVAL without an in-kernel irqchip, or -ENXIO
+ * when the device has no interrupt assignment at all.
+ */
+static int kvm_deassign_irq(struct kvm *kvm,
+			    struct kvm_assigned_dev_kernel *assigned_dev,
+			    unsigned long irq_requested_type)
+{
+	unsigned long guest_irq_type, host_irq_type;
+
+	if (!irqchip_in_kernel(kvm))
+		return -EINVAL;
+	/* no irq assignment to deassign */
+	if (!assigned_dev->irq_requested_type)
+		return -ENXIO;
+
+	host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
+	guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
+
+	if (host_irq_type &&
+	    (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK))
+		deassign_host_irq(kvm, assigned_dev);
+	if (guest_irq_type &&
+	    (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK))
+		deassign_guest_irq(kvm, assigned_dev);
+
+	return 0;
+}
+
+/*
+ * Deassign whatever interrupt types the device currently has assigned.
+ * The result of kvm_deassign_irq() is deliberately not propagated.
+ */
+static void kvm_free_assigned_irq(struct kvm *kvm,
+				  struct kvm_assigned_dev_kernel *assigned_dev)
+{
+	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
+}
+
+/*
+ * Release one assigned device and free its bookkeeping structure.
+ *
+ * Order: drop interrupt assignments, reset the PCI function, reload the
+ * saved PCI state, release regions, disable the device, drop the pci_dev
+ * reference, unlink from the list and free @assigned_dev.
+ * @assigned_dev must not be used after this returns.
+ */
+static void kvm_free_assigned_device(struct kvm *kvm,
+				     struct kvm_assigned_dev_kernel
+				     *assigned_dev)
+{
+	kvm_free_assigned_irq(kvm, assigned_dev);
+
+	pci_reset_function(assigned_dev->dev);
+	/* Only restore when the saved state could be loaded back. */
+	if (pci_load_and_free_saved_state(assigned_dev->dev,
+					  &assigned_dev->pci_saved_state))
+		printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
+		       __func__, dev_name(&assigned_dev->dev->dev));
+	else
+		pci_restore_state(assigned_dev->dev);
+
+	pci_release_regions(assigned_dev->dev);
+	pci_disable_device(assigned_dev->dev);
+	pci_dev_put(assigned_dev->dev);
+
+	list_del(&assigned_dev->list);
+	kfree(assigned_dev);
+}
+
+/*
+ * Release every device on kvm->arch.assigned_dev_head.
+ *
+ * Uses the "safe" iterator because kvm_free_assigned_device() unlinks and
+ * frees the entry being visited.
+ */
+void kvm_free_all_assigned_devices(struct kvm *kvm)
+{
+	struct kvm_assigned_dev_kernel *adev, *next;
+
+	list_for_each_entry_safe(adev, next, &kvm->arch.assigned_dev_head,
+				 list)
+		kvm_free_assigned_device(kvm, adev);
+}
+
+/*
+ * Request the device's legacy INTx line for kvm_assigned_dev_thread().
+ *
+ * Returns 0 on success or -EIO when the interrupt cannot be requested.
+ */
+static int assigned_device_enable_host_intx(struct kvm *kvm,
+					    struct kvm_assigned_dev_kernel *dev)
+{
+	int err;
+
+	dev->host_irq = dev->dev->irq;
+
+	/*
+	 * Although this is a PCI interrupt, it is intentionally requested
+	 * without sharing: a host device sharing a line with a
+	 * guest-assigned device would see long delays in accepting,
+	 * acking, etc.
+	 */
+	err = request_threaded_irq(dev->host_irq, NULL,
+				   kvm_assigned_dev_thread, IRQF_ONESHOT,
+				   dev->irq_name, (void *)dev);
+
+	return err ? -EIO : 0;
+}
+
+#ifdef __KVM_HAVE_MSI
+/*
+ * Enable MSI on the host device (unless already enabled) and request its
+ * interrupt for kvm_assigned_dev_thread().
+ *
+ * Returns 0 on success, the pci_enable_msi() error, or -EIO when the
+ * interrupt cannot be requested (MSI is disabled again in that case).
+ */
+static int assigned_device_enable_host_msi(struct kvm *kvm,
+					   struct kvm_assigned_dev_kernel *dev)
+{
+	struct pci_dev *pdev = dev->dev;
+	int err;
+
+	if (!pdev->msi_enabled) {
+		err = pci_enable_msi(pdev);
+		if (err)
+			return err;
+	}
+
+	dev->host_irq = pdev->irq;
+	err = request_threaded_irq(dev->host_irq, NULL,
+				   kvm_assigned_dev_thread, 0,
+				   dev->irq_name, (void *)dev);
+	if (!err)
+		return 0;
+
+	pci_disable_msi(pdev);
+	return -EIO;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+/*
+ * Enable MSI-X on the host device and request one threaded interrupt per
+ * configured entry.
+ *
+ * The entry tables must already have been set up (entries_nr != 0).
+ * Returns 0 on success.  On failure every interrupt requested so far is
+ * freed, MSI-X is disabled again and the error is returned.
+ */
+static int assigned_device_enable_host_msix(struct kvm *kvm,
+					    struct kvm_assigned_dev_kernel *dev)
+{
+	int i, r;
+
+	/* host_msix_entries and guest_msix_entries should have been
+	 * initialized */
+	if (dev->entries_nr == 0)
+		return -EINVAL;
+
+	r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
+	if (r)
+		return r;
+
+	for (i = 0; i < dev->entries_nr; i++) {
+		r = request_threaded_irq(dev->host_msix_entries[i].vector,
+					 NULL, kvm_assigned_dev_thread,
+					 0, dev->irq_name, (void *)dev);
+		if (r)
+			break;
+	}
+	if (!r)
+		return 0;
+
+	/* Unwind: entry i failed, so release entries i-1 .. 0. */
+	while (--i >= 0)
+		free_irq(dev->host_msix_entries[i].vector, (void *)dev);
+	pci_disable_msix(dev->dev);
+	return r;
+}
+
+#endif
+
+/*
+ * Record the guest INTx irq and use it as the ack notifier's gsi.
+ * Always returns 0.
+ */
+static int assigned_device_enable_guest_intx(struct kvm *kvm,
+				struct kvm_assigned_dev_kernel *dev,
+				struct kvm_assigned_irq *irq)
+{
+	dev->guest_irq = irq->guest_irq;
+	dev->ack_notifier.gsi = irq->guest_irq;
+	return 0;
+}
+
+#ifdef __KVM_HAVE_MSI
+/*
+ * Record the guest irq for MSI delivery.  The ack notifier gsi is set to
+ * -1, which kvm_assigned_dev_ack_irq() treats as "ignore".
+ * Always returns 0.
+ */
+static int assigned_device_enable_guest_msi(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *dev,
+			struct kvm_assigned_irq *irq)
+{
+	dev->guest_irq = irq->guest_irq;
+	dev->ack_notifier.gsi = -1;
+	dev->host_irq_disabled = false;
+	return 0;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+/*
+ * Record the guest irq for MSI-X delivery.  The ack notifier gsi is set
+ * to -1, which kvm_assigned_dev_ack_irq() treats as "ignore".
+ * Always returns 0.
+ */
+static int assigned_device_enable_guest_msix(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *dev,
+			struct kvm_assigned_irq *irq)
+{
+	dev->guest_irq = irq->guest_irq;
+	dev->ack_notifier.gsi = -1;
+	dev->host_irq_disabled = false;
+	return 0;
+}
+#endif
+
+/*
+ * Set up the host side of an interrupt assignment.
+ *
+ * @host_irq_type is a single KVM_DEV_IRQ_HOST_* flag.  Returns 0 on
+ * success and records the type in dev->irq_requested_type, -EEXIST when a
+ * host interrupt is already assigned, -EINVAL for an unsupported type, or
+ * the error from the type-specific enable helper.
+ */
+static int assign_host_irq(struct kvm *kvm,
+			   struct kvm_assigned_dev_kernel *dev,
+			   __u32 host_irq_type)
+{
+	int r;
+
+	if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
+		return -EEXIST;
+
+	/* Name under which the interrupt is requested. */
+	snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
+		 pci_name(dev->dev));
+
+	switch (host_irq_type) {
+	case KVM_DEV_IRQ_HOST_INTX:
+		r = assigned_device_enable_host_intx(kvm, dev);
+		break;
+#ifdef __KVM_HAVE_MSI
+	case KVM_DEV_IRQ_HOST_MSI:
+		r = assigned_device_enable_host_msi(kvm, dev);
+		break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+	case KVM_DEV_IRQ_HOST_MSIX:
+		r = assigned_device_enable_host_msix(kvm, dev);
+		break;
+#endif
+	default:
+		r = -EINVAL;
+		break;
+	}
+	if (r)
+		return r;
+
+	dev->irq_requested_type |= host_irq_type;
+	return 0;
+}
+
+/*
+ * Set up the guest side of an interrupt assignment.
+ *
+ * Allocates an irq source id, applies the type-specific guest settings
+ * and, on success, records @guest_irq_type and registers the ack
+ * notifier.
+ *
+ * Returns 0 on success, -EEXIST when a guest interrupt is already
+ * assigned, -EINVAL for an unsupported type, or the error from
+ * kvm_request_irq_source_id().  On failure the source id is released and
+ * dev->irq_source_id is reset to -1 so that a later teardown cannot free
+ * the same id again.
+ */
+static int assign_guest_irq(struct kvm *kvm,
+			    struct kvm_assigned_dev_kernel *dev,
+			    struct kvm_assigned_irq *irq,
+			    unsigned long guest_irq_type)
+{
+	int id;
+	int r = -EEXIST;
+
+	if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
+		return r;
+
+	id = kvm_request_irq_source_id(kvm);
+	if (id < 0)
+		return id;
+
+	dev->irq_source_id = id;
+
+	switch (guest_irq_type) {
+	case KVM_DEV_IRQ_GUEST_INTX:
+		r = assigned_device_enable_guest_intx(kvm, dev, irq);
+		break;
+#ifdef __KVM_HAVE_MSI
+	case KVM_DEV_IRQ_GUEST_MSI:
+		r = assigned_device_enable_guest_msi(kvm, dev, irq);
+		break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+	case KVM_DEV_IRQ_GUEST_MSIX:
+		r = assigned_device_enable_guest_msix(kvm, dev, irq);
+		break;
+#endif
+	default:
+		r = -EINVAL;
+	}
+
+	if (!r) {
+		dev->irq_requested_type |= guest_irq_type;
+		kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
+	} else {
+		kvm_free_irq_source_id(kvm, dev->irq_source_id);
+		/* -1 means "no source id held"; do not keep the stale id. */
+		dev->irq_source_id = -1;
+	}
+
+	return r;
+}
+
+/*
+ * Handle an interrupt assignment request for an assigned device.
+ *
+ * At most one host type and one guest type may be requested, and at least
+ * one of the two must be present.  Returns 0 on success, -EINVAL for bad
+ * flags or a missing in-kernel irqchip, -ENODEV for an unknown device id,
+ * or the error from the host/guest assignment helpers.
+ *
+ * TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX
+ */
+static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
+				   struct kvm_assigned_irq *assigned_irq)
+{
+	struct kvm_assigned_dev_kernel *adev;
+	unsigned long host_irq_type, guest_irq_type;
+	int r;
+
+	if (!irqchip_in_kernel(kvm))
+		return -EINVAL;
+
+	mutex_lock(&kvm->lock);
+
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				     assigned_irq->assigned_dev_id);
+	if (!adev) {
+		r = -ENODEV;
+		goto unlock;
+	}
+
+	host_irq_type = assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK;
+	guest_irq_type = assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK;
+
+	/* can only assign one type at a time, and need at least one */
+	if (hweight_long(host_irq_type) > 1 ||
+	    hweight_long(guest_irq_type) > 1 ||
+	    (!host_irq_type && !guest_irq_type)) {
+		r = -EINVAL;
+		goto unlock;
+	}
+
+	if (host_irq_type) {
+		r = assign_host_irq(kvm, adev, host_irq_type);
+		if (r)
+			goto unlock;
+	}
+
+	if (guest_irq_type)
+		r = assign_guest_irq(kvm, adev, assigned_irq, guest_irq_type);
+	else
+		r = 0;
+unlock:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
+/*
+ * Handle an interrupt deassignment request for an assigned device.
+ *
+ * Returns -ENODEV when the device id is unknown, otherwise the result of
+ * kvm_deassign_irq() for the requested flags.  Runs under kvm->lock.
+ */
+static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
+					 struct kvm_assigned_irq
+					 *assigned_irq)
+{
+	struct kvm_assigned_dev_kernel *adev;
+	int ret;
+
+	mutex_lock(&kvm->lock);
+
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				     assigned_irq->assigned_dev_id);
+	if (adev)
+		ret = kvm_deassign_irq(kvm, adev, assigned_irq->flags);
+	else
+		ret = -ENODEV;
+
+	mutex_unlock(&kvm->lock);
+	return ret;
+}
+
+/*
+ * Assign a host PCI device to the VM.
+ *
+ * Looks the device up by segment/bus/devfn, enables it, claims its
+ * regions, resets it, saves its PCI state and links a new
+ * kvm_assigned_dev_kernel onto kvm->arch.assigned_dev_head.  When
+ * KVM_DEV_ASSIGN_ENABLE_IOMMU is set the device is also attached through
+ * kvm_assign_device().
+ *
+ * Returns 0 on success, -EEXIST if the id is already in use, -ENOMEM,
+ * -EINVAL if the host device is not found, -EBUSY if it cannot be
+ * enabled, or the error from region/IOMMU setup.  Runs under kvm->lock
+ * and an SRCU read-side section.
+ */
+static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
+				      struct kvm_assigned_pci_dev *assigned_dev)
+{
+	int r = 0, idx;
+	struct kvm_assigned_dev_kernel *match;
+	struct pci_dev *dev;
+
+	mutex_lock(&kvm->lock);
+	idx = srcu_read_lock(&kvm->srcu);
+
+	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev->assigned_dev_id);
+	if (match) {
+		/* device already assigned */
+		r = -EEXIST;
+		goto out;
+	}
+
+	match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
+	if (match == NULL) {
+		printk(KERN_INFO "%s: Couldn't allocate memory\n",
+		       __func__);
+		r = -ENOMEM;
+		goto out;
+	}
+	/* The reference taken here is dropped with pci_dev_put(). */
+	dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
+				   assigned_dev->busnr,
+				   assigned_dev->devfn);
+	if (!dev) {
+		printk(KERN_INFO "%s: host device not found\n", __func__);
+		r = -EINVAL;
+		goto out_free;
+	}
+	if (pci_enable_device(dev)) {
+		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
+		r = -EBUSY;
+		goto out_put;
+	}
+	r = pci_request_regions(dev, "kvm_assigned_device");
+	if (r) {
+		printk(KERN_INFO "%s: Could not get access to device regions\n",
+		       __func__);
+		goto out_disable;
+	}
+
+	/* Reset, then snapshot the PCI state for restore on release. */
+	pci_reset_function(dev);
+	pci_save_state(dev);
+	match->pci_saved_state = pci_store_saved_state(dev);
+	/* A missing snapshot is only logged; assignment continues. */
+	if (!match->pci_saved_state)
+		printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
+		       __func__, dev_name(&dev->dev));
+	match->assigned_dev_id = assigned_dev->assigned_dev_id;
+	match->host_segnr = assigned_dev->segnr;
+	match->host_busnr = assigned_dev->busnr;
+	match->host_devfn = assigned_dev->devfn;
+	match->flags = assigned_dev->flags;
+	match->dev = dev;
+	spin_lock_init(&match->intx_lock);
+	/* -1 marks "no irq source id allocated yet". */
+	match->irq_source_id = -1;
+	match->kvm = kvm;
+	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
+
+	list_add(&match->list, &kvm->arch.assigned_dev_head);
+
+	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
+		/* Map guest memory once, before the first IOMMU device. */
+		if (!kvm->arch.iommu_domain) {
+			r = kvm_iommu_map_guest(kvm);
+			if (r)
+				goto out_list_del;
+		}
+		r = kvm_assign_device(kvm, match);
+		if (r)
+			goto out_list_del;
+	}
+
+	/* Success and the two early failures leave through here. */
+out:
+	srcu_read_unlock(&kvm->srcu, idx);
+	mutex_unlock(&kvm->lock);
+	return r;
+	/* Error unwind: each label undoes one setup step, newest first. */
+out_list_del:
+	if (pci_load_and_free_saved_state(dev, &match->pci_saved_state))
+		printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
+		       __func__, dev_name(&dev->dev));
+	list_del(&match->list);
+	pci_release_regions(dev);
+out_disable:
+	pci_disable_device(dev);
+out_put:
+	pci_dev_put(dev);
+out_free:
+	kfree(match);
+	srcu_read_unlock(&kvm->srcu, idx);
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
+/*
+ * Remove a previously assigned PCI device from the VM.
+ *
+ * Detaches the device through kvm_deassign_device() when it was assigned
+ * with KVM_DEV_ASSIGN_ENABLE_IOMMU, then releases it.  Returns 0 on
+ * success or -EINVAL when the id is unknown.  Runs under kvm->lock.
+ */
+static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
+		struct kvm_assigned_pci_dev *assigned_dev)
+{
+	struct kvm_assigned_dev_kernel *adev;
+	int ret = -EINVAL;
+
+	mutex_lock(&kvm->lock);
+
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				     assigned_dev->assigned_dev_id);
+	if (adev) {
+		if (adev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
+			kvm_deassign_device(kvm, adev);
+
+		kvm_free_assigned_device(kvm, adev);
+		ret = 0;
+	} else {
+		printk(KERN_INFO "%s: device hasn't been assigned before, "
+		  "so cannot be deassigned\n", __func__);
+	}
+
+	mutex_unlock(&kvm->lock);
+	return ret;
+}
+
+
+#ifdef __KVM_HAVE_MSIX
+/*
+ * Set the number of MSI-X entries of an assigned device and allocate the
+ * host and guest entry tables.
+ *
+ * The count may only be set while the device has none configured.  The
+ * device is modified only after validation and both allocations have
+ * succeeded, so a failed call leaves it exactly as it was: entries_nr
+ * stays 0 and no table pointer is changed.
+ *
+ * Returns 0 on success, -EINVAL for an unknown device, an already
+ * configured count or a count outside 1..KVM_MAX_MSIX_PER_DEV-1, and
+ * -ENOMEM when a table cannot be allocated.  Runs under kvm->lock.
+ */
+static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
+				    struct kvm_assigned_msix_nr *entry_nr)
+{
+	struct msix_entry *host_entries, *guest_entries;
+	struct kvm_assigned_dev_kernel *adev;
+	int nr = entry_nr->entry_nr;
+	int r = 0;
+
+	mutex_lock(&kvm->lock);
+
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      entry_nr->assigned_dev_id);
+	if (!adev) {
+		r = -EINVAL;
+		goto msix_nr_out;
+	}
+
+	/* Not allowed set MSI-X number twice */
+	if (adev->entries_nr != 0) {
+		r = -EINVAL;
+		goto msix_nr_out;
+	}
+
+	if (nr <= 0 || nr >= KVM_MAX_MSIX_PER_DEV) {
+		r = -EINVAL;
+		goto msix_nr_out;
+	}
+
+	host_entries = kcalloc(nr, sizeof(*host_entries), GFP_KERNEL);
+	guest_entries = kcalloc(nr, sizeof(*guest_entries), GFP_KERNEL);
+	if (!host_entries || !guest_entries) {
+		/* kfree(NULL) is a no-op, so both can be freed blindly. */
+		kfree(host_entries);
+		kfree(guest_entries);
+		r = -ENOMEM;
+		goto msix_nr_out;
+	}
+
+	/* Publish the count only together with valid tables. */
+	adev->host_msix_entries = host_entries;
+	adev->guest_msix_entries = guest_entries;
+	adev->entries_nr = nr;
+msix_nr_out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
+/*
+ * Record one guest MSI-X entry -> gsi mapping for an assigned device.
+ *
+ * The first slot that is either unused (guest vector 0) or already holds
+ * the same entry number is (re)written.  Returns 0 on success, -EINVAL
+ * for an unknown device or -ENOSPC when no slot qualifies.  Runs under
+ * kvm->lock.
+ */
+static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
+				       struct kvm_assigned_msix_entry *entry)
+{
+	struct kvm_assigned_dev_kernel *adev;
+	int slot, ret = 0;
+
+	mutex_lock(&kvm->lock);
+
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      entry->assigned_dev_id);
+	if (!adev) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	for (slot = 0; slot < adev->entries_nr; slot++) {
+		struct msix_entry *guest = &adev->guest_msix_entries[slot];
+
+		if (guest->vector != 0 && guest->entry != entry->entry)
+			continue;
+
+		guest->entry = entry->entry;
+		guest->vector = entry->gsi;
+		adev->host_msix_entries[slot].entry = entry->entry;
+		break;
+	}
+
+	/* The loop ran off the end: every slot is taken by another entry. */
+	if (slot == adev->entries_nr)
+		ret = -ENOSPC;
+
+unlock:
+	mutex_unlock(&kvm->lock);
+
+	return ret;
+}
+#endif
+
+/*
+ * Dispatch the device-assignment related VM ioctls.
+ *
+ * @ioctl: ioctl command number
+ * @arg:   userspace pointer to the command's argument structure
+ *
+ * Each case copies its argument from userspace (-EFAULT on failure) and
+ * returns the result of the matching handler.  KVM_ASSIGN_IRQ yields
+ * -EOPNOTSUPP and unknown commands yield -ENOTTY.
+ */
+long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+				  unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	int r;
+
+	switch (ioctl) {
+	case KVM_ASSIGN_PCI_DEVICE: {
+		struct kvm_assigned_pci_dev assigned_dev;
+
+		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+			return -EFAULT;
+		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
+		break;
+	}
+	case KVM_ASSIGN_IRQ:
+		r = -EOPNOTSUPP;
+		break;
+	case KVM_ASSIGN_DEV_IRQ: {
+		struct kvm_assigned_irq assigned_irq;
+
+		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+			return -EFAULT;
+		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
+		break;
+	}
+	case KVM_DEASSIGN_DEV_IRQ: {
+		struct kvm_assigned_irq assigned_irq;
+
+		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+			return -EFAULT;
+		r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
+		break;
+	}
+	case KVM_DEASSIGN_PCI_DEVICE: {
+		struct kvm_assigned_pci_dev assigned_dev;
+
+		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+			return -EFAULT;
+		r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
+		break;
+	}
+#ifdef KVM_CAP_IRQ_ROUTING
+	case KVM_SET_GSI_ROUTING: {
+		struct kvm_irq_routing routing;
+		struct kvm_irq_routing __user *urouting;
+		struct kvm_irq_routing_entry *entries;
+
+		if (copy_from_user(&routing, argp, sizeof(routing)))
+			return -EFAULT;
+		if (routing.nr >= KVM_MAX_IRQ_ROUTES)
+			return -EINVAL;
+		if (routing.flags)
+			return -EINVAL;
+
+		entries = vmalloc(routing.nr * sizeof(*entries));
+		if (!entries)
+			return -ENOMEM;
+
+		/* The entry array follows the header in the user buffer. */
+		urouting = argp;
+		if (copy_from_user(entries, urouting->entries,
+				   routing.nr * sizeof(*entries)))
+			r = -EFAULT;
+		else
+			r = kvm_set_irq_routing(kvm, entries, routing.nr,
+						routing.flags);
+		vfree(entries);
+		break;
+	}
+#endif /* KVM_CAP_IRQ_ROUTING */
+#ifdef __KVM_HAVE_MSIX
+	case KVM_ASSIGN_SET_MSIX_NR: {
+		struct kvm_assigned_msix_nr entry_nr;
+
+		if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
+			return -EFAULT;
+		r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
+		break;
+	}
+	case KVM_ASSIGN_SET_MSIX_ENTRY: {
+		struct kvm_assigned_msix_entry entry;
+
+		if (copy_from_user(&entry, argp, sizeof entry))
+			return -EFAULT;
+		r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
+		break;
+	}
+#endif
+	default:
+		r = -ENOTTY;
+		break;
+	}
+
+	return r;
+}
+
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
new file mode 100644
index 00000000..74268b4c
--- /dev/null
+++ b/virt/kvm/async_pf.c
@@ -0,0 +1,216 @@
+/*
+ * kvm asynchronous fault support
+ *
+ * Copyright 2010 Red Hat, Inc.
+ *
+ * Author:
+ *      Gleb Natapov <gleb@redhat.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/mmu_context.h>
+
+#include "async_pf.h"
+#include <trace/events/kvm.h>
+
+static struct kmem_cache *async_pf_cache;
+
+/*
+ * Create the slab cache used for struct kvm_async_pf objects.
+ * Returns 0 on success or -ENOMEM if the cache cannot be created.
+ */
+int kvm_async_pf_init(void)
+{
+	async_pf_cache = KMEM_CACHE(kvm_async_pf, 0);
+
+	return async_pf_cache ? 0 : -ENOMEM;
+}
+
+/*
+ * Destroy the async page fault slab cache, if it exists, and forget it
+ * so that a repeated call is harmless.
+ */
+void kvm_async_pf_deinit(void)
+{
+	if (!async_pf_cache)
+		return;
+
+	kmem_cache_destroy(async_pf_cache);
+	async_pf_cache = NULL;
+}
+
+/*
+ * Initialise the per-vcpu async page fault state: the list of completed
+ * items, the list of queued items and the lock taken around accesses to
+ * the completed list.
+ */
+void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	INIT_LIST_HEAD(&vcpu->async_pf.done);
+	INIT_LIST_HEAD(&vcpu->async_pf.queue);
+	spin_lock_init(&vcpu->async_pf.lock);
+}
+
+/*
+ * Work item body: fault in the page for one async page fault request.
+ *
+ * Faults the page in against the recorded mm, moves the request onto the
+ * vcpu's "done" list, wakes the vcpu if it is waiting, and drops the mm
+ * and kvm references taken in kvm_setup_async_pf().
+ */
+static void async_pf_execute(struct work_struct *work)
+{
+	struct page *page = NULL;
+	struct kvm_async_pf *apf =
+		container_of(work, struct kvm_async_pf, work);
+	/* Copy what is needed later: apf may be freed once it is "done". */
+	struct mm_struct *mm = apf->mm;
+	struct kvm_vcpu *vcpu = apf->vcpu;
+	unsigned long addr = apf->addr;
+	gva_t gva = apf->gva;
+
+	might_sleep();
+
+	use_mm(mm);
+	down_read(&mm->mmap_sem);
+	/* The result is not checked; on failure page simply stays NULL. */
+	get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL);
+	up_read(&mm->mmap_sem);
+	unuse_mm(mm);
+
+	spin_lock(&vcpu->async_pf.lock);
+	list_add_tail(&apf->link, &vcpu->async_pf.done);
+	apf->page = page;
+	apf->done = true;
+	spin_unlock(&vcpu->async_pf.lock);
+
+	/*
+	 * apf may be freed by kvm_check_async_pf_completion() after
+	 * this point
+	 */
+
+	trace_kvm_async_pf_completed(addr, page, gva);
+
+	if (waitqueue_active(&vcpu->wq))
+		wake_up_interruptible(&vcpu->wq);
+
+	/* Balance atomic_inc(&mm->mm_count) and kvm_get_kvm() from setup. */
+	mmdrop(mm);
+	kvm_put_kvm(vcpu->kvm);
+}
+
+/*
+ * Drop every outstanding and completed async page fault of @vcpu.
+ *
+ * Queued work is cancelled (or waited for).  A work item that never ran
+ * still owns the mm_count and kvm references taken in
+ * kvm_setup_async_pf(); since async_pf_execute() will not run to drop
+ * them, they are released here before the item is freed.  Items that did
+ * run are on the "done" list and are freed, together with their page
+ * reference, in the second loop.
+ */
+void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
+{
+	/* cancel outstanding work queue item */
+	while (!list_empty(&vcpu->async_pf.queue)) {
+		struct kvm_async_pf *work =
+			list_entry(vcpu->async_pf.queue.next,
+				   typeof(*work), queue);
+		cancel_work_sync(&work->work);
+		list_del(&work->queue);
+		if (!work->done) { /* work was canceled */
+			mmdrop(work->mm);
+			kvm_put_kvm(work->vcpu->kvm);
+			kmem_cache_free(async_pf_cache, work);
+		}
+	}
+
+	spin_lock(&vcpu->async_pf.lock);
+	while (!list_empty(&vcpu->async_pf.done)) {
+		struct kvm_async_pf *work =
+			list_entry(vcpu->async_pf.done.next,
+				   typeof(*work), link);
+		list_del(&work->link);
+		if (work->page)
+			put_page(work->page);
+		kmem_cache_free(async_pf_cache, work);
+	}
+	spin_unlock(&vcpu->async_pf.lock);
+
+	vcpu->async_pf.queued = 0;
+}
+
+/*
+ * Deliver completed async page faults to the vcpu.
+ *
+ * While the "done" list is non-empty and the architecture code accepts
+ * an injection, takes the first completed item, notifies the arch code,
+ * and releases the item together with its page reference.
+ */
+void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
+{
+	struct kvm_async_pf *work;
+
+	while (!list_empty_careful(&vcpu->async_pf.done) &&
+	      kvm_arch_can_inject_async_page_present(vcpu)) {
+		/* The done list is shared with async_pf_execute(). */
+		spin_lock(&vcpu->async_pf.lock);
+		work = list_first_entry(&vcpu->async_pf.done, typeof(*work),
+					      link);
+		list_del(&work->link);
+		spin_unlock(&vcpu->async_pf.lock);
+
+		/* "ready" is only signalled when a page was obtained. */
+		if (work->page)
+			kvm_arch_async_page_ready(vcpu, work);
+		kvm_arch_async_page_present(vcpu, work);
+
+		list_del(&work->queue);
+		vcpu->async_pf.queued--;
+		if (work->page)
+			put_page(work->page);
+		kmem_cache_free(async_pf_cache, work);
+	}
+}
+
+/*
+ * Queue an asynchronous page fault for @gfn on behalf of @vcpu.
+ *
+ * Returns 1 when the work item was scheduled.  Returns 0 when nothing
+ * was queued: the per-vcpu limit is reached, the allocation failed, the
+ * host address is invalid or the work could not be scheduled.
+ */
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+		       struct kvm_arch_async_pf *arch)
+{
+	struct kvm_async_pf *work;
+
+	if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU)
+		return 0;
+
+	/* setup delayed work */
+
+	/*
+	 * do alloc nowait since if we are going to sleep anyway we
+	 * may as well sleep faulting in page
+	 */
+	work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT);
+	if (!work)
+		return 0;
+
+	work->page = NULL;
+	work->done = false;
+	work->vcpu = vcpu;
+	work->gva = gva;
+	work->addr = gfn_to_hva(vcpu->kvm, gfn);
+	work->arch = *arch;
+	work->mm = current->mm;
+	/* References dropped in async_pf_execute() or at retry_sync. */
+	atomic_inc(&work->mm->mm_count);
+	kvm_get_kvm(work->vcpu->kvm);
+
+	/* this can't really happen otherwise gfn_to_pfn_async
+	   would succeed */
+	if (unlikely(kvm_is_error_hva(work->addr)))
+		goto retry_sync;
+
+	INIT_WORK(&work->work, async_pf_execute);
+	if (!schedule_work(&work->work))
+		goto retry_sync;
+
+	list_add_tail(&work->queue, &vcpu->async_pf.queue);
+	vcpu->async_pf.queued++;
+	kvm_arch_async_page_not_present(vcpu, work);
+	return 1;
+	/* Undo the references and the allocation; nothing was queued. */
+retry_sync:
+	kvm_put_kvm(work->vcpu->kvm);
+	mmdrop(work->mm);
+	kmem_cache_free(async_pf_cache, work);
+	return 0;
+}
+
+/*
+ * Put a synthetic completion, carrying bad_page, on the vcpu's "done"
+ * list.
+ *
+ * Does nothing and returns 0 when the list already holds an entry.
+ * Returns -ENOMEM when the item cannot be allocated, 0 otherwise.
+ */
+int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
+{
+	struct kvm_async_pf *work;
+
+	if (!list_empty_careful(&vcpu->async_pf.done))
+		return 0;
+
+	work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC);
+	if (!work)
+		return -ENOMEM;
+
+	work->page = bad_page;
+	/* The completion paths put_page() any non-NULL work->page. */
+	get_page(bad_page);
+	INIT_LIST_HEAD(&work->queue); /* for list_del to work */
+
+	spin_lock(&vcpu->async_pf.lock);
+	list_add_tail(&work->link, &vcpu->async_pf.done);
+	spin_unlock(&vcpu->async_pf.lock);
+
+	/* Balanced by the decrement in kvm_check_async_pf_completion(). */
+	vcpu->async_pf.queued++;
+	return 0;
+}
diff --git a/virt/kvm/async_pf.h b/virt/kvm/async_pf.h
new file mode 100644
index 00000000..e7ef6447
--- /dev/null
+++ b/virt/kvm/async_pf.h
@@ -0,0 +1,36 @@
+/*
+ * kvm asynchronous fault support
+ *
+ * Copyright 2010 Red Hat, Inc.
+ *
+ * Author:
+ *      Gleb Natapov <gleb@redhat.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __KVM_ASYNC_PF_H__
+#define __KVM_ASYNC_PF_H__
+
+/*
+ * Async page fault setup/teardown entry points.  Without
+ * CONFIG_KVM_ASYNC_PF they collapse to no-ops, with init evaluating to 0.
+ */
+#ifdef CONFIG_KVM_ASYNC_PF
+int kvm_async_pf_init(void);
+void kvm_async_pf_deinit(void);
+void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu);
+#else
+#define kvm_async_pf_init() (0)
+#define kvm_async_pf_deinit() do{}while(0)
+#define kvm_async_pf_vcpu_init(C) do{}while(0)
+#endif
+
+#endif
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
new file mode 100644
index 00000000..fc848756
--- /dev/null
+++ b/virt/kvm/coalesced_mmio.c
@@ -0,0 +1,191 @@
+/*
+ * KVM coalesced MMIO
+ *
+ * Copyright (c) 2008 Bull S.A.S.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ *  Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ *
+ */
+
+#include "iodev.h"
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/kvm.h>
+
+#include "coalesced_mmio.h"
+
+/* Return the coalesced-MMIO device that embeds the io device @dev. */
+static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev)
+{
+	return container_of(dev, struct kvm_coalesced_mmio_dev, dev);
+}
+
+/*
+ * Decide whether a write of @len bytes at @addr can be coalesced.
+ *
+ * Returns 1 when the ring has at least KVM_MAX_VCPUS free slots and the
+ * access lies entirely inside one registered zone, 0 otherwise.
+ * Accesses with a negative length or whose end wraps around the address
+ * space are never batched.
+ */
+static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
+				   gpa_t addr, int len)
+{
+	struct kvm_coalesced_mmio_zone *zone;
+	struct kvm_coalesced_mmio_ring *ring;
+	unsigned avail;
+	int i;
+
+	if (len < 0)
+		return 0;
+	if (addr + len < addr)
+		return 0;
+
+	/* Are we able to batch it ? */
+
+	/* last is the first free entry
+	 * check if we don't meet the first used entry
+	 * there is always one unused entry in the buffer
+	 *
+	 * Add the ring size before subtracting: letting the unsigned
+	 * difference wrap and then reducing it modulo the ring size only
+	 * gives the right count when that size is a power of two.
+	 */
+	ring = dev->kvm->coalesced_mmio_ring;
+	avail = (ring->first + KVM_COALESCED_MMIO_MAX - ring->last - 1) %
+		KVM_COALESCED_MMIO_MAX;
+	if (avail < KVM_MAX_VCPUS) {
+		/* full */
+		return 0;
+	}
+
+	/* is it in a batchable area ? */
+
+	for (i = 0; i < dev->nb_zones; i++) {
+		gpa_t offset;
+
+		zone = &dev->zone[i];
+
+		/* (addr,len) is fully included in
+		 * (zone->addr, zone->size); compare offsets within the
+		 * zone so that zone->addr + zone->size cannot overflow.
+		 */
+		if (addr < zone->addr)
+			continue;
+
+		offset = addr - zone->addr;
+		if (offset <= zone->size &&
+		    (gpa_t)len <= zone->size - offset)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * io-device write handler: append an MMIO write to the coalescing ring.
+ *
+ * Returns 0 when the write was recorded, or -EOPNOTSUPP when it cannot
+ * be batched (ring too full, address outside every zone, or a corrupt
+ * ring index).
+ *
+ * The room check and the insertion both happen under dev->lock so that
+ * concurrent writers cannot pass the check together and overrun the
+ * ring.  ring->last is read once and range-checked before it is used as
+ * an index.  NOTE(review): the ring page is presumably also writable by
+ * userspace (the mapping is not visible in this file) -- confirm.
+ */
+static int coalesced_mmio_write(struct kvm_io_device *this,
+				gpa_t addr, int len, const void *val)
+{
+	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
+	struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
+	unsigned insert;
+
+	spin_lock(&dev->lock);
+
+	if (!coalesced_mmio_in_range(dev, addr, len)) {
+		spin_unlock(&dev->lock);
+		return -EOPNOTSUPP;
+	}
+
+	insert = ACCESS_ONCE(ring->last);
+	if (insert >= KVM_COALESCED_MMIO_MAX) {
+		spin_unlock(&dev->lock);
+		return -EOPNOTSUPP;
+	}
+
+	/* copy data in first free entry of the ring */
+
+	ring->coalesced_mmio[insert].phys_addr = addr;
+	ring->coalesced_mmio[insert].len = len;
+	memcpy(ring->coalesced_mmio[insert].data, val, len);
+	/* Make the entry visible before the index that publishes it. */
+	smp_wmb();
+	ring->last = (insert + 1) % KVM_COALESCED_MMIO_MAX;
+	spin_unlock(&dev->lock);
+	return 0;
+}
+
+/* io-device destructor: free the coalesced-MMIO device embedding @this. */
+static void coalesced_mmio_destructor(struct kvm_io_device *this)
+{
+	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
+
+	kfree(dev);
+}
+
+/* Only writes are handled; no read handler is provided. */
+static const struct kvm_io_device_ops coalesced_mmio_ops = {
+	.write      = coalesced_mmio_write,
+	.destructor = coalesced_mmio_destructor,
+};
+
+/*
+ * Set up coalesced MMIO for a VM.
+ *
+ * Allocates the zeroed ring page and the device structure, then
+ * registers the device on the MMIO bus under kvm->slots_lock.  Returns
+ * the bus registration result on success, or a negative error after
+ * undoing whatever had been set up.
+ */
+int kvm_coalesced_mmio_init(struct kvm *kvm)
+{
+	struct kvm_coalesced_mmio_dev *mmio_dev;
+	struct page *ring_page;
+	int err;
+
+	ring_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!ring_page)
+		return -ENOMEM;
+	kvm->coalesced_mmio_ring = page_address(ring_page);
+
+	mmio_dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
+	if (!mmio_dev) {
+		err = -ENOMEM;
+		goto free_ring;
+	}
+	spin_lock_init(&mmio_dev->lock);
+	kvm_iodevice_init(&mmio_dev->dev, &coalesced_mmio_ops);
+	mmio_dev->kvm = kvm;
+	kvm->coalesced_mmio_dev = mmio_dev;
+
+	mutex_lock(&kvm->slots_lock);
+	err = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &mmio_dev->dev);
+	mutex_unlock(&kvm->slots_lock);
+	if (err >= 0)
+		return err;
+
+	/* Registration failed: unwind in reverse order of setup. */
+	kvm->coalesced_mmio_dev = NULL;
+	kfree(mmio_dev);
+free_ring:
+	kvm->coalesced_mmio_ring = NULL;
+	__free_page(ring_page);
+	return err;
+}
+
+/*
+ * Free the coalesced MMIO ring page, if one was allocated.  The device
+ * structure is not freed here; coalesced_mmio_destructor() does that.
+ */
+void kvm_coalesced_mmio_free(struct kvm *kvm)
+{
+	if (kvm->coalesced_mmio_ring)
+		free_page((unsigned long)kvm->coalesced_mmio_ring);
+}
+
+/*
+ * Register a new coalesced MMIO zone.
+ *
+ * Appends a copy of @zone to the device's zone table under
+ * kvm->slots_lock.  Returns 0 on success, -ENXIO when coalesced MMIO is
+ * not set up for this VM, or -ENOBUFS when the table is full.
+ */
+int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
+					 struct kvm_coalesced_mmio_zone *zone)
+{
+	struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
+	int ret = 0;
+
+	if (!dev)
+		return -ENXIO;
+
+	mutex_lock(&kvm->slots_lock);
+	if (dev->nb_zones < KVM_COALESCED_MMIO_ZONE_MAX)
+		dev->zone[dev->nb_zones++] = *zone;
+	else
+		ret = -ENOBUFS;
+	mutex_unlock(&kvm->slots_lock);
+
+	return ret;
+}
+
+/*
+ * Unregister every coalesced MMIO zone that lies fully inside @zone.
+ *
+ * The table is scanned from the end towards the start; a removed slot is
+ * refilled with the current last entry, which has already been examined.
+ * Returns 0, or -ENXIO when coalesced MMIO is not set up for this VM.
+ */
+int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
+					   struct kvm_coalesced_mmio_zone *zone)
+{
+	struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
+	int idx;
+
+	if (!dev)
+		return -ENXIO;
+
+	mutex_lock(&kvm->slots_lock);
+
+	for (idx = dev->nb_zones - 1; idx >= 0; idx--) {
+		struct kvm_coalesced_mmio_zone *cur = &dev->zone[idx];
+
+		/* keep zones not included in (zone->addr, zone->size) */
+		if (cur->addr < zone->addr ||
+		    cur->addr + cur->size > zone->addr + zone->size)
+			continue;
+
+		dev->nb_zones--;
+		*cur = dev->zone[dev->nb_zones];
+	}
+
+	mutex_unlock(&kvm->slots_lock);
+
+	return 0;
+}
diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h
new file mode 100644
index 00000000..8a5959e3
--- /dev/null
+++ b/virt/kvm/coalesced_mmio.h
@@ -0,0 +1,39 @@
+#ifndef __KVM_COALESCED_MMIO_H__
+#define __KVM_COALESCED_MMIO_H__
+
+/*
+ * KVM coalesced MMIO
+ *
+ * Copyright (c) 2008 Bull S.A.S.
+ *
+ *  Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ *
+ */
+
+#ifdef CONFIG_KVM_MMIO
+
+#define KVM_COALESCED_MMIO_ZONE_MAX 100
+
+struct kvm_coalesced_mmio_dev {
+	struct kvm_io_device dev;	/* registered on KVM_MMIO_BUS at init */
+	struct kvm *kvm;		/* owning VM */
+	spinlock_t lock;
+	int nb_zones;			/* number of valid entries in zone[] */
+	struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX];
+};
+
+int kvm_coalesced_mmio_init(struct kvm *kvm);
+void kvm_coalesced_mmio_free(struct kvm *kvm);
+int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
+                                       struct kvm_coalesced_mmio_zone *zone);
+int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
+                                         struct kvm_coalesced_mmio_zone *zone);
+
+#else
+
+static inline int kvm_coalesced_mmio_init(struct kvm *kvm) { return 0; }
+static inline void kvm_coalesced_mmio_free(struct kvm *kvm) { }
+
+#endif
+
+#endif
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
new file mode 100644
index 00000000..73358d25
--- /dev/null
+++ b/virt/kvm/eventfd.c
@@ -0,0 +1,656 @@
+/*
+ * kvm eventfd support - use eventfd objects to signal various KVM events
+ *
+ * Copyright 2009 Novell.  All Rights Reserved.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Author:
+ *	Gregory Haskins <ghaskins@novell.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/workqueue.h>
+#include <linux/syscalls.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/list.h>
+#include <linux/eventfd.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "iodev.h"
+
+/*
+ * --------------------------------------------------------------------
+ * irqfd: Allows an fd to be used to inject an interrupt to the guest
+ *
+ * Credit goes to Avi Kivity for the original idea.
+ * --------------------------------------------------------------------
+ */
+
+struct _irqfd {
+	/* Used for MSI fast-path */
+	struct kvm *kvm;
+	wait_queue_t wait;		/* entry hooked onto the eventfd wait-queue */
+	/* Update side is protected by irqfds.lock */
+	struct kvm_kernel_irq_routing_entry __rcu *irq_entry;
+	/* Used for level IRQ fast-path */
+	int gsi;
+	struct work_struct inject;	/* runs irqfd_inject() */
+	/* Used for setup/shutdown */
+	struct eventfd_ctx *eventfd;
+	struct list_head list;		/* link in kvm->irqfds.items; empty = inactive */
+	poll_table pt;
+	struct work_struct shutdown;	/* runs irqfd_shutdown() */
+};
+
+static struct workqueue_struct *irqfd_cleanup_wq;
+
+static void
+irqfd_inject(struct work_struct *work)
+{
+	struct _irqfd *self = container_of(work, struct _irqfd, inject);
+	int gsi = self->gsi;
+	/* Raise the line and lower it again right away: one pulse per event. */
+	kvm_set_irq(self->kvm, KVM_USERSPACE_IRQ_SOURCE_ID, gsi, 1);
+	kvm_set_irq(self->kvm, KVM_USERSPACE_IRQ_SOURCE_ID, gsi, 0);
+}
+
+/*
+ * Race-free decouple logic (ordering is critical)
+ */
+static void
+irqfd_shutdown(struct work_struct *work)
+{
+	struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
+	u64 cnt;	/* filled in by the dequeue call below; not used afterwards */
+
+	/*
+	 * Synchronize with the wait-queue and unhook ourselves to prevent
+	 * further events.
+	 */
+	eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
+
+	/*
+	 * We know no new events will be scheduled at this point, so block
+	 * until all previously outstanding events have completed
+	 */
+	flush_work_sync(&irqfd->inject);
+
+	/*
+	 * It is now safe to release the object's resources
+	 */
+	eventfd_ctx_put(irqfd->eventfd);
+	kfree(irqfd);
+}
+
+
+/* An irqfd is active while it is still linked; irqfds.lock must be held */
+static bool
+irqfd_is_active(struct _irqfd *irqfd)
+{
+	return !list_empty(&irqfd->list);
+}
+
+/*
+ * Mark the irqfd as inactive and schedule it for removal
+ *
+ * assumes kvm->irqfds.lock is held
+ */
+static void
+irqfd_deactivate(struct _irqfd *irqfd)
+{
+	BUG_ON(!irqfd_is_active(irqfd));
+	/* an emptied ->list is what irqfd_is_active() reads as "inactive" */
+	list_del_init(&irqfd->list);
+	/* teardown itself runs later, from the shutdown work on the cleanup wq */
+	queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
+}
+
+/*
+ * Wait-queue callback for the eventfd; @key carries the poll event bits.
+ * Called with wqh->lock held and interrupts disabled
+ */
+static int
+irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
+	unsigned long flags = (unsigned long)key;
+	struct kvm_kernel_irq_routing_entry *irq;
+	struct kvm *kvm = irqfd->kvm;
+
+	if (flags & POLLIN) {
+		rcu_read_lock();
+		irq = rcu_dereference(irqfd->irq_entry);
+		/* An event has been signaled, inject an interrupt */
+		if (irq)
+			kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1);
+		else
+			schedule_work(&irqfd->inject);
+		rcu_read_unlock();
+	}
+
+	if (flags & POLLHUP) {
+		/* The eventfd is closing, detach from KVM */
+		unsigned long iflags;	/* saved IRQ state; must not shadow the poll bits */
+
+		spin_lock_irqsave(&kvm->irqfds.lock, iflags);
+		/*
+		 * We must check if someone deactivated the irqfd before
+		 * we could acquire the irqfds.lock since the item is
+		 * deactivated from the KVM side before it is unhooked from
+		 * the wait-queue.  If it is already deactivated, we can
+		 * simply return knowing the other side will cleanup for us.
+		 * We cannot race against the irqfd going away since the
+		 * other side is required to acquire wqh->lock, which we hold
+		 */
+		if (irqfd_is_active(irqfd))
+			irqfd_deactivate(irqfd);
+
+		spin_unlock_irqrestore(&kvm->irqfds.lock, iflags);
+	}
+
+	return 0;
+}
+
+static void
+irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
+			poll_table *pt)
+{
+	/* poll() hands us the wait-queue head; hook the irqfd's entry onto it */
+	add_wait_queue(wqh, &container_of(pt, struct _irqfd, pt)->wait);
+}
+
+/* Must be called under irqfds.lock */
+static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd,
+			 struct kvm_irq_routing_table *irq_rt)
+{
+	struct kvm_kernel_irq_routing_entry *e;
+	struct hlist_node *n;
+	/* a gsi past the end of the table has no routing entry to cache */
+	if (irqfd->gsi >= irq_rt->nr_rt_entries) {
+		rcu_assign_pointer(irqfd->irq_entry, NULL);
+		return;
+	}
+	/* each visited entry overwrites irq_entry; an empty chain leaves it as-is */
+	hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) {
+		/* Only fast-path MSI. */
+		if (e->type == KVM_IRQ_ROUTING_MSI)
+			rcu_assign_pointer(irqfd->irq_entry, e);
+		else
+			rcu_assign_pointer(irqfd->irq_entry, NULL);
+	}
+}
+
+static int
+kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
+{
+	struct kvm_irq_routing_table *irq_rt;
+	struct _irqfd *irqfd, *tmp;
+	struct file *file = NULL;
+	struct eventfd_ctx *eventfd = NULL;
+	int ret;
+	unsigned int events;
+	/* Bind eventfd @fd to @gsi: returns 0, or a negative errno on failure. */
+	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
+	if (!irqfd)
+		return -ENOMEM;
+
+	irqfd->kvm = kvm;
+	irqfd->gsi = gsi;
+	INIT_LIST_HEAD(&irqfd->list);
+	INIT_WORK(&irqfd->inject, irqfd_inject);
+	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
+	/* the file is held until the end so that ->poll() can be called on it */
+	file = eventfd_fget(fd);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto fail;
+	}
+
+	eventfd = eventfd_ctx_fileget(file);
+	if (IS_ERR(eventfd)) {
+		ret = PTR_ERR(eventfd);
+		goto fail;
+	}
+
+	irqfd->eventfd = eventfd;
+
+	/*
+	 * Install our own custom wake-up handling so we are notified via
+	 * a callback whenever someone signals the underlying eventfd
+	 */
+	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
+	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
+
+	spin_lock_irq(&kvm->irqfds.lock);
+
+	ret = 0;
+	list_for_each_entry(tmp, &kvm->irqfds.items, list) {
+		if (irqfd->eventfd != tmp->eventfd)
+			continue;
+		/* This fd is used for another irq already. */
+		ret = -EBUSY;
+		spin_unlock_irq(&kvm->irqfds.lock);
+		goto fail;
+	}
+
+	irq_rt = rcu_dereference_protected(kvm->irq_routing,
+					   lockdep_is_held(&kvm->irqfds.lock));
+	irqfd_update(kvm, irqfd, irq_rt);
+	/* polling hooks irqfd->wait up through irqfd_ptable_queue_proc() */
+	events = file->f_op->poll(file, &irqfd->pt);
+
+	list_add_tail(&irqfd->list, &kvm->irqfds.items);
+
+	/*
+	 * Check if there was an event already pending on the eventfd
+	 * before we registered, and trigger it as if we didn't miss it.
+	 */
+	if (events & POLLIN)
+		schedule_work(&irqfd->inject);
+
+	spin_unlock_irq(&kvm->irqfds.lock);
+
+	/*
+	 * do not drop the file until the irqfd is fully initialized, otherwise
+	 * we might race against the POLLHUP
+	 */
+	fput(file);
+
+	return 0;
+
+fail:
+	if (eventfd && !IS_ERR(eventfd))
+		eventfd_ctx_put(eventfd);
+	/* file is an ERR_PTR only when eventfd_fget() itself failed */
+	if (!IS_ERR(file))
+		fput(file);
+
+	kfree(irqfd);
+	return ret;
+}
+
+void
+kvm_eventfd_init(struct kvm *kvm)
+{
+	INIT_LIST_HEAD(&kvm->ioeventfds);	/* ioeventfd registrations */
+	INIT_LIST_HEAD(&kvm->irqfds.items);	/* active irqfds */
+	spin_lock_init(&kvm->irqfds.lock);	/* guards irqfds.items */
+}
+
+/*
+ * Deactivate every irqfd bound to this eventfd and gsi, then wait for the
+ * resulting shutdown work to finish
+ */
+static int
+kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
+{
+	struct _irqfd *cur, *next;
+	struct eventfd_ctx *ctx = eventfd_ctx_fdget(fd);
+
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	spin_lock_irq(&kvm->irqfds.lock);
+
+	list_for_each_entry_safe(cur, next, &kvm->irqfds.items, list) {
+		if (cur->eventfd != ctx || cur->gsi != gsi)
+			continue;
+		/*
+		 * Clear the cached routing entry while holding irqfds.lock:
+		 * another thread may call kvm_irq_routing_update before
+		 * the workqueue flush below, and that function serialises
+		 * against us through the same lock.  The clearing is
+		 * paired with the synchronize_rcu done by the caller of
+		 * kvm_irq_routing_update.
+		 */
+		rcu_assign_pointer(cur->irq_entry, NULL);
+		irqfd_deactivate(cur);
+	}
+
+	spin_unlock_irq(&kvm->irqfds.lock);
+	eventfd_ctx_put(ctx);
+
+	/*
+	 * Flushing the cleanup workqueue guarantees that all outstanding
+	 * shutdown jobs have completed, so that there will not be any more
+	 * interrupts on this gsi once this deassign function returns.
+	 */
+	flush_workqueue(irqfd_cleanup_wq);
+
+	return 0;
+}
+
+int
+kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
+{
+	/* KVM_IRQFD_FLAG_DEASSIGN selects teardown; anything else binds. */
+	if (!(flags & KVM_IRQFD_FLAG_DEASSIGN))
+		return kvm_irqfd_assign(kvm, fd, gsi);
+	return kvm_irqfd_deassign(kvm, fd, gsi);
+}
+
+/*
+ * Called while the kvm VM fd is being released: deactivate every irqfd
+ * that is still registered with this VM
+ */
+void
+kvm_irqfd_release(struct kvm *kvm)
+{
+	struct list_head *items = &kvm->irqfds.items;
+	struct _irqfd *irqfd;
+
+	spin_lock_irq(&kvm->irqfds.lock);
+	while (!list_empty(items)) {
+		irqfd = list_first_entry(items, struct _irqfd, list);
+		irqfd_deactivate(irqfd);
+	}
+
+	spin_unlock_irq(&kvm->irqfds.lock);
+	/*
+	 * No kvm* reference is taken by the shutdown jobs, so block here
+	 * until every one of them that is outstanding has completed.
+	 */
+	flush_workqueue(irqfd_cleanup_wq);
+}
+
+/*
+ * Change irq_routing and irqfd.
+ * Caller must invoke synchronize_rcu afterwards.
+ */
+void kvm_irq_routing_update(struct kvm *kvm,
+			    struct kvm_irq_routing_table *irq_rt)
+{
+	struct _irqfd *irqfd;
+
+	spin_lock_irq(&kvm->irqfds.lock);
+	/* publish the new table, then refresh every irqfd's cached entry from it */
+	rcu_assign_pointer(kvm->irq_routing, irq_rt);
+
+	list_for_each_entry(irqfd, &kvm->irqfds.items, list)
+		irqfd_update(kvm, irqfd, irq_rt);
+
+	spin_unlock_irq(&kvm->irqfds.lock);
+}
+
+/*
+ * Create the single-threaded workqueue on which every VM's deferred irqfd
+ * shutdown work is queued.  It is kept separate from the shared system
+ * workqueue to prevent deadlock against flushing the normal work-queue.
+ */
+static int __init irqfd_module_init(void)
+{
+	struct workqueue_struct *wq;
+
+	wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
+	irqfd_cleanup_wq = wq;
+	return wq ? 0 : -ENOMEM;
+}
+
+static void __exit irqfd_module_exit(void)
+{
+	destroy_workqueue(irqfd_cleanup_wq);	/* created in irqfd_module_init() */
+}
+
+module_init(irqfd_module_init);
+module_exit(irqfd_module_exit);
+
+/*
+ * --------------------------------------------------------------------
+ * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
+ *
+ * userspace can register a PIO/MMIO address with an eventfd for receiving
+ * notification when the memory has been touched.
+ * --------------------------------------------------------------------
+ */
+
+struct _ioeventfd {
+	struct list_head     list;      /* link in kvm->ioeventfds */
+	u64                  addr;      /* address that must match exactly */
+	int                  length;    /* access width in bytes: 1, 2, 4 or 8 */
+	struct eventfd_ctx  *eventfd;   /* signalled on a matching write */
+	u64                  datamatch; /* value compared when !wildcard */
+	struct kvm_io_device dev;       /* registered on the PIO or MMIO bus */
+	bool                 wildcard;  /* true: any written value matches */
+};
+
+static inline struct _ioeventfd *
+to_ioeventfd(struct kvm_io_device *dev)
+{
+	return container_of(dev, struct _ioeventfd, dev); /* dev is embedded */
+}
+
+static void
+ioeventfd_release(struct _ioeventfd *p)
+{
+	list_del(&p->list);		/* unlink from whatever list it is on */
+	eventfd_ctx_put(p->eventfd);	/* drop the entry's eventfd reference */
+	kfree(p);
+}
+
+static bool
+ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
+{
+	u64 written;
+
+	/* A hit requires exactly the registered address and access width. */
+	if (addr != p->addr || len != p->length)
+		return false;
+
+	/* With no datamatch registered, any value written is a hit. */
+	if (p->wildcard)
+		return true;
+
+	/* Otherwise fetch the written value at its width and compare it. */
+
+	BUG_ON(!IS_ALIGNED((unsigned long)val, len));
+
+	switch (len) {
+	case 1:
+		written = *(const u8 *)val;
+		break;
+	case 2:
+		written = *(const u16 *)val;
+		break;
+	case 4:
+		written = *(const u32 *)val;
+		break;
+	case 8:
+		written = *(const u64 *)val;
+		break;
+	default:
+		return false;
+	}
+
+	return written == p->datamatch;
+}
+
+/* Bus write hook: signal the eventfd when the access matches, else decline */
+static int
+ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
+		const void *val)
+{
+	struct _ioeventfd *evt = to_ioeventfd(this);
+
+	if (ioeventfd_in_range(evt, addr, len, val)) {
+		eventfd_signal(evt->eventfd, 1);
+		return 0;
+	}
+	return -EOPNOTSUPP;
+}
+
+/*
+ * This function is called as KVM is completely shutting down.  We do not
+ * need to worry about locking; just nuke anything we have as quickly as
+ * possible.
+ */
+static void
+ioeventfd_destructor(struct kvm_io_device *this)
+{
+	/* Unlinks the entry, drops its eventfd reference and frees it. */
+	ioeventfd_release(to_ioeventfd(this));
+}
+
+static const struct kvm_io_device_ops ioeventfd_ops = {
+	.write      = ioeventfd_write,	/* no .read hook is provided */
+	.destructor = ioeventfd_destructor,
+};
+
+/* assumes kvm->slots_lock held */
+static bool
+ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
+{
+	struct _ioeventfd *_p;
+	/* same addr+length collides if either side is a wildcard or data is equal */
+	list_for_each_entry(_p, &kvm->ioeventfds, list)
+		if (_p->addr == p->addr && _p->length == p->length &&
+		    (_p->wildcard || p->wildcard ||
+		     _p->datamatch == p->datamatch))
+			return true;
+	/* note: the bus (PIO vs MMIO) is not part of the comparison */
+	return false;
+}
+
+static int
+kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
+{
+	int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
+	enum kvm_bus              bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
+	struct _ioeventfd        *p;
+	struct eventfd_ctx       *eventfd;
+	int                       ret;
+
+	/* must be natural-word sized */
+	switch (args->len) {
+	case 1:
+	case 2:
+	case 4:
+	case 8:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* check for range overflow */
+	if (args->addr + args->len < args->addr)
+		return -EINVAL;
+
+	/* check for extra flags that we don't understand */
+	if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
+		return -EINVAL;
+	/* from here on we hold an eventfd reference that the fail path drops */
+	eventfd = eventfd_ctx_fdget(args->fd);
+	if (IS_ERR(eventfd))
+		return PTR_ERR(eventfd);
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	INIT_LIST_HEAD(&p->list);
+	p->addr    = args->addr;
+	p->length  = args->len;
+	p->eventfd = eventfd;
+
+	/* The datamatch feature is optional, otherwise this is a wildcard */
+	if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
+		p->datamatch = args->datamatch;
+	else
+		p->wildcard = true;
+
+	mutex_lock(&kvm->slots_lock);
+
+	/* Verify that there isn't a match already */
+	if (ioeventfd_check_collision(kvm, p)) {
+		ret = -EEXIST;
+		goto unlock_fail;
+	}
+
+	kvm_iodevice_init(&p->dev, &ioeventfd_ops);
+
+	ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev);
+	if (ret < 0)
+		goto unlock_fail;
+	/* only a successfully registered entry is put on kvm->ioeventfds */
+	list_add_tail(&p->list, &kvm->ioeventfds);
+
+	mutex_unlock(&kvm->slots_lock);
+
+	return 0;
+
+unlock_fail:
+	mutex_unlock(&kvm->slots_lock);
+
+fail:
+	kfree(p);	/* p is NULL here when the allocation itself failed */
+	eventfd_ctx_put(eventfd);
+
+	return ret;
+}
+
+static int
+kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
+{
+	bool want_wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
+	enum kvm_bus bus_idx;
+	struct _ioeventfd *cur, *next;
+	struct eventfd_ctx *eventfd;
+	int ret = -ENOENT;
+
+	eventfd = eventfd_ctx_fdget(args->fd);
+	if (IS_ERR(eventfd))
+		return PTR_ERR(eventfd);
+
+	bus_idx = (args->flags & KVM_IOEVENTFD_FLAG_PIO) ?
+		KVM_PIO_BUS : KVM_MMIO_BUS;
+
+	mutex_lock(&kvm->slots_lock);
+
+	/* Remove the first entry matching fd, address, width and match mode. */
+	list_for_each_entry_safe(cur, next, &kvm->ioeventfds, list) {
+		if (cur->eventfd != eventfd || cur->wildcard != want_wildcard)
+			continue;
+		if (cur->addr != args->addr || cur->length != args->len)
+			continue;
+		if (!cur->wildcard && cur->datamatch != args->datamatch)
+			continue;
+
+		kvm_io_bus_unregister_dev(kvm, bus_idx, &cur->dev);
+		ioeventfd_release(cur);
+		ret = 0;
+		break;
+	}
+
+	mutex_unlock(&kvm->slots_lock);
+
+	eventfd_ctx_put(eventfd);
+
+	return ret;
+}
+
+int
+kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
+{
+	/* The DEASSIGN flag selects removal; without it the call registers. */
+	if (!(args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN))
+		return kvm_assign_ioeventfd(kvm, args);
+	return kvm_deassign_ioeventfd(kvm, args);
+}
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
new file mode 100644
index 00000000..8df1ca10
--- /dev/null
+++ b/virt/kvm/ioapic.c
@@ -0,0 +1,441 @@
+/*
+ *  Copyright (C) 2001  MandrakeSoft S.A.
+ *  Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ *    MandrakeSoft S.A.
+ *    43, rue d'Aboukir
+ *    75002 Paris - France
+ *    http://www.linux-mandrake.com/
+ *    http://www.mandrakesoft.com/
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ *  Yunhong Jiang <yunhong.jiang@intel.com>
+ *  Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ *  Based on Xen 3.1 code.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <trace/events/kvm.h>
+
+#include "ioapic.h"
+#include "lapic.h"
+#include "irq.h"
+
+#if 0
+#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
+#else
+#define ioapic_debug(fmt, arg...)
+#endif
+static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
+
+static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
+					  unsigned long addr,
+					  unsigned long length)
+{
+	unsigned long result = 0;
+	/* Return the register selected by ioregsel; addr and length are unused. */
+	switch (ioapic->ioregsel) {
+	case IOAPIC_REG_VERSION:
+		result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
+			  | (IOAPIC_VERSION_ID & 0xff));
+		break;
+
+	case IOAPIC_REG_APIC_ID:
+	case IOAPIC_REG_ARB_ID:
+		result = ((ioapic->id & 0xf) << 24);
+		break;
+
+	default:
+		{
+			u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
+			u64 redir_content;
+
+			/* ioregsel is guest-written: out-of-range reads as all ones */
+			redir_content = (redir_index < IOAPIC_NUM_PINS) ?
+				ioapic->redirtbl[redir_index].bits : ~0ULL;
+			result = (ioapic->ioregsel & 0x1) ?
+			    (redir_content >> 32) & 0xffffffff :
+			    redir_content & 0xffffffff;
+			break;
+		}
+	}
+
+	return result;
+}
+
+static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
+{
+	union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[idx];
+	int delivered;
+
+	/* A masked pin delivers nothing; report that as -1. */
+	if (entry->fields.mask)
+		return -1;
+
+	delivered = ioapic_deliver(ioapic, idx);
+	/* Flag a delivered level-triggered interrupt through remote_irr. */
+	if (delivered && entry->fields.trig_mode == IOAPIC_LEVEL_TRIG)
+		entry->fields.remote_irr = 1;
+	return delivered;
+}
+
+static void update_handled_vectors(struct kvm_ioapic *ioapic)
+{
+	DECLARE_BITMAP(handled_vectors, 256);
+	int i;
+	/* rebuild in a local bitmap: one bit per vector named by any pin */
+	memset(handled_vectors, 0, sizeof(handled_vectors));
+	for (i = 0; i < IOAPIC_NUM_PINS; ++i)
+		__set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors);
+	memcpy(ioapic->handled_vectors, handled_vectors,
+	       sizeof(handled_vectors));
+	smp_wmb();	/* presumably pairs with smp_rmb() in kvm_ioapic_update_eoi() */
+}
+
+static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
+{
+	unsigned index;
+	bool mask_before, mask_after;
+	union kvm_ioapic_redirect_entry *e;
+	/* Write val to the register currently selected by ioregsel. */
+	switch (ioapic->ioregsel) {
+	case IOAPIC_REG_VERSION:
+		/* Writes are ignored. */
+		break;
+
+	case IOAPIC_REG_APIC_ID:
+		ioapic->id = (val >> 24) & 0xf;
+		break;
+
+	case IOAPIC_REG_ARB_ID:
+		break;
+
+	default:
+		index = (ioapic->ioregsel - 0x10) >> 1;
+		/* two 32-bit selectors per pin; an odd selector is the high half */
+		ioapic_debug("change redir index %x val %x\n", index, val);
+		if (index >= IOAPIC_NUM_PINS)
+			return;
+		e = &ioapic->redirtbl[index];
+		mask_before = e->fields.mask;
+		if (ioapic->ioregsel & 1) {
+			e->bits &= 0xffffffff;
+			e->bits |= (u64) val << 32;
+		} else {
+			e->bits &= ~0xffffffffULL;
+			e->bits |= (u32) val;
+			e->fields.remote_irr = 0;
+		}
+		update_handled_vectors(ioapic);
+		mask_after = e->fields.mask;
+		if (mask_before != mask_after)
+			kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
+		if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
+		    && ioapic->irr & (1 << index))	/* level pin still asserted */
+			ioapic_service(ioapic, index);
+		break;
+	}
+}
+
+static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
+{
+	union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
+	struct kvm_lapic_irq irqe;
+
+	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
+		     "vector=%x trig_mode=%x\n",
+		     entry->fields.dest_id, entry->fields.dest_mode,
+		     entry->fields.delivery_mode, entry->fields.vector,
+		     entry->fields.trig_mode);
+	/* translate the pin's redirection entry into a local-APIC request */
+	irqe.dest_id = entry->fields.dest_id;
+	irqe.vector = entry->fields.vector;
+	irqe.dest_mode = entry->fields.dest_mode;
+	irqe.trig_mode = entry->fields.trig_mode;
+	irqe.delivery_mode = entry->fields.delivery_mode << 8;
+	irqe.level = 1;
+	irqe.shorthand = 0;
+
+#ifdef CONFIG_X86
+	/* Always deliver the PIT interrupt (pin 0) to the boot vcpu */
+	if (irq == 0) {
+		irqe.dest_mode = 0; /* Physical mode. */
+		/* need to read apic_id from apic register since
+		 * it can be rewritten; NOTE(review): code uses vcpu_id */
+		irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id;
+	}
+#endif
+	return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
+}
+
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
+{
+	u32 old_irr;
+	u32 mask;
+	union kvm_ioapic_redirect_entry entry;
+	int ret = 1;	/* kept for an out-of-range irq or a lowered line */
+
+	spin_lock(&ioapic->lock);
+	old_irr = ioapic->irr;
+	if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
+		mask = 1U << irq;	/* shift only after irq is range-checked */
+		entry = ioapic->redirtbl[irq];
+		level ^= entry.fields.polarity;
+		if (!level)
+			ioapic->irr &= ~mask;
+		else {
+			int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+			ioapic->irr |= mask;
+			if ((edge && old_irr != ioapic->irr) ||
+			    (!edge && !entry.fields.remote_irr))
+				ret = ioapic_service(ioapic, irq);
+			else
+				ret = 0; /* report coalesced interrupt */
+		}
+		trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+	}
+	spin_unlock(&ioapic->lock);
+	return ret;
+}
+
+static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
+				     int trigger_mode)
+{
+	int i;
+	/* every pin programmed with this vector is acknowledged in turn */
+	for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+		union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+		if (ent->fields.vector != vector)
+			continue;
+
+		/*
+		 * We are dropping lock while calling ack notifiers because ack
+		 * notifier callbacks for assigned devices call into IOAPIC
+		 * recursively. Since remote_irr is cleared only after call
+		 * to notifiers if the same vector will be delivered while lock
+		 * is dropped it will be put into irr and will be delivered
+		 * after ack notifier returns.
+		 */
+		spin_unlock(&ioapic->lock);
+		kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
+		spin_lock(&ioapic->lock);
+		/* only a level-triggered EOI clears remote_irr and may redeliver */
+		if (trigger_mode != IOAPIC_LEVEL_TRIG)
+			continue;
+
+		ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
+		ent->fields.remote_irr = 0;
+		if (!ent->fields.mask && (ioapic->irr & (1 << i)))
+			ioapic_service(ioapic, i);
+	}
+}
+
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
+{
+	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+	/* unlocked pre-check: skip vectors that no redirection entry names */
+	smp_rmb();
+	if (!test_bit(vector, ioapic->handled_vectors))
+		return;
+	spin_lock(&ioapic->lock);
+	__kvm_ioapic_update_eoi(ioapic, vector, trigger_mode);
+	spin_unlock(&ioapic->lock);
+}
+
+static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
+{
+	return container_of(dev, struct kvm_ioapic, dev); /* dev is embedded */
+}
+
+static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
+{
+	u64 base = ioapic->base_address;
+	return addr >= base && addr < base + IOAPIC_MEM_LENGTH;
+}
+
+static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
+			    void *val)
+{
+	struct kvm_ioapic *ioapic = to_ioapic(this);
+	u32 result;
+	if (!ioapic_in_range(ioapic, addr))
+		return -EOPNOTSUPP;	/* outside our window: not handled here */
+
+	ioapic_debug("addr %lx\n", (unsigned long)addr);
+	ASSERT(!(addr & 0xf));	/* check alignment */
+	/* keep the low 8 bits, which select the direct register */
+	addr &= 0xff;
+	spin_lock(&ioapic->lock);
+	switch (addr) {
+	case IOAPIC_REG_SELECT:
+		result = ioapic->ioregsel;
+		break;
+
+	case IOAPIC_REG_WINDOW:
+		result = ioapic_read_indirect(ioapic, addr, len);
+		break;
+
+	default:
+		result = 0;
+		break;
+	}
+	spin_unlock(&ioapic->lock);
+	/* hand the 32-bit result back at the width of the access */
+	switch (len) {
+	case 8:
+		*(u64 *) val = result;
+		break;
+	case 1:
+	case 2:
+	case 4:
+		memcpy(val, (char *)&result, len);
+		break;
+	default:
+		printk(KERN_WARNING "ioapic: wrong length %d\n", len);
+	}
+	return 0;
+}
+
+static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
+			     const void *val)
+{
+	struct kvm_ioapic *ioapic = to_ioapic(this);
+	u32 data;
+	if (!ioapic_in_range(ioapic, addr))
+		return -EOPNOTSUPP;	/* outside our window: not handled here */
+
+	ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
+		     (void*)addr, len, val);
+	ASSERT(!(addr & 0xf));	/* check alignment */
+	/* an 8-byte write contributes only its first four bytes */
+	if (len == 4 || len == 8)
+		data = *(u32 *) val;
+	else {
+		printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
+		return 0;	/* reported as handled although nothing was written */
+	}
+
+	addr &= 0xff;
+	spin_lock(&ioapic->lock);
+	switch (addr) {
+	case IOAPIC_REG_SELECT:
+		ioapic->ioregsel = data;
+		break;
+
+	case IOAPIC_REG_WINDOW:
+		ioapic_write_indirect(ioapic, data);
+		break;
+#ifdef	CONFIG_IA64
+	case IOAPIC_REG_EOI:
+		__kvm_ioapic_update_eoi(ioapic, data, IOAPIC_LEVEL_TRIG);
+		break;
+#endif
+
+	default:
+		break;
+	}
+	spin_unlock(&ioapic->lock);
+	return 0;
+}
+
+void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
+{
+	int pin;
+
+	ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+	ioapic->id = 0;
+	ioapic->ioregsel = 0;
+	ioapic->irr = 0;
+	for (pin = 0; pin < IOAPIC_NUM_PINS; pin++)
+		ioapic->redirtbl[pin].fields.mask = 1;
+	update_handled_vectors(ioapic);
+}
+
+static const struct kvm_io_device_ops ioapic_mmio_ops = {
+	.read     = ioapic_mmio_read,
+	.write    = ioapic_mmio_write,	/* no .destructor hook is provided */
+};
+
+int kvm_ioapic_init(struct kvm *kvm)
+{
+	struct kvm_ioapic *ioapic;
+	int ret;
+	/* Allocate the VM's IOAPIC and register it on the MMIO bus. */
+	ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
+	if (!ioapic)
+		return -ENOMEM;
+	spin_lock_init(&ioapic->lock);
+	kvm->arch.vioapic = ioapic;
+	kvm_ioapic_reset(ioapic);
+	kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
+	ioapic->kvm = kvm;
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+	mutex_unlock(&kvm->slots_lock);
+	if (ret < 0) {
+		kvm->arch.vioapic = NULL;	/* undo the early publication above */
+		kfree(ioapic);
+	}
+
+	return ret;
+}
+
+void kvm_ioapic_destroy(struct kvm *kvm)
+{
+	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+	if (!ioapic)
+		return;
+	kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+	kvm->arch.vioapic = NULL;
+	kfree(ioapic);
+}
+
+int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+	struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+	if (!ioapic)
+		return -EINVAL;
+	/* assumes kvm_ioapic starts with the kvm_ioapic_state layout - TODO confirm */
+	spin_lock(&ioapic->lock);
+	memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
+	spin_unlock(&ioapic->lock);
+	return 0;
+}
+
+int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+	struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+	if (!ioapic)
+		return -EINVAL;
+	/* assumes kvm_ioapic starts with the kvm_ioapic_state layout - TODO confirm */
+	spin_lock(&ioapic->lock);
+	memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
+	update_handled_vectors(ioapic);	/* redirtbl may now name other vectors */
+	spin_unlock(&ioapic->lock);
+	return 0;
+}
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
new file mode 100644
index 00000000..0b190c34
--- /dev/null
+++ b/virt/kvm/ioapic.h
@@ -0,0 +1,83 @@
+#ifndef __KVM_IO_APIC_H
+#define __KVM_IO_APIC_H
+
+#include <linux/kvm_host.h>
+
+#include "iodev.h"
+
+struct kvm;
+struct kvm_vcpu;
+
+#define IOAPIC_NUM_PINS  KVM_IOAPIC_NUM_PINS
+#define IOAPIC_VERSION_ID 0x11	/* IOAPIC version */
+#define IOAPIC_EDGE_TRIG  0
+#define IOAPIC_LEVEL_TRIG 1
+
+#define IOAPIC_DEFAULT_BASE_ADDRESS  0xfec00000
+#define IOAPIC_MEM_LENGTH            0x100
+
+/* Direct registers. */
+#define IOAPIC_REG_SELECT  0x00
+#define IOAPIC_REG_WINDOW  0x10
+#define IOAPIC_REG_EOI     0x40	/* IA64 IOSAPIC only */
+
+/* Indirect registers. */
+#define IOAPIC_REG_APIC_ID 0x00	/* x86 IOAPIC only */
+#define IOAPIC_REG_VERSION 0x01
+#define IOAPIC_REG_ARB_ID  0x02	/* x86 IOAPIC only */
+
+/*ioapic delivery mode*/
+#define	IOAPIC_FIXED			0x0
+#define	IOAPIC_LOWEST_PRIORITY		0x1
+#define	IOAPIC_PMI			0x2
+#define	IOAPIC_NMI			0x4
+#define	IOAPIC_INIT			0x5
+#define	IOAPIC_EXTINT			0x7
+
+struct kvm_ioapic {
+	u64 base_address;	/* guest-physical base of the register window */
+	u32 ioregsel;		/* last value written to IOAPIC_REG_SELECT */
+	u32 id;
+	u32 irr;		/* one bit per pin whose line is currently asserted */
+	u32 pad;
+	union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
+	unsigned long irq_states[IOAPIC_NUM_PINS];
+	struct kvm_io_device dev;	/* registered on KVM_MMIO_BUS */
+	struct kvm *kvm;
+	void (*ack_notifier)(void *opaque, int irq);
+	spinlock_t lock;
+	DECLARE_BITMAP(handled_vectors, 256);	/* vectors named in redirtbl */
+};
+
+#ifdef DEBUG
+#define ASSERT(x)  							\
+do {									\
+	if (!(x)) {							\
+		printk(KERN_EMERG "assertion failed %s: %d: %s\n",	\
+		       __FILE__, __LINE__, #x);				\
+		BUG();							\
+	}								\
+} while (0)
+#else /* !DEBUG: the condition is not evaluated at all */
+#define ASSERT(x) do { } while (0)
+#endif /* DEBUG */
+
+static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
+{
+	return kvm->arch.vioapic;	/* NULL when no IOAPIC is attached */
+}
+
+int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+		int short_hand, int dest, int dest_mode);
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
+int kvm_ioapic_init(struct kvm *kvm);
+void kvm_ioapic_destroy(struct kvm *kvm);
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
+void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+		struct kvm_lapic_irq *irq);
+int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+
+#endif
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
new file mode 100644
index 00000000..12fd3caf
--- /dev/null
+++ b/virt/kvm/iodev.h
@@ -0,0 +1,70 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#ifndef __KVM_IODEV_H__
+#define __KVM_IODEV_H__
+
+#include <linux/kvm_types.h>
+#include <asm/errno.h>
+
+struct kvm_io_device;
+
+/**
+ * kvm_io_device_ops are called under kvm slots_lock.
+ * read and write handlers return 0 if the transaction has been handled,
+ * or non-zero to have it passed to the next device.
+ *
+ * Every callback is optional: the kvm_iodevice_*() wrappers in this header
+ * check the pointer for NULL before calling through it.
+ **/
+struct kvm_io_device_ops {
+	/* read @len bytes at guest physical address @addr into @val */
+	int (*read)(struct kvm_io_device *this,
+		    gpa_t addr,
+		    int len,
+		    void *val);
+	/* write @len bytes from @val to guest physical address @addr */
+	int (*write)(struct kvm_io_device *this,
+		     gpa_t addr,
+		     int len,
+		     const void *val);
+	/* invoked through kvm_iodevice_destructor() */
+	void (*destructor)(struct kvm_io_device *this);
+};
+
+
+/* An emulated I/O device: just a pointer to its callback table. */
+struct kvm_io_device {
+	const struct kvm_io_device_ops *ops;
+};
+
+/* Bind the callback table @ops to @dev. */
+static inline void kvm_iodevice_init(struct kvm_io_device *dev,
+				     const struct kvm_io_device_ops *ops)
+{
+	dev->ops = ops;
+}
+
+/*
+ * Forward a read of @l bytes at @addr to the device's read handler.
+ * Returns the handler's result, or -EOPNOTSUPP when the device has no
+ * read handler.
+ */
+static inline int kvm_iodevice_read(struct kvm_io_device *dev,
+				    gpa_t addr, int l, void *v)
+{
+	if (!dev->ops->read)
+		return -EOPNOTSUPP;
+
+	return dev->ops->read(dev, addr, l, v);
+}
+
+/*
+ * Forward a write of @l bytes at @addr to the device's write handler.
+ * Returns the handler's result, or -EOPNOTSUPP when the device has no
+ * write handler.
+ */
+static inline int kvm_iodevice_write(struct kvm_io_device *dev,
+				     gpa_t addr, int l, const void *v)
+{
+	if (!dev->ops->write)
+		return -EOPNOTSUPP;
+
+	return dev->ops->write(dev, addr, l, v);
+}
+
+/* Run the device's destructor callback, if it provides one. */
+static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
+{
+	if (!dev->ops->destructor)
+		return;
+
+	dev->ops->destructor(dev);
+}
+
+#endif /* __KVM_IODEV_H__ */
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
new file mode 100644
index 00000000..fb0f6e46
--- /dev/null
+++ b/virt/kvm/iommu.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Author: Allen M. Kay <allen.m.kay@intel.com>
+ * Author: Weidong Han <weidong.han@intel.com>
+ * Author: Ben-Ami Yassour <benami@il.ibm.com>
+ */
+
+#include <linux/list.h>
+#include <linux/kvm_host.h>
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/iommu.h>
+#include <linux/intel-iommu.h>
+
+static int kvm_iommu_unmap_memslots(struct kvm *kvm);
+static void kvm_iommu_put_pages(struct kvm *kvm,
+				gfn_t base_gfn, unsigned long npages);
+
+/*
+ * Look up (and thereby take a reference on) every page backing the
+ * @size bytes of guest memory that start at @gfn in @slot.
+ *
+ * Returns the pfn of the first page.  If that first lookup yields an
+ * error pfn it is returned immediately and no further page is touched;
+ * the results of the remaining lookups are not inspected.
+ */
+static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
+			   gfn_t gfn, unsigned long size)
+{
+	gfn_t next, limit = gfn + (size >> PAGE_SHIFT);
+	pfn_t first = gfn_to_pfn_memslot(kvm, slot, gfn);
+
+	if (is_error_pfn(first))
+		return first;
+
+	for (next = gfn + 1; next < limit; next++)
+		gfn_to_pfn_memslot(kvm, slot, next);
+
+	return first;
+}
+
+/*
+ * Map all guest pages of @slot into the VM's IOMMU domain.
+ *
+ * Does nothing and returns 0 when the VM has no IOMMU domain.  Pages that
+ * already have a translation, and pages whose pfn lookup fails, are
+ * skipped.  Each chunk is mapped with the largest page size that is
+ * backed by the host, fits inside the slot and matches gfn alignment.
+ *
+ * Returns 0 on success or the error from iommu_map().  On error every
+ * mapping established so far for this slot is torn down again.
+ */
+int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+	gfn_t gfn, end_gfn;
+	pfn_t pfn;
+	int r = 0;
+	struct iommu_domain *domain = kvm->arch.iommu_domain;
+	int flags;
+
+	/* check if iommu exists and in use */
+	if (!domain)
+		return 0;
+
+	gfn     = slot->base_gfn;
+	end_gfn = gfn + slot->npages;
+
+	flags = IOMMU_READ | IOMMU_WRITE;
+	if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)
+		flags |= IOMMU_CACHE;
+
+
+	while (gfn < end_gfn) {
+		unsigned long page_size, i;
+
+		/* Check if already mapped */
+		if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) {
+			gfn += 1;
+			continue;
+		}
+
+		/* Get the page size we could use to map */
+		page_size = kvm_host_page_size(kvm, gfn);
+
+		/* Make sure the page_size does not exceed the memslot */
+		while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn)
+			page_size >>= 1;
+
+		/* Make sure gfn is aligned to the page size we want to map */
+		while ((gfn << PAGE_SHIFT) & (page_size - 1))
+			page_size >>= 1;
+
+		/*
+		 * Pin all pages we are about to map in memory. This is
+		 * important because we unmap and unpin in 4kb steps later.
+		 */
+		pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
+		if (is_error_pfn(pfn)) {
+			gfn += 1;
+			continue;
+		}
+
+		/* Map into IO address space */
+		r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn),
+			      get_order(page_size), flags);
+		if (r) {
+			printk(KERN_ERR "kvm_iommu_map_address:"
+			       "iommu failed to map pfn=%llx\n", pfn);
+			/*
+			 * The chunk pinned just above never made it into the
+			 * IOMMU, so the unmap walk below cannot find it.
+			 * Drop those page references here.
+			 */
+			for (i = 0; i < (page_size >> PAGE_SHIFT); i++)
+				kvm_release_pfn_clean(pfn + i);
+			goto unmap_pages;
+		}
+
+		gfn += page_size >> PAGE_SHIFT;
+
+
+	}
+
+	return 0;
+
+unmap_pages:
+	/*
+	 * kvm_iommu_put_pages() takes a page count, not an end gfn: only
+	 * the [base_gfn, gfn) range processed so far must be undone.
+	 */
+	kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn);
+	return r;
+}
+
+/*
+ * Map every memory slot of @kvm into its IOMMU domain, holding the
+ * kvm->srcu read lock while the slot array is walked.  Stops at the first
+ * slot that fails and returns that error; returns 0 otherwise.
+ */
+static int kvm_iommu_map_memslots(struct kvm *kvm)
+{
+	struct kvm_memslots *slots;
+	int srcu_idx, n, r = 0;
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	slots = kvm_memslots(kvm);
+
+	n = 0;
+	while (n < slots->nmemslots) {
+		r = kvm_iommu_map_pages(kvm, &slots->memslots[n]);
+		if (r)
+			break;
+		n++;
+	}
+
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	return r;
+}
+
+/*
+ * Attach the host PCI device behind @assigned_dev to the VM's IOMMU
+ * domain.
+ *
+ * Returns 0 when the VM has no IOMMU domain or on success, -ENODEV when
+ * @assigned_dev has no pci_dev, or the error from iommu_attach_device() /
+ * kvm_iommu_map_memslots().
+ *
+ * If attaching the device turns on KVM_IOMMU_CACHE_COHERENCY for the first
+ * time, all memslots are unmapped and mapped again so that the new flag
+ * takes effect.  When that remap fails the memslots are unmapped again;
+ * the device is not detached on this path.
+ */
+int kvm_assign_device(struct kvm *kvm,
+		      struct kvm_assigned_dev_kernel *assigned_dev)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_domain *domain = kvm->arch.iommu_domain;
+	int r, last_flags;
+
+	/* check if iommu exists and in use */
+	if (!domain)
+		return 0;
+
+	pdev = assigned_dev->dev;
+	if (pdev == NULL)
+		return -ENODEV;
+
+	r = iommu_attach_device(domain, &pdev->dev);
+	if (r) {
+		/* terminate the message so it does not run into the next one */
+		printk(KERN_ERR "assign device %x:%x:%x.%x failed\n",
+			pci_domain_nr(pdev->bus),
+			pdev->bus->number,
+			PCI_SLOT(pdev->devfn),
+			PCI_FUNC(pdev->devfn));
+		return r;
+	}
+
+	last_flags = kvm->arch.iommu_flags;
+	if (iommu_domain_has_cap(kvm->arch.iommu_domain,
+				 IOMMU_CAP_CACHE_COHERENCY))
+		kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY;
+
+	/* Check if need to update IOMMU page table for guest memory */
+	if ((last_flags ^ kvm->arch.iommu_flags) ==
+			KVM_IOMMU_CACHE_COHERENCY) {
+		kvm_iommu_unmap_memslots(kvm);
+		r = kvm_iommu_map_memslots(kvm);
+		if (r)
+			goto out_unmap;
+	}
+
+	printk(KERN_DEBUG "assign device %x:%x:%x.%x\n",
+		assigned_dev->host_segnr,
+		assigned_dev->host_busnr,
+		PCI_SLOT(assigned_dev->host_devfn),
+		PCI_FUNC(assigned_dev->host_devfn));
+
+	return 0;
+out_unmap:
+	kvm_iommu_unmap_memslots(kvm);
+	return r;
+}
+
+/*
+ * Detach the host PCI device behind @assigned_dev from the VM's IOMMU
+ * domain.  Returns 0 when there is no domain or after detaching, and
+ * -ENODEV when @assigned_dev has no pci_dev.
+ */
+int kvm_deassign_device(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *assigned_dev)
+{
+	struct iommu_domain *domain = kvm->arch.iommu_domain;
+	struct pci_dev *pdev = NULL;
+
+	/* check if iommu exists and in use */
+	if (!domain)
+		return 0;
+
+	pdev = assigned_dev->dev;
+	if (pdev == NULL)
+		return -ENODEV;
+
+	iommu_detach_device(domain, &pdev->dev);
+
+	printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n",
+		assigned_dev->host_segnr,
+		assigned_dev->host_busnr,
+		PCI_SLOT(assigned_dev->host_devfn),
+		PCI_FUNC(assigned_dev->host_devfn));
+
+	return 0;
+}
+
+/*
+ * Allocate an IOMMU domain for @kvm and map all of its memslots into it.
+ *
+ * Returns -ENODEV when no IOMMU is present, -ENOMEM when the domain
+ * cannot be allocated, or the error from mapping the memslots (in which
+ * case whatever was mapped is unmapped again).  The domain itself stays
+ * in kvm->arch.iommu_domain on that last error path.
+ */
+int kvm_iommu_map_guest(struct kvm *kvm)
+{
+	int r;
+
+	if (!iommu_found()) {
+		printk(KERN_ERR "%s: iommu not found\n", __func__);
+		return -ENODEV;
+	}
+
+	kvm->arch.iommu_domain = iommu_domain_alloc();
+	if (!kvm->arch.iommu_domain)
+		return -ENOMEM;
+
+	r = kvm_iommu_map_memslots(kvm);
+	if (!r)
+		return 0;
+
+	/* mapping failed part-way: undo what was mapped */
+	kvm_iommu_unmap_memslots(kvm);
+	return r;
+}
+
+/* Release @npages consecutive page frames starting at @pfn. */
+static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages)
+{
+	while (npages--)
+		kvm_release_pfn_clean(pfn++);
+}
+
+/*
+ * Remove the IOMMU mappings for the @npages guest frames starting at
+ * @base_gfn and release the host pages behind them.  Does nothing when
+ * the VM has no IOMMU domain.
+ *
+ * Note that the third argument is a page count, not an end gfn.
+ */
+static void kvm_iommu_put_pages(struct kvm *kvm,
+				gfn_t base_gfn, unsigned long npages)
+{
+	struct iommu_domain *domain;
+	gfn_t end_gfn, gfn;
+	pfn_t pfn;
+	u64 phys;
+
+	domain  = kvm->arch.iommu_domain;
+	end_gfn = base_gfn + npages;
+	gfn     = base_gfn;
+
+	/* check if iommu exists and in use */
+	if (!domain)
+		return;
+
+	while (gfn < end_gfn) {
+		unsigned long unmap_pages;
+		int order;
+
+		/* Get physical address */
+		phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
+		pfn  = phys >> PAGE_SHIFT;
+
+		/*
+		 * Unmap address from IO address space.  Order 0 is requested;
+		 * the return value is used as the order that was actually
+		 * unmapped, so the walk steps over the whole mapping.
+		 */
+		order       = iommu_unmap(domain, gfn_to_gpa(gfn), 0);
+		unmap_pages = 1ULL << order;
+
+		/* Unpin all pages we just unmapped to not leak any memory */
+		kvm_unpin_pages(kvm, pfn, unmap_pages);
+
+		gfn += unmap_pages;
+	}
+}
+
+/* Unmap and release every page of @slot from the VM's IOMMU domain. */
+void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+	kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages);
+}
+
+/*
+ * Unmap all memslots of @kvm from its IOMMU domain.  The slot array is
+ * walked under the kvm->srcu read lock.  Always returns 0.
+ */
+static int kvm_iommu_unmap_memslots(struct kvm *kvm)
+{
+	int i, idx;
+	struct kvm_memslots *slots;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	slots = kvm_memslots(kvm);
+
+	for (i = 0; i < slots->nmemslots; i++)
+		kvm_iommu_unmap_pages(kvm, &slots->memslots[i]);
+
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	return 0;
+}
+
+/*
+ * Tear down the VM's IOMMU state: unmap all memslots, then free the
+ * domain.  Returns 0 in all cases, including when no domain exists.
+ * kvm->arch.iommu_domain is not cleared here.
+ */
+int kvm_iommu_unmap_guest(struct kvm *kvm)
+{
+	struct iommu_domain *domain = kvm->arch.iommu_domain;
+
+	/* check if iommu exists and in use */
+	if (!domain)
+		return 0;
+
+	kvm_iommu_unmap_memslots(kvm);
+	iommu_domain_free(domain);
+	return 0;
+}
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
new file mode 100644
index 00000000..9f614b4e
--- /dev/null
+++ b/virt/kvm/irq_comm.c
@@ -0,0 +1,474 @@
+/*
+ * irq_comm.c: Common API for in kernel interrupt controller
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <trace/events/kvm.h>
+
+#include <asm/msidef.h>
+#ifdef CONFIG_IA64
+#include <asm/iosapic.h>
+#endif
+
+#include "irq.h"
+
+#include "ioapic.h"
+
+/*
+ * Record the level that source @irq_source_id drives onto a line and
+ * return the resulting line level: 1 while at least one source bit is
+ * set in *@irq_state, 0 otherwise (logical OR of all sources).
+ */
+static inline int kvm_irq_line_state(unsigned long *irq_state,
+				     int irq_source_id, int level)
+{
+	if (!level)
+		clear_bit(irq_source_id, irq_state);
+	else
+		set_bit(irq_source_id, irq_state);
+
+	return *irq_state != 0;
+}
+
+/*
+ * Routing-entry ->set handler for PIC pins.  On x86 it folds @level from
+ * @irq_source_id into the pin's per-source state and forwards the combined
+ * level to the PIC model; on other architectures it returns -1.
+ */
+static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
+			   struct kvm *kvm, int irq_source_id, int level)
+{
+#ifdef CONFIG_X86
+	struct kvm_pic *pic = pic_irqchip(kvm);
+	level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin],
+				   irq_source_id, level);
+	return kvm_pic_set_irq(pic, e->irqchip.pin, level);
+#else
+	return -1;
+#endif
+}
+
+/*
+ * Routing-entry ->set handler for IOAPIC pins: fold @level from
+ * @irq_source_id into the pin's per-source state and forward the combined
+ * level to the IOAPIC model.
+ */
+static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
+			      struct kvm *kvm, int irq_source_id, int level)
+{
+	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+	level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
+				   irq_source_id, level);
+
+	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
+}
+
+/*
+ * True when @irq requests lowest-priority delivery.  The encoding of the
+ * delivery mode differs between ia64 (IOSAPIC) and the other
+ * architectures (APIC_DM_LOWEST).
+ */
+inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
+{
+#ifdef CONFIG_IA64
+	return irq->delivery_mode ==
+		(IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT);
+#else
+	return irq->delivery_mode == APIC_DM_LOWEST;
+#endif
+}
+
+/*
+ * Deliver @irq to the local APICs of all vcpus of @kvm that match its
+ * destination.
+ *
+ * For every delivery mode except lowest-priority, each matching vcpu gets
+ * the interrupt and the results of kvm_apic_set_irq() are summed.  For
+ * lowest-priority delivery only one vcpu receives it: among matching
+ * vcpus with an enabled lapic, the one kvm_apic_compare_prio() ranks
+ * lowest.
+ *
+ * Returns -1 when no vcpu was targeted, otherwise the (summed) result of
+ * kvm_apic_set_irq().
+ */
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+		struct kvm_lapic_irq *irq)
+{
+	int i, r = -1;
+	struct kvm_vcpu *vcpu, *lowest = NULL;
+
+	/* only logged; delivery still proceeds below */
+	if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
+			kvm_is_dm_lowest_prio(irq))
+		printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (!kvm_apic_present(vcpu))
+			continue;
+
+		if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
+					irq->dest_id, irq->dest_mode))
+			continue;
+
+		if (!kvm_is_dm_lowest_prio(irq)) {
+			/* first hit: switch from "nobody" (-1) to a count */
+			if (r < 0)
+				r = 0;
+			r += kvm_apic_set_irq(vcpu, irq);
+		} else if (kvm_lapic_enabled(vcpu)) {
+			/* remember the best lowest-priority candidate so far */
+			if (!lowest)
+				lowest = vcpu;
+			else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+				lowest = vcpu;
+		}
+	}
+
+	if (lowest)
+		r = kvm_apic_set_irq(lowest, irq);
+
+	return r;
+}
+
+/*
+ * Routing-entry ->set handler for MSI routes: decode the MSI address/data
+ * pair stored in @e into a kvm_lapic_irq and deliver it to the local
+ * APICs.  A @level of 0 is ignored and reported as -1.
+ */
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+		struct kvm *kvm, int irq_source_id, int level)
+{
+	struct kvm_lapic_irq irq;
+
+	if (!level)
+		return -1;
+
+	trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
+
+	irq.dest_id = (e->msi.address_lo &
+			MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
+	irq.vector = (e->msi.data &
+			MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
+	/* dest_mode and trig_mode keep the raw masked bit, not 0/1 */
+	irq.dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
+	irq.trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
+	/* bits 8-10 of the data word, left unshifted */
+	irq.delivery_mode = e->msi.data & 0x700;
+	irq.level = 1;
+	irq.shorthand = 0;
+
+	/* TODO Deal with RH bit of MSI message address */
+	return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
+}
+
+/*
+ * Set the level of GSI @irq on behalf of @irq_source_id by invoking the
+ * ->set handler of every routing entry mapped to that GSI.
+ *
+ * Return value:
+ *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
+ *  = 0   Interrupt was coalesced (previous irq is still pending)
+ *  > 0   Number of CPUs interrupt was delivered to
+ */
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
+{
+	struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
+	int ret = -1, i = 0;
+	struct kvm_irq_routing_table *irq_rt;
+	struct hlist_node *n;
+
+	trace_kvm_set_irq(irq, level, irq_source_id);
+
+	/* Not possible to detect if the guest uses the PIC or the
+	 * IOAPIC.  So set the bit in both. The guest will ignore
+	 * writes to the unused one.
+	 */
+	/*
+	 * Copy the matching entries to the stack under RCU so the handlers
+	 * can run after the read-side section ends.  irq_set[] has room for
+	 * KVM_NR_IRQCHIPS entries and the copy loop does no bounds check.
+	 */
+	rcu_read_lock();
+	irq_rt = rcu_dereference(kvm->irq_routing);
+	if (irq < irq_rt->nr_rt_entries)
+		hlist_for_each_entry(e, n, &irq_rt->map[irq], link)
+			irq_set[i++] = *e;
+	rcu_read_unlock();
+
+	while(i--) {
+		int r;
+		r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level);
+		if (r < 0)
+			continue;
+
+		/* sum the non-negative results; ret stays -1 if all failed */
+		ret = r + ((ret < 0) ? 0 : ret);
+	}
+
+	return ret;
+}
+
+/*
+ * Called when the guest acknowledged the interrupt on @pin of @irqchip.
+ * Translates (irqchip, pin) to its GSI via the routing table and, if one
+ * is routed there, calls every registered ack notifier for that GSI.
+ * Runs entirely inside an RCU read-side section.
+ */
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
+{
+	struct kvm_irq_ack_notifier *kian;
+	struct hlist_node *n;
+	int gsi;
+
+	trace_kvm_ack_irq(irqchip, pin);
+
+	rcu_read_lock();
+	gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+	/* -1 marks a pin with no GSI routed to it */
+	if (gsi != -1)
+		hlist_for_each_entry_rcu(kian, n, &kvm->irq_ack_notifier_list,
+					 link)
+			if (kian->gsi == gsi)
+				kian->irq_acked(kian);
+	rcu_read_unlock();
+}
+
+/* Add @kian to the VM's ack notifier list (RCU list, under irq_lock). */
+void kvm_register_irq_ack_notifier(struct kvm *kvm,
+				   struct kvm_irq_ack_notifier *kian)
+{
+	mutex_lock(&kvm->irq_lock);
+	hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
+	mutex_unlock(&kvm->irq_lock);
+}
+
+/*
+ * Remove @kian from the VM's ack notifier list and wait for an RCU grace
+ * period, so no reader is still walking over it when this returns.
+ */
+void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
+				    struct kvm_irq_ack_notifier *kian)
+{
+	mutex_lock(&kvm->irq_lock);
+	hlist_del_init_rcu(&kian->link);
+	mutex_unlock(&kvm->irq_lock);
+	synchronize_rcu();
+}
+
+/*
+ * Allocate an unused IRQ source id from the VM's bitmap (one bit per id,
+ * BITS_PER_LONG ids in total).  Returns the id, or -EFAULT when every id
+ * is already taken.
+ */
+int kvm_request_irq_source_id(struct kvm *kvm)
+{
+	unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
+	int irq_source_id;
+
+	mutex_lock(&kvm->irq_lock);
+	irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
+
+	if (irq_source_id >= BITS_PER_LONG) {
+		printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
+		irq_source_id = -EFAULT;
+		goto unlock;
+	}
+
+	ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+	set_bit(irq_source_id, bitmap);
+unlock:
+	mutex_unlock(&kvm->irq_lock);
+
+	return irq_source_id;
+}
+
+/*
+ * Return @irq_source_id to the VM's bitmap.  With an in-kernel irqchip,
+ * also clear that source's bit on every IOAPIC pin and (on x86) on the
+ * 16 PIC pins.  Out-of-range ids are logged and ignored.
+ */
+void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
+{
+	int i;
+
+	ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+
+	mutex_lock(&kvm->irq_lock);
+	if (irq_source_id < 0 ||
+	    irq_source_id >= BITS_PER_LONG) {
+		printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
+		goto unlock;
+	}
+	clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
+	if (!irqchip_in_kernel(kvm))
+		goto unlock;
+
+	for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) {
+		clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]);
+		/* only the first 16 indices are also cleared on the PIC */
+		if (i >= 16)
+			continue;
+#ifdef CONFIG_X86
+		clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]);
+#endif
+	}
+unlock:
+	mutex_unlock(&kvm->irq_lock);
+}
+
+/*
+ * Register @kimn for mask-state changes of GSI @irq.  The notifier is
+ * added to the VM's RCU-protected list under irq_lock.
+ */
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+				    struct kvm_irq_mask_notifier *kimn)
+{
+	mutex_lock(&kvm->irq_lock);
+	kimn->irq = irq;
+	hlist_add_head_rcu(&kimn->link, &kvm->mask_notifier_list);
+	mutex_unlock(&kvm->irq_lock);
+}
+
+/*
+ * Remove @kimn from the VM's mask notifier list and wait for an RCU grace
+ * period before returning.  The @irq argument is not used.
+ */
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+				      struct kvm_irq_mask_notifier *kimn)
+{
+	mutex_lock(&kvm->irq_lock);
+	hlist_del_rcu(&kimn->link);
+	mutex_unlock(&kvm->irq_lock);
+	synchronize_rcu();
+}
+
+/*
+ * Tell every mask notifier registered for the GSI routed to @pin of
+ * @irqchip that the line is now masked (@mask true) or unmasked.  Pins
+ * without a routed GSI (-1 in the routing table) are ignored.
+ */
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+			     bool mask)
+{
+	struct kvm_irq_mask_notifier *kimn;
+	struct hlist_node *n;
+	int gsi;
+
+	rcu_read_lock();
+	gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+	if (gsi != -1)
+		hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link)
+			if (kimn->irq == gsi)
+				kimn->func(kimn, mask);
+	rcu_read_unlock();
+}
+
+/* Free the VM's routing table; no RCU grace period is waited for. */
+void kvm_free_irq_routing(struct kvm *kvm)
+{
+	/* Called only during vm destruction. Nobody can use the pointer
+	   at this stage */
+	kfree(kvm->irq_routing);
+}
+
+/*
+ * Fill the kernel routing entry @e from the userspace description @ue and
+ * link it into @rt's per-GSI list.
+ *
+ * Returns 0 on success and -EINVAL when the route type or irqchip is
+ * unknown, the pin is out of range, or the entry would break the rules
+ * checked below for sharing a GSI.
+ */
+static int setup_routing_entry(struct kvm_irq_routing_table *rt,
+			       struct kvm_kernel_irq_routing_entry *e,
+			       const struct kvm_irq_routing_entry *ue)
+{
+	int r = -EINVAL;
+	int delta;
+	unsigned max_pin;
+	struct kvm_kernel_irq_routing_entry *ei;
+	struct hlist_node *n;
+
+	/*
+	 * Do not allow GSI to be mapped to the same irqchip more than once.
+	 * Allow only one to one mapping between GSI and MSI.
+	 *
+	 * The MSI rule has to hold in both directions: an MSI route may not
+	 * be added to a GSI that already has any entry.  Without the check
+	 * on ue->type the comparison below would read the irqchip member of
+	 * an MSI entry's union, and a GSI could end up with more entries
+	 * than kvm_set_irq()'s on-stack irq_set[KVM_NR_IRQCHIPS] can hold.
+	 */
+	hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link)
+		if (ei->type == KVM_IRQ_ROUTING_MSI ||
+		    ue->type == KVM_IRQ_ROUTING_MSI ||
+		    ue->u.irqchip.irqchip == ei->irqchip.irqchip)
+			return r;
+
+	e->gsi = ue->gsi;
+	e->type = ue->type;
+	switch (ue->type) {
+	case KVM_IRQ_ROUTING_IRQCHIP:
+		/* delta shifts slave PIC pins behind the master's eight */
+		delta = 0;
+		switch (ue->u.irqchip.irqchip) {
+		case KVM_IRQCHIP_PIC_MASTER:
+			e->set = kvm_set_pic_irq;
+			max_pin = 16;
+			break;
+		case KVM_IRQCHIP_PIC_SLAVE:
+			e->set = kvm_set_pic_irq;
+			max_pin = 16;
+			delta = 8;
+			break;
+		case KVM_IRQCHIP_IOAPIC:
+			max_pin = KVM_IOAPIC_NUM_PINS;
+			e->set = kvm_set_ioapic_irq;
+			break;
+		default:
+			goto out;
+		}
+		e->irqchip.irqchip = ue->u.irqchip.irqchip;
+		e->irqchip.pin = ue->u.irqchip.pin + delta;
+		if (e->irqchip.pin >= max_pin)
+			goto out;
+		/* reverse map (irqchip, pin) -> gsi used by the notifiers */
+		rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi;
+		break;
+	case KVM_IRQ_ROUTING_MSI:
+		e->set = kvm_set_msi;
+		e->msi.address_lo = ue->u.msi.address_lo;
+		e->msi.address_hi = ue->u.msi.address_hi;
+		e->msi.data = ue->u.msi.data;
+		break;
+	default:
+		goto out;
+	}
+
+	hlist_add_head(&e->link, &rt->map[e->gsi]);
+	r = 0;
+out:
+	return r;
+}
+
+
+/*
+ * Replace the VM's interrupt routing table with one built from the @nr
+ * userspace entries in @ue.
+ *
+ * Returns 0 on success, -EINVAL when an entry has a GSI of
+ * KVM_MAX_IRQ_ROUTES or more, non-zero flags, or is rejected by
+ * setup_routing_entry(), and -ENOMEM when the table cannot be allocated.
+ * The old table is freed after an RCU grace period; on failure the
+ * half-built new table is freed and the old one stays in place.
+ * The @flags argument is not used.
+ */
+int kvm_set_irq_routing(struct kvm *kvm,
+			const struct kvm_irq_routing_entry *ue,
+			unsigned nr,
+			unsigned flags)
+{
+	struct kvm_irq_routing_table *new, *old;
+	u32 i, j, nr_rt_entries = 0;
+	int r;
+
+	/* size the per-GSI map by the highest GSI mentioned */
+	for (i = 0; i < nr; ++i) {
+		if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES)
+			return -EINVAL;
+		nr_rt_entries = max(nr_rt_entries, ue[i].gsi);
+	}
+
+	nr_rt_entries += 1;
+
+	/* one allocation: header, per-GSI list heads, then the entries */
+	new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head))
+		      + (nr * sizeof(struct kvm_kernel_irq_routing_entry)),
+		      GFP_KERNEL);
+
+	if (!new)
+		return -ENOMEM;
+
+	new->rt_entries = (void *)&new->map[nr_rt_entries];
+
+	new->nr_rt_entries = nr_rt_entries;
+	/*
+	 * Mark every (irqchip, pin) as unrouted.
+	 * NOTE(review): the literal 3 presumably stands for KVM_NR_IRQCHIPS;
+	 * confirm against the declaration of chip[][].
+	 */
+	for (i = 0; i < 3; i++)
+		for (j = 0; j < KVM_IOAPIC_NUM_PINS; j++)
+			new->chip[i][j] = -1;
+
+	for (i = 0; i < nr; ++i) {
+		r = -EINVAL;
+		if (ue->flags)
+			goto out;
+		r = setup_routing_entry(new, &new->rt_entries[i], ue);
+		if (r)
+			goto out;
+		++ue;
+	}
+
+	mutex_lock(&kvm->irq_lock);
+	old = kvm->irq_routing;
+	kvm_irq_routing_update(kvm, new);
+	mutex_unlock(&kvm->irq_lock);
+
+	/* wait until no reader can still hold the old table */
+	synchronize_rcu();
+
+	/* from here on the table to free is the old one */
+	new = old;
+	r = 0;
+
+out:
+	kfree(new);
+	return r;
+}
+
+/*
+ * Initializer helpers for the default routing table.
+ * ROUTING_ENTRY1(irq) routes GSI irq to IOAPIC pin irq.
+ * ROUTING_ENTRY2(irq) does the same and, on x86 only, adds a second entry
+ * that routes the GSI to pin (irq % 8) of the PIC chosen by SELECT_PIC().
+ */
+#define IOAPIC_ROUTING_ENTRY(irq) \
+	{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP,	\
+	  .u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC, .u.irqchip.pin = (irq) }
+#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
+
+#ifdef CONFIG_X86
+#  define PIC_ROUTING_ENTRY(irq) \
+	{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP,	\
+	  .u.irqchip.irqchip = SELECT_PIC(irq), .u.irqchip.pin = (irq) % 8 }
+#  define ROUTING_ENTRY2(irq) \
+	IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
+#else
+#  define ROUTING_ENTRY2(irq) \
+	IOAPIC_ROUTING_ENTRY(irq)
+#endif
+
+/*
+ * Default GSI routing: GSIs 0-15 use ROUTING_ENTRY2, GSIs 16-23 go to the
+ * IOAPIC only, and ia64 additionally routes GSIs 24-47 to the IOAPIC.
+ */
+static const struct kvm_irq_routing_entry default_routing[] = {
+	ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
+	ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
+	ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
+	ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
+	ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
+	ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
+	ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
+	ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
+	ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
+	ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
+	ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
+	ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
+#ifdef CONFIG_IA64
+	ROUTING_ENTRY1(24), ROUTING_ENTRY1(25),
+	ROUTING_ENTRY1(26), ROUTING_ENTRY1(27),
+	ROUTING_ENTRY1(28), ROUTING_ENTRY1(29),
+	ROUTING_ENTRY1(30), ROUTING_ENTRY1(31),
+	ROUTING_ENTRY1(32), ROUTING_ENTRY1(33),
+	ROUTING_ENTRY1(34), ROUTING_ENTRY1(35),
+	ROUTING_ENTRY1(36), ROUTING_ENTRY1(37),
+	ROUTING_ENTRY1(38), ROUTING_ENTRY1(39),
+	ROUTING_ENTRY1(40), ROUTING_ENTRY1(41),
+	ROUTING_ENTRY1(42), ROUTING_ENTRY1(43),
+	ROUTING_ENTRY1(44), ROUTING_ENTRY1(45),
+	ROUTING_ENTRY1(46), ROUTING_ENTRY1(47),
+#endif
+};
+
+/* Install default_routing as the VM's routing table; returns 0 or -errno. */
+int kvm_setup_default_irq_routing(struct kvm *kvm)
+{
+	return kvm_set_irq_routing(kvm, default_routing,
+				   ARRAY_SIZE(default_routing), 0);
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
new file mode 100644
index 00000000..6b39ba95
--- /dev/null
+++ b/virt/kvm/kvm_main.c
@@ -0,0 +1,2635 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "iodev.h"
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/percpu.h>
+#include <linux/mm.h>
+#include <linux/miscdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/reboot.h>
+#include <linux/debugfs.h>
+#include <linux/highmem.h>
+#include <linux/file.h>
+#include <linux/syscore_ops.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/smp.h>
+#include <linux/anon_inodes.h>
+#include <linux/profile.h>
+#include <linux/kvm_para.h>
+#include <linux/pagemap.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/bitops.h>
+#include <linux/spinlock.h>
+#include <linux/compat.h>
+#include <linux/srcu.h>
+#include <linux/hugetlb.h>
+#include <linux/slab.h>
+
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+
+#include "coalesced_mmio.h"
+#include "async_pf.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/kvm.h>
+
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+/*
+ * Ordering of locks:
+ *
+ * 		kvm->lock --> kvm->slots_lock --> kvm->irq_lock
+ */
+
+DEFINE_RAW_SPINLOCK(kvm_lock);
+LIST_HEAD(vm_list);
+
+static cpumask_var_t cpus_hardware_enabled;
+static int kvm_usage_count = 0;
+static atomic_t hardware_enable_failed;
+
+struct kmem_cache *kvm_vcpu_cache;
+EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
+
+static __read_mostly struct preempt_ops kvm_preempt_ops;
+
+struct dentry *kvm_debugfs_dir;
+
+static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
+			   unsigned long arg);
+static int hardware_enable_all(void);
+static void hardware_disable_all(void);
+
+static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
+
+bool kvm_rebooting;
+EXPORT_SYMBOL_GPL(kvm_rebooting);
+
+static bool largepages_enabled = true;
+
+static struct page *hwpoison_page;
+static pfn_t hwpoison_pfn;
+
+static struct page *fault_page;
+static pfn_t fault_pfn;
+
+/*
+ * Decide whether @pfn is to be treated as MMIO: true when the pfn has no
+ * valid struct page, otherwise the PageReserved state of the page (taken
+ * from the head page while @pfn is still a tail of a compound page).
+ */
+inline int kvm_is_mmio_pfn(pfn_t pfn)
+{
+	if (pfn_valid(pfn)) {
+		int reserved;
+		struct page *tail = pfn_to_page(pfn);
+		struct page *head = compound_trans_head(tail);
+		reserved = PageReserved(head);
+		if (head != tail) {
+			/*
+			 * "head" is not a dangling pointer
+			 * (compound_trans_head takes care of that)
+			 * but the hugepage may have been split
+			 * from under us (and we may not hold a
+			 * reference count on the head page so it can
+			 * be reused before we run PageReserved), so
+			 * we have to check PageTail before returning
+			 * what we just read.
+			 */
+			smp_rmb();
+			if (PageTail(tail))
+				return reserved;
+		}
+		return PageReserved(tail);
+	}
+
+	return true;
+}
+
+/*
+ * Switches to specified vcpu, until a matching vcpu_put()
+ *
+ * Takes vcpu->mutex and returns with it held.  If a different thread than
+ * last time is loading the vcpu, vcpu->pid is switched to the current
+ * task's pid and the old pid reference is dropped after an RCU grace
+ * period.
+ */
+void vcpu_load(struct kvm_vcpu *vcpu)
+{
+	int cpu;
+
+	mutex_lock(&vcpu->mutex);
+	if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
+		/* The thread running this VCPU changed. */
+		struct pid *oldpid = vcpu->pid;
+		struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
+		rcu_assign_pointer(vcpu->pid, newpid);
+		synchronize_rcu();
+		put_pid(oldpid);
+	}
+	/* register the notifier and do the arch load on one fixed cpu */
+	cpu = get_cpu();
+	preempt_notifier_register(&vcpu->preempt_notifier);
+	kvm_arch_vcpu_load(vcpu, cpu);
+	put_cpu();
+}
+
+/*
+ * Counterpart of vcpu_load(): do the arch put and unregister the preempt
+ * notifier with preemption disabled, then release vcpu->mutex.
+ */
+void vcpu_put(struct kvm_vcpu *vcpu)
+{
+	preempt_disable();
+	kvm_arch_vcpu_put(vcpu);
+	preempt_notifier_unregister(&vcpu->preempt_notifier);
+	preempt_enable();
+	mutex_unlock(&vcpu->mutex);
+}
+
+/*
+ * Intentionally empty IPI handler: make_all_cpus_request() only needs the
+ * cross-call itself to reach the target cpus.
+ */
+static void ack_flush(void *_completed)
+{
+}
+
+/*
+ * Post request @req on every vcpu of @kvm and send an IPI to the cpus
+ * whose vcpu is not already outside guest mode.
+ *
+ * Returns true when an IPI round was issued, false when no cpu needed
+ * one.  If the temporary cpumask cannot be allocated, all online cpus
+ * are interrupted instead.
+ */
+static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
+{
+	int i, cpu, me;
+	cpumask_var_t cpus;
+	bool called = true;
+	struct kvm_vcpu *vcpu;
+
+	/* result deliberately unchecked: cpus == NULL is handled below */
+	zalloc_cpumask_var(&cpus, GFP_ATOMIC);
+
+	me = get_cpu();
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		kvm_make_request(req, vcpu);
+		cpu = vcpu->cpu;
+
+		/* Set ->requests bit before we read ->mode */
+		smp_mb();
+
+		/* skip vcpus not loaded on a cpu (-1) and our own cpu */
+		if (cpus != NULL && cpu != -1 && cpu != me &&
+		      kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
+			cpumask_set_cpu(cpu, cpus);
+	}
+	if (unlikely(cpus == NULL))
+		smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
+	else if (!cpumask_empty(cpus))
+		smp_call_function_many(cpus, ack_flush, NULL, 1);
+	else
+		called = false;
+	put_cpu();
+	free_cpumask_var(cpus);
+	return called;
+}
+
+/*
+ * Request a TLB flush on every vcpu of @kvm and count it in the
+ * remote_tlb_flush statistic when an IPI round was actually needed.
+ */
+void kvm_flush_remote_tlbs(struct kvm *kvm)
+{
+	int dirty_count = kvm->tlbs_dirty;
+
+	smp_mb();
+	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
+		++kvm->stat.remote_tlb_flush;
+	/* reset tlbs_dirty only if it did not change since it was sampled */
+	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
+}
+
+/* Post KVM_REQ_MMU_RELOAD on every vcpu of @kvm. */
+void kvm_reload_remote_mmus(struct kvm *kvm)
+{
+	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
+}
+
+/*
+ * Generic part of vcpu creation: initialise the common fields of @vcpu,
+ * allocate the zeroed page used as vcpu->run and call the arch init hook.
+ *
+ * Returns 0 on success, -ENOMEM when the run page cannot be allocated, or
+ * the negative error from kvm_arch_vcpu_init() (the run page is freed
+ * again in that case).
+ */
+int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+{
+	struct page *page;
+	int r;
+
+	mutex_init(&vcpu->mutex);
+	/* -1: not loaded on any cpu yet */
+	vcpu->cpu = -1;
+	vcpu->kvm = kvm;
+	vcpu->vcpu_id = id;
+	vcpu->pid = NULL;
+	init_waitqueue_head(&vcpu->wq);
+	kvm_async_pf_vcpu_init(vcpu);
+
+	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!page) {
+		r = -ENOMEM;
+		goto fail;
+	}
+	vcpu->run = page_address(page);
+
+	r = kvm_arch_vcpu_init(vcpu);
+	if (r < 0)
+		goto fail_free_run;
+	return 0;
+
+fail_free_run:
+	free_page((unsigned long)vcpu->run);
+fail:
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_init);
+
+/*
+ * Undo kvm_vcpu_init(): drop the pid reference, run the arch uninit hook
+ * and free the run page.
+ */
+void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	put_pid(vcpu->pid);
+	kvm_arch_vcpu_uninit(vcpu);
+	free_page((unsigned long)vcpu->run);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
+
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+/* Map an embedded mmu_notifier back to the struct kvm that contains it. */
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
+{
+	return container_of(mn, struct kvm, mmu_notifier);
+}
+
+/*
+ * mmu notifier callback for a single invalidated host address: bump the
+ * notifier sequence, drop the shadow mappings for @address and flush the
+ * remote TLBs if anything was unmapped or a flush was already pending.
+ */
+static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+					     struct mm_struct *mm,
+					     unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int need_tlb_flush, idx;
+
+	/*
+	 * When ->invalidate_page runs, the linux pte has been zapped
+	 * already but the page is still allocated until
+	 * ->invalidate_page returns. So if we increase the sequence
+	 * here the kvm page fault will notice if the spte can't be
+	 * established because the page is going to be freed. If
+	 * instead the kvm page fault establishes the spte before
+	 * ->invalidate_page runs, kvm_unmap_hva will release it
+	 * before returning.
+	 *
+	 * The sequence increase only need to be seen at spin_unlock
+	 * time, and not at spin_lock time.
+	 *
+	 * Increasing the sequence after the spin_unlock would be
+	 * unsafe because the kvm page fault could then establish the
+	 * pte after kvm_unmap_hva returned, without noticing the page
+	 * is going to be freed.
+	 */
+	idx = srcu_read_lock(&kvm->srcu);
+	spin_lock(&kvm->mmu_lock);
+	kvm->mmu_notifier_seq++;
+	need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	/* we've to flush the tlb before the pages can be freed */
+	if (need_tlb_flush)
+		kvm_flush_remote_tlbs(kvm);
+
+}
+
+/*
+ * mmu notifier callback for a changed host pte: bump the notifier
+ * sequence and pass the new @pte for @address to kvm_set_spte_hva(),
+ * all under mmu_lock.
+ */
+static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
+					struct mm_struct *mm,
+					unsigned long address,
+					pte_t pte)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	spin_lock(&kvm->mmu_lock);
+	kvm->mmu_notifier_seq++;
+	kvm_set_spte_hva(kvm, address, pte);
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
+}
+
+/*
+ * mmu notifier callback that opens an invalidation of [@start, @end):
+ * raise mmu_notifier_count, unmap the range page by page and flush the
+ * remote TLBs if anything was unmapped or a flush was already pending.
+ * The count is lowered again in the matching range_end callback.
+ */
+static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+						    struct mm_struct *mm,
+						    unsigned long start,
+						    unsigned long end)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int need_tlb_flush = 0, idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	spin_lock(&kvm->mmu_lock);
+	/*
+	 * The count increase must become visible at unlock time as no
+	 * spte can be established without taking the mmu_lock and
+	 * count is also read inside the mmu_lock critical section.
+	 */
+	kvm->mmu_notifier_count++;
+	for (; start < end; start += PAGE_SIZE)
+		need_tlb_flush |= kvm_unmap_hva(kvm, start);
+	need_tlb_flush |= kvm->tlbs_dirty;
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	/* we've to flush the tlb before the pages can be freed */
+	if (need_tlb_flush)
+		kvm_flush_remote_tlbs(kvm);
+}
+
+/*
+ * mmu notifier callback that closes a range invalidation: bump the
+ * notifier sequence and lower mmu_notifier_count under mmu_lock.
+ * @start and @end are not used here.
+ */
+static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+						  struct mm_struct *mm,
+						  unsigned long start,
+						  unsigned long end)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+
+	spin_lock(&kvm->mmu_lock);
+	/*
+	 * This sequence increase will notify the kvm page fault that
+	 * the page that is going to be mapped in the spte could have
+	 * been freed.
+	 */
+	kvm->mmu_notifier_seq++;
+	/*
+	 * The above sequence increase must be visible before the
+	 * below count decrease but both values are read by the kvm
+	 * page fault under mmu_lock spinlock so we don't need to add
+	 * a smp_wmb() here in between the two.
+	 */
+	kvm->mmu_notifier_count--;
+	spin_unlock(&kvm->mmu_lock);
+
+	/* an unbalanced end without a matching start is a bug */
+	BUG_ON(kvm->mmu_notifier_count < 0);
+}
+
+/*
+ * .clear_flush_young hook of kvm_mmu_notifier_ops.
+ *
+ * Calls kvm_age_hva() for @address under mmu_lock and, when that returns
+ * non-zero, flushes remote TLBs.  Returns the kvm_age_hva() result.
+ * @mm is not used.
+ */
+static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
+					      struct mm_struct *mm,
+					      unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int srcu_idx;
+	int was_young;
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	spin_lock(&kvm->mmu_lock);
+	was_young = kvm_age_hva(kvm, address);
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	if (!was_young)
+		return was_young;
+
+	kvm_flush_remote_tlbs(kvm);
+	return was_young;
+}
+
+/*
+ * .test_young hook of kvm_mmu_notifier_ops.
+ *
+ * Returns kvm_test_age_hva() for @address, evaluated under mmu_lock inside
+ * an SRCU read-side section.  No TLB flush is issued.  @mm is not used.
+ */
+static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
+				       struct mm_struct *mm,
+				       unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int srcu_idx = srcu_read_lock(&kvm->srcu);
+	int is_young;
+
+	spin_lock(&kvm->mmu_lock);
+	is_young = kvm_test_age_hva(kvm, address);
+	spin_unlock(&kvm->mmu_lock);
+
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	return is_young;
+}
+
+/*
+ * .release hook of kvm_mmu_notifier_ops: flushes the shadow state through
+ * kvm_arch_flush_shadow() inside an SRCU read-side section.
+ * @mm is not used.
+ */
+static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
+				     struct mm_struct *mm)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int srcu_idx = srcu_read_lock(&kvm->srcu);
+
+	kvm_arch_flush_shadow(kvm);
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+}
+
+/*
+ * Callback table handed to the mmu notifier core by
+ * kvm_init_mmu_notifier(); each entry is one of the
+ * kvm_mmu_notifier_*() handlers defined above.
+ */
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
+	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
+	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
+	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
+	.test_young		= kvm_mmu_notifier_test_young,
+	.change_pte		= kvm_mmu_notifier_change_pte,
+	.release		= kvm_mmu_notifier_release,
+};
+
+/*
+ * Point the VM's embedded mmu_notifier at kvm_mmu_notifier_ops and register
+ * it against the calling task's mm.  Returns whatever
+ * mmu_notifier_register() returns.
+ */
+static int kvm_init_mmu_notifier(struct kvm *kvm)
+{
+	struct mmu_notifier *notifier = &kvm->mmu_notifier;
+
+	notifier->ops = &kvm_mmu_notifier_ops;
+	return mmu_notifier_register(notifier, current->mm);
+}
+
+#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
+
+/*
+ * Stub used when mmu notifiers are not compiled in for KVM: there is
+ * nothing to register, so always report success.
+ */
+static int kvm_init_mmu_notifier(struct kvm *kvm)
+{
+	return 0;
+}
+
+#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
+
+/*
+ * Allocate and initialise a new VM bound to the calling task's mm.
+ *
+ * On success the returned struct kvm has users_count set to 1, holds a
+ * reference on current->mm (mm_count) and is linked on the global vm_list.
+ * On failure an ERR_PTR() carrying the error code is returned and
+ * everything acquired so far is released through the staged out_err*
+ * labels.
+ */
+static struct kvm *kvm_create_vm(void)
+{
+	int r, i;
+	struct kvm *kvm = kvm_arch_alloc_vm();
+
+	if (!kvm)
+		return ERR_PTR(-ENOMEM);
+
+	r = kvm_arch_init_vm(kvm);
+	if (r)
+		goto out_err_nodisable;
+
+	r = hardware_enable_all();
+	if (r)
+		goto out_err_nodisable;
+
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+	INIT_HLIST_HEAD(&kvm->mask_notifier_list);
+	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
+#endif
+
+	/* Every allocation failure below reports -ENOMEM. */
+	r = -ENOMEM;
+	kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+	if (!kvm->memslots)
+		goto out_err_nosrcu;
+	if (init_srcu_struct(&kvm->srcu))
+		goto out_err_nosrcu;
+	for (i = 0; i < KVM_NR_BUSES; i++) {
+		kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
+					GFP_KERNEL);
+		if (!kvm->buses[i])
+			goto out_err;
+	}
+
+	spin_lock_init(&kvm->mmu_lock);
+	kvm->mm = current->mm;
+	/* Pin the mm; released by mmdrop() in kvm_destroy_vm(). */
+	atomic_inc(&kvm->mm->mm_count);
+	kvm_eventfd_init(kvm);
+	mutex_init(&kvm->lock);
+	mutex_init(&kvm->irq_lock);
+	mutex_init(&kvm->slots_lock);
+	atomic_set(&kvm->users_count, 1);
+
+	r = kvm_init_mmu_notifier(kvm);
+	if (r) {
+		/*
+		 * Drop the mm_count reference taken above: none of the
+		 * out_err* labels release it, so it would otherwise leak.
+		 */
+		mmdrop(kvm->mm);
+		goto out_err;
+	}
+
+	raw_spin_lock(&kvm_lock);
+	list_add(&kvm->vm_list, &vm_list);
+	raw_spin_unlock(&kvm_lock);
+
+	return kvm;
+
+out_err:
+	cleanup_srcu_struct(&kvm->srcu);
+out_err_nosrcu:
+	hardware_disable_all();
+out_err_nodisable:
+	/* kfree(NULL) is a no-op, so unallocated slots are harmless here. */
+	for (i = 0; i < KVM_NR_BUSES; i++)
+		kfree(kvm->buses[i]);
+	kfree(kvm->memslots);
+	kvm_arch_free_vm(kvm);
+	return ERR_PTR(r);
+}
+
+/*
+ * Release the dirty bitmap of @memslot, if it has one, and clear both
+ * bitmap pointers.  The doubled bitmap size decides between vfree() and
+ * kfree(), matching the size test in kvm_create_dirty_bitmap().
+ */
+static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
+{
+	if (memslot->dirty_bitmap) {
+		if (2 * kvm_dirty_bitmap_bytes(memslot) <= PAGE_SIZE)
+			kfree(memslot->dirty_bitmap_head);
+		else
+			vfree(memslot->dirty_bitmap_head);
+
+		memslot->dirty_bitmap = NULL;
+		memslot->dirty_bitmap_head = NULL;
+	}
+}
+
+/*
+ * Release the memory owned by slot @free, except for anything that slot
+ * @dont still points to.  A NULL @dont means "free everything".
+ * Afterwards @free has npages == 0 and a NULL rmap.
+ */
+static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
+				  struct kvm_memory_slot *dont)
+{
+	int lvl;
+
+	if (!dont || dont->rmap != free->rmap)
+		vfree(free->rmap);
+
+	if (!dont || dont->dirty_bitmap != free->dirty_bitmap)
+		kvm_destroy_dirty_bitmap(free);
+
+	for (lvl = 0; lvl < KVM_NR_PAGE_SIZES - 1; ++lvl) {
+		if (dont && dont->lpage_info[lvl] == free->lpage_info[lvl])
+			continue;
+
+		vfree(free->lpage_info[lvl]);
+		free->lpage_info[lvl] = NULL;
+	}
+
+	free->rmap = NULL;
+	free->npages = 0;
+}
+
+/*
+ * Free every slot recorded in kvm->memslots (the first nmemslots entries)
+ * and then the memslots container itself.
+ */
+void kvm_free_physmem(struct kvm *kvm)
+{
+	struct kvm_memslots *slots = kvm->memslots;
+	int slot_nr = 0;
+
+	while (slot_nr < slots->nmemslots) {
+		kvm_free_physmem_slot(&slots->memslots[slot_nr], NULL);
+		++slot_nr;
+	}
+
+	kfree(kvm->memslots);
+}
+
+/*
+ * Tear down a VM.  Called from kvm_put_kvm() once users_count has dropped
+ * to zero.  The mm pointer is saved up front because @kvm itself is freed
+ * (kvm_arch_free_vm()) before the final mmdrop().
+ */
+static void kvm_destroy_vm(struct kvm *kvm)
+{
+	int i;
+	struct mm_struct *mm = kvm->mm;
+
+	kvm_arch_sync_events(kvm);
+	/* Unlink from the global VM list under kvm_lock. */
+	raw_spin_lock(&kvm_lock);
+	list_del(&kvm->vm_list);
+	raw_spin_unlock(&kvm_lock);
+	kvm_free_irq_routing(kvm);
+	for (i = 0; i < KVM_NR_BUSES; i++)
+		kvm_io_bus_destroy(kvm->buses[i]);
+	kvm_coalesced_mmio_free(kvm);
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+#else
+	/* No notifier registered: flush the shadow state directly. */
+	kvm_arch_flush_shadow(kvm);
+#endif
+	kvm_arch_destroy_vm(kvm);
+	kvm_free_physmem(kvm);
+	cleanup_srcu_struct(&kvm->srcu);
+	kvm_arch_free_vm(kvm);
+	hardware_disable_all();
+	/* Drops the mm_count reference taken in kvm_create_vm(). */
+	mmdrop(mm);
+}
+
+/* Take an additional reference on @kvm (users_count); pair with kvm_put_kvm(). */
+void kvm_get_kvm(struct kvm *kvm)
+{
+	atomic_inc(&kvm->users_count);
+}
+EXPORT_SYMBOL_GPL(kvm_get_kvm);
+
+/*
+ * Drop one reference on @kvm; the VM is destroyed when the last
+ * reference goes away.
+ */
+void kvm_put_kvm(struct kvm *kvm)
+{
+	if (!atomic_dec_and_test(&kvm->users_count))
+		return;
+
+	kvm_destroy_vm(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_put_kvm);
+
+
+/*
+ * Release handler for a file whose private_data is a struct kvm: shuts
+ * down the VM's irqfds and drops the file's reference on the VM.
+ * Always returns 0.  @inode is not used.
+ */
+static int kvm_vm_release(struct inode *inode, struct file *filp)
+{
+	struct kvm *vm = filp->private_data;
+
+	kvm_irqfd_release(vm);
+	kvm_put_kvm(vm);
+
+	return 0;
+}
+
+#ifndef CONFIG_S390
+/*
+ * Allocate a zeroed dirty bitmap for @memslot.
+ *
+ * Twice the actual bitmap size is allocated so that double buffering is
+ * possible: see x86's kvm_vm_ioctl_get_dirty_log().  Allocations larger
+ * than a page come from vzalloc(), smaller ones from kzalloc().
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
+{
+	unsigned long alloc_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
+
+	memslot->dirty_bitmap = (alloc_bytes > PAGE_SIZE) ?
+		vzalloc(alloc_bytes) : kzalloc(alloc_bytes, GFP_KERNEL);
+	if (!memslot->dirty_bitmap)
+		return -ENOMEM;
+
+	/* Remember the allocation start; it is what gets freed later. */
+	memslot->dirty_bitmap_head = memslot->dirty_bitmap;
+	return 0;
+}
+#endif /* !CONFIG_S390 */
+
+/*
+ * Allocate some memory and give it an address in the guest physical address
+ * space.
+ *
+ * Discontiguous memory is allowed, mostly for framebuffers.
+ *
+ * Must be called holding mmap_sem for write.
+ * NOTE(review): the wrapper visible in this file, kvm_set_memory_region(),
+ * takes kvm->slots_lock around this call and does not touch mmap_sem;
+ * confirm which lock the statement above really refers to.
+ *
+ * @kvm:        VM whose memslot table is updated.
+ * @mem:        requested slot description; a memory_size of 0 deletes the
+ *              slot.  mem->flags may be modified (dirty logging is cleared
+ *              for a deletion).
+ * @user_alloc: passed through to the arch hooks; when set, userspace_addr
+ *              is validated with access_ok().
+ *
+ * Returns 0 on success, -EINVAL for malformed requests or an attempt to
+ * resize an existing slot, -EEXIST when the range overlaps another slot,
+ * -ENOMEM on allocation failure, or an error from the arch / iommu hooks.
+ */
+int __kvm_set_memory_region(struct kvm *kvm,
+			    struct kvm_userspace_memory_region *mem,
+			    int user_alloc)
+{
+	int r;
+	gfn_t base_gfn;
+	unsigned long npages;
+	unsigned long i;
+	struct kvm_memory_slot *memslot;
+	struct kvm_memory_slot old, new;
+	struct kvm_memslots *slots, *old_memslots;
+
+	r = -EINVAL;
+	/* General sanity checks */
+	if (mem->memory_size & (PAGE_SIZE - 1))
+		goto out;
+	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
+		goto out;
+	/* We can read the guest memory with __xxx_user() later on. */
+	if (user_alloc &&
+	    ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
+	     !access_ok(VERIFY_WRITE,
+			(void __user *)(unsigned long)mem->userspace_addr,
+			mem->memory_size)))
+		goto out;
+	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+		goto out;
+	/* Reject a guest-physical range that wraps around. */
+	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
+		goto out;
+
+	memslot = &kvm->memslots->memslots[mem->slot];
+	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
+	npages = mem->memory_size >> PAGE_SHIFT;
+
+	r = -EINVAL;
+	if (npages > KVM_MEM_MAX_NR_PAGES)
+		goto out;
+
+	/* A slot that is being deleted cannot have dirty logging enabled. */
+	if (!npages)
+		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
+
+	/* Work on copies; the live slot is only replaced at the very end. */
+	new = old = *memslot;
+
+	new.id = mem->slot;
+	new.base_gfn = base_gfn;
+	new.npages = npages;
+	new.flags = mem->flags;
+
+	/* Disallow changing a memory slot's size. */
+	r = -EINVAL;
+	if (npages && old.npages && npages != old.npages)
+		goto out_free;
+
+	/* Check for overlaps */
+	r = -EEXIST;
+	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+		struct kvm_memory_slot *s = &kvm->memslots->memslots[i];
+
+		if (s == memslot || !s->npages)
+			continue;
+		if (!((base_gfn + npages <= s->base_gfn) ||
+		      (base_gfn >= s->base_gfn + s->npages)))
+			goto out_free;
+	}
+
+	/* Free page dirty bitmap if unneeded */
+	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
+		new.dirty_bitmap = NULL;
+
+	r = -ENOMEM;
+
+	/* Allocate if a slot is being created */
+#ifndef CONFIG_S390
+	if (npages && !new.rmap) {
+		new.rmap = vzalloc(npages * sizeof(*new.rmap));
+
+		if (!new.rmap)
+			goto out_free;
+
+		new.user_alloc = user_alloc;
+		new.userspace_addr = mem->userspace_addr;
+	}
+	if (!npages)
+		goto skip_lpage;
+
+	/* One lpage_info array per large-page level (level = i + 2). */
+	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+		unsigned long ugfn;
+		unsigned long j;
+		int lpages;
+		int level = i + 2;
+
+		/* Avoid unused variable warning if no large pages */
+		(void)level;
+
+		if (new.lpage_info[i])
+			continue;
+
+		/* Number of large pages at this level touched by the slot. */
+		lpages = 1 + ((base_gfn + npages - 1)
+			     >> KVM_HPAGE_GFN_SHIFT(level));
+		lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
+
+		new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
+
+		if (!new.lpage_info[i])
+			goto out_free;
+
+		/*
+		 * A slot start or end that is not large-page aligned gets
+		 * write_count = 1 on the first / last entry respectively.
+		 */
+		if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+			new.lpage_info[i][0].write_count = 1;
+		if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+			new.lpage_info[i][lpages - 1].write_count = 1;
+		ugfn = new.userspace_addr >> PAGE_SHIFT;
+		/*
+		 * If the gfn and userspace address are not aligned wrt each
+		 * other, or if explicitly asked to, disable large page
+		 * support for this slot
+		 */
+		if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
+		    !largepages_enabled)
+			for (j = 0; j < lpages; ++j)
+				new.lpage_info[i][j].write_count = 1;
+	}
+
+skip_lpage:
+
+	/* Allocate page dirty bitmap if needed */
+	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
+		if (kvm_create_dirty_bitmap(&new) < 0)
+			goto out_free;
+		/*
+		 * destroy any largepage mappings for dirty tracking
+		 * NOTE(review): no code follows this remark in this function.
+		 */
+	}
+#else  /* not defined CONFIG_S390 */
+	new.user_alloc = user_alloc;
+	if (user_alloc)
+		new.userspace_addr = mem->userspace_addr;
+#endif /* not defined CONFIG_S390 */
+
+	/*
+	 * Deletion, step one: publish a copy of the table with this slot
+	 * flagged KVM_MEMSLOT_INVALID, wait for SRCU readers and flush the
+	 * shadow state before the slot's memory is released further down.
+	 */
+	if (!npages) {
+		r = -ENOMEM;
+		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+		if (!slots)
+			goto out_free;
+		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
+		if (mem->slot >= slots->nmemslots)
+			slots->nmemslots = mem->slot + 1;
+		slots->generation++;
+		slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
+
+		old_memslots = kvm->memslots;
+		rcu_assign_pointer(kvm->memslots, slots);
+		synchronize_srcu_expedited(&kvm->srcu);
+		/* From this point no new shadow pages pointing to a deleted
+		 * memslot will be created.
+		 *
+		 * validation of sp->gfn happens in:
+		 *	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
+		 *	- kvm_is_visible_gfn (mmu_check_roots)
+		 */
+		kvm_arch_flush_shadow(kvm);
+		kfree(old_memslots);
+	}
+
+	r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
+	if (r)
+		goto out_free;
+
+	/* map/unmap the pages in iommu page table */
+	if (npages) {
+		r = kvm_iommu_map_pages(kvm, &new);
+		if (r)
+			goto out_free;
+	} else
+		kvm_iommu_unmap_pages(kvm, &old);
+
+	/* Build the final table: a copy of the current one with 'new' in place. */
+	r = -ENOMEM;
+	slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+	if (!slots)
+		goto out_free;
+	memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
+	if (mem->slot >= slots->nmemslots)
+		slots->nmemslots = mem->slot + 1;
+	slots->generation++;
+
+	/* actual memory is freed via old in kvm_free_physmem_slot below */
+	if (!npages) {
+		new.rmap = NULL;
+		new.dirty_bitmap = NULL;
+		for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
+			new.lpage_info[i] = NULL;
+	}
+
+	/* Publish the new table and wait for readers of the old one. */
+	slots->memslots[mem->slot] = new;
+	old_memslots = kvm->memslots;
+	rcu_assign_pointer(kvm->memslots, slots);
+	synchronize_srcu_expedited(&kvm->srcu);
+
+	kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
+
+	/* Free whatever the old slot owned that the new slot did not keep. */
+	kvm_free_physmem_slot(&old, &new);
+	kfree(old_memslots);
+
+	return 0;
+
+out_free:
+	/* Free only what was newly allocated for 'new', not what 'old' owns. */
+	kvm_free_physmem_slot(&new, &old);
+out:
+	return r;
+
+}
+EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
+
+/*
+ * Locked wrapper: runs __kvm_set_memory_region() with kvm->slots_lock held
+ * and returns its result unchanged.
+ */
+int kvm_set_memory_region(struct kvm *kvm,
+			  struct kvm_userspace_memory_region *mem,
+			  int user_alloc)
+{
+	int ret;
+
+	mutex_lock(&kvm->slots_lock);
+	ret = __kvm_set_memory_region(kvm, mem, user_alloc);
+	mutex_unlock(&kvm->slots_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_set_memory_region);
+
+/*
+ * Ioctl-facing entry point: only slot numbers below KVM_MEMORY_SLOTS are
+ * accepted (-EINVAL otherwise); the rest is delegated to
+ * kvm_set_memory_region().
+ */
+int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
+				   struct
+				   kvm_userspace_memory_region *mem,
+				   int user_alloc)
+{
+	if (mem->slot < KVM_MEMORY_SLOTS)
+		return kvm_set_memory_region(kvm, mem, user_alloc);
+
+	return -EINVAL;
+}
+
+/*
+ * Copy the dirty bitmap of slot log->slot to the user buffer
+ * log->dirty_bitmap.
+ *
+ * *@is_dirty is set to 1 when at least one word of the bitmap is non-zero;
+ * it is left untouched otherwise.
+ *
+ * Returns 0 on success, -EINVAL for a slot number out of range, -ENOENT
+ * when the slot has no dirty bitmap, or -EFAULT when the copy to
+ * userspace fails.
+ */
+int kvm_get_dirty_log(struct kvm *kvm,
+			struct kvm_dirty_log *log, int *is_dirty)
+{
+	struct kvm_memory_slot *memslot;
+	unsigned long bytes;
+	unsigned long seen = 0;
+	int word;
+
+	if (log->slot >= KVM_MEMORY_SLOTS)
+		return -EINVAL;
+
+	memslot = &kvm->memslots->memslots[log->slot];
+	if (!memslot->dirty_bitmap)
+		return -ENOENT;
+
+	bytes = kvm_dirty_bitmap_bytes(memslot);
+
+	/* Stop scanning at the first non-zero word. */
+	word = 0;
+	while (!seen && word < bytes/sizeof(long)) {
+		seen = memslot->dirty_bitmap[word];
+		++word;
+	}
+
+	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, bytes))
+		return -EFAULT;
+
+	if (seen)
+		*is_dirty = 1;
+
+	return 0;
+}
+
+/*
+ * Clear the file-wide largepages_enabled flag, which
+ * __kvm_set_memory_region() consults when initialising lpage_info.
+ */
+void kvm_disable_largepages(void)
+{
+	largepages_enabled = false;
+}
+EXPORT_SYMBOL_GPL(kvm_disable_largepages);
+
+/*
+ * Returns non-zero when @page is one of the sentinel pages bad_page,
+ * hwpoison_page or fault_page.
+ */
+int is_error_page(struct page *page)
+{
+	if (page == bad_page)
+		return 1;
+	if (page == hwpoison_page)
+		return 1;
+
+	return page == fault_page;
+}
+EXPORT_SYMBOL_GPL(is_error_page);
+
+/*
+ * Returns non-zero when @pfn is one of the sentinel frame numbers bad_pfn,
+ * hwpoison_pfn or fault_pfn.
+ */
+int is_error_pfn(pfn_t pfn)
+{
+	if (pfn == bad_pfn)
+		return 1;
+	if (pfn == hwpoison_pfn)
+		return 1;
+
+	return pfn == fault_pfn;
+}
+EXPORT_SYMBOL_GPL(is_error_pfn);
+
+/* Returns non-zero when @pfn is the hwpoison_pfn sentinel. */
+int is_hwpoison_pfn(pfn_t pfn)
+{
+	return pfn == hwpoison_pfn;
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
+
+/* Returns non-zero when @pfn is the fault_pfn sentinel. */
+int is_fault_pfn(pfn_t pfn)
+{
+	return pfn == fault_pfn;
+}
+EXPORT_SYMBOL_GPL(is_fault_pfn);
+
+/*
+ * Sentinel value used in this file to mean "no valid host virtual
+ * address"; it is PAGE_OFFSET.  Tested by kvm_is_error_hva().
+ */
+static inline unsigned long bad_hva(void)
+{
+	return PAGE_OFFSET;
+}
+
+/* Returns non-zero when @addr is the bad_hva() sentinel. */
+int kvm_is_error_hva(unsigned long addr)
+{
+	return addr == bad_hva();
+}
+EXPORT_SYMBOL_GPL(kvm_is_error_hva);
+
+/*
+ * Linear search of the first nmemslots entries of @slots for the slot
+ * whose range [base_gfn, base_gfn + npages) contains @gfn.
+ * Returns that slot, or NULL when there is none.
+ */
+static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
+						gfn_t gfn)
+{
+	int idx;
+
+	for (idx = 0; idx < slots->nmemslots; ++idx) {
+		struct kvm_memory_slot *candidate = &slots->memslots[idx];
+
+		if (gfn < candidate->base_gfn)
+			continue;
+		if (gfn < candidate->base_gfn + candidate->npages)
+			return candidate;
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up the memslot containing @gfn in the VM's current memslot table
+ * (as returned by kvm_memslots()).  Returns NULL when no slot matches.
+ */
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+
+	return __gfn_to_memslot(slots, gfn);
+}
+EXPORT_SYMBOL_GPL(gfn_to_memslot);
+
+/*
+ * Returns 1 when @gfn falls inside one of the first KVM_MEMORY_SLOTS
+ * slots that is not flagged KVM_MEMSLOT_INVALID, 0 otherwise.
+ */
+int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	int idx;
+
+	for (idx = 0; idx < KVM_MEMORY_SLOTS; ++idx) {
+		struct kvm_memory_slot *slot = &slots->memslots[idx];
+
+		if (slot->flags & KVM_MEMSLOT_INVALID)
+			continue;
+		if (gfn < slot->base_gfn)
+			continue;
+		if (gfn < slot->base_gfn + slot->npages)
+			return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
+
+/*
+ * Return the page size the host uses for the mapping backing @gfn.
+ *
+ * Falls back to PAGE_SIZE when @gfn has no valid host virtual address or
+ * when find_vma() returns no vma for it; otherwise the result comes from
+ * vma_kernel_pagesize().  Takes current->mm->mmap_sem for reading.
+ */
+unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
+{
+	unsigned long hva = gfn_to_hva(kvm, gfn);
+	unsigned long page_size = PAGE_SIZE;
+	struct vm_area_struct *vma;
+
+	if (kvm_is_error_hva(hva))
+		return PAGE_SIZE;
+
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, hva);
+	if (vma)
+		page_size = vma_kernel_pagesize(vma);
+	up_read(&current->mm->mmap_sem);
+
+	return page_size;
+}
+
+/*
+ * Translate @gfn to a host virtual address using @slot.
+ *
+ * Returns bad_hva() when @slot is NULL or flagged KVM_MEMSLOT_INVALID.
+ * When @nr_pages is non-NULL it receives the number of pages from @gfn to
+ * the end of the slot.
+ */
+static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+				     gfn_t *nr_pages)
+{
+	if (!slot)
+		return bad_hva();
+	if (slot->flags & KVM_MEMSLOT_INVALID)
+		return bad_hva();
+
+	if (nr_pages)
+		*nr_pages = slot->npages - (gfn - slot->base_gfn);
+
+	return gfn_to_hva_memslot(slot, gfn);
+}
+
+/*
+ * Translate @gfn to a host virtual address through the VM's current
+ * memslots.  Returns bad_hva() when no usable slot covers @gfn.
+ */
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+
+	return gfn_to_hva_many(slot, gfn, NULL);
+}
+EXPORT_SYMBOL_GPL(gfn_to_hva);
+
+/*
+ * Take a reference on fault_page and return the fault_pfn sentinel, so
+ * callers can release the result like any other referenced pfn.
+ */
+static pfn_t get_fault_pfn(void)
+{
+	get_page(fault_page);
+	return fault_pfn;
+}
+
+/*
+ * Pin a single user page at @start via __get_user_pages() with
+ * FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET, adding FOLL_WRITE
+ * when @write is non-zero.  Returns the __get_user_pages() result.
+ */
+int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
+	unsigned long start, int write, struct page **page)
+{
+	int gup_flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
+
+	gup_flags |= write ? FOLL_WRITE : 0;
+
+	return __get_user_pages(tsk, mm, start, 1, gup_flags, page, NULL, NULL);
+}
+
+/*
+ * Probe the page at @addr in current->mm with __get_user_pages() (no page
+ * array, so nothing is pinned) and report whether the lookup failed with
+ * -EHWPOISON.  Returns 1 in that case, 0 otherwise.
+ */
+static inline int check_user_page_hwpoison(unsigned long addr)
+{
+	int gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
+	int ret;
+
+	ret = __get_user_pages(current, current->mm, addr, 1,
+			       gup_flags, NULL, NULL, NULL);
+
+	return ret == -EHWPOISON;
+}
+
+/*
+ * Resolve host virtual address @addr to a page frame number.
+ *
+ * @kvm:         not referenced in this function.
+ * @atomic:      only the non-sleeping __get_user_pages_fast() lookup is
+ *               tried; on failure the fault pfn is returned.
+ * @async:       when non-NULL, a failed fast lookup falls back to
+ *               get_user_page_nowait(); *@async is set to true when the
+ *               page could not be obtained but the vma is writable.
+ *               Mutually exclusive with @atomic.
+ * @write_fault: whether write access is required.
+ * @writable:    when non-NULL, receives whether the returned pfn may be
+ *               mapped writable.  Must be non-NULL when @write_fault is
+ *               false.
+ *
+ * Returns the pfn of the page found, the hwpoison pfn, the fault pfn, or
+ * a pfn computed from a VM_PFNMAP vma.  The error pfns are returned with a
+ * page reference taken.
+ */
+static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
+			bool *async, bool write_fault, bool *writable)
+{
+	struct page *page[1];
+	int npages = 0;
+	pfn_t pfn;
+
+	/* we can do it either atomically or asynchronously, not both */
+	BUG_ON(atomic && async);
+
+	/* A read fault needs somewhere to report writability. */
+	BUG_ON(!write_fault && !writable);
+
+	if (writable)
+		*writable = true;
+
+	/* Fast attempt; it asks for write access (third argument is 1). */
+	if (atomic || async)
+		npages = __get_user_pages_fast(addr, 1, 1, page);
+
+	if (unlikely(npages != 1) && !atomic) {
+		might_sleep();
+
+		if (writable)
+			*writable = write_fault;
+
+		if (async) {
+			down_read(&current->mm->mmap_sem);
+			npages = get_user_page_nowait(current, current->mm,
+						     addr, write_fault, page);
+			up_read(&current->mm->mmap_sem);
+		} else
+			npages = get_user_pages_fast(addr, 1, write_fault,
+						     page);
+
+		/* map read fault as writable if possible */
+		if (unlikely(!write_fault) && npages == 1) {
+			struct page *wpage[1];
+
+			npages = __get_user_pages_fast(addr, 1, 1, wpage);
+			if (npages == 1) {
+				/* Swap the read-pinned page for the writable one. */
+				*writable = true;
+				put_page(page[0]);
+				page[0] = wpage[0];
+			}
+			/* Either way exactly one page is held in page[0]. */
+			npages = 1;
+		}
+	}
+
+	if (unlikely(npages != 1)) {
+		struct vm_area_struct *vma;
+
+		if (atomic)
+			return get_fault_pfn();
+
+		down_read(&current->mm->mmap_sem);
+		if (npages == -EHWPOISON ||
+			(!async && check_user_page_hwpoison(addr))) {
+			up_read(&current->mm->mmap_sem);
+			get_page(hwpoison_page);
+			return page_to_pfn(hwpoison_page);
+		}
+
+		vma = find_vma_intersection(current->mm, addr, addr+1);
+
+		if (vma == NULL)
+			pfn = get_fault_pfn();
+		else if ((vma->vm_flags & VM_PFNMAP)) {
+			/* Derive the pfn from the vma's offset and vm_pgoff. */
+			pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+				vma->vm_pgoff;
+			BUG_ON(!kvm_is_mmio_pfn(pfn));
+		} else {
+			if (async && (vma->vm_flags & VM_WRITE))
+				*async = true;
+			pfn = get_fault_pfn();
+		}
+		up_read(&current->mm->mmap_sem);
+	} else
+		pfn = page_to_pfn(page[0]);
+
+	return pfn;
+}
+
+/*
+ * Non-sleeping variant of hva_to_pfn(): atomic lookup with write access,
+ * no async or writability reporting.
+ */
+pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
+{
+	const bool atomic = true;
+	const bool write_fault = true;
+
+	return hva_to_pfn(kvm, addr, atomic, NULL, write_fault, NULL);
+}
+EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
+
+/*
+ * Common gfn -> pfn helper.
+ *
+ * Clears *@async (when supplied), translates @gfn to a host virtual
+ * address and hands over to hva_to_pfn().  When @gfn has no valid host
+ * address, a reference is taken on bad_page and its pfn is returned.
+ */
+static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
+			  bool write_fault, bool *writable)
+{
+	unsigned long hva;
+
+	if (async)
+		*async = false;
+
+	hva = gfn_to_hva(kvm, gfn);
+	if (!kvm_is_error_hva(hva))
+		return hva_to_pfn(kvm, hva, atomic, async, write_fault,
+				  writable);
+
+	get_page(bad_page);
+	return page_to_pfn(bad_page);
+}
+
+/* Atomic (non-sleeping) gfn -> pfn lookup requesting write access. */
+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
+{
+	const bool atomic = true;
+	const bool write_fault = true;
+
+	return __gfn_to_pfn(kvm, gfn, atomic, NULL, write_fault, NULL);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
+
+/*
+ * Non-atomic gfn -> pfn lookup that passes @async, @write_fault and
+ * @writable straight through to __gfn_to_pfn().
+ */
+pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
+		       bool write_fault, bool *writable)
+{
+	return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
+
+/* Plain (possibly sleeping) gfn -> pfn lookup requesting write access. */
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+{
+	const bool atomic = false;
+	const bool write_fault = true;
+
+	return __gfn_to_pfn(kvm, gfn, atomic, NULL, write_fault, NULL);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn);
+
+/*
+ * Non-atomic, non-async gfn -> pfn lookup that lets the caller choose
+ * @write_fault and learn through @writable whether the pfn may be mapped
+ * writable.
+ */
+pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
+		      bool *writable)
+{
+	return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
+
+/*
+ * gfn -> pfn lookup through an explicitly supplied @slot.  Unlike
+ * __gfn_to_pfn(), the host address is not checked with kvm_is_error_hva()
+ * before being passed to hva_to_pfn().
+ */
+pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
+			 struct kvm_memory_slot *slot, gfn_t gfn)
+{
+	unsigned long hva;
+
+	hva = gfn_to_hva_memslot(slot, gfn);
+
+	return hva_to_pfn(kvm, hva, false, NULL, true, NULL);
+}
+
+/*
+ * Pin up to @nr_pages pages starting at @gfn without sleeping.
+ *
+ * Returns -1 when @gfn has no valid host address, 0 when fewer than
+ * @nr_pages pages remain in the memslot from @gfn onwards, and otherwise
+ * the result of __get_user_pages_fast() (requesting write access).
+ */
+int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
+								  int nr_pages)
+{
+	gfn_t remaining;
+	unsigned long hva;
+
+	hva = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &remaining);
+	if (kvm_is_error_hva(hva))
+		return -1;
+
+	if (remaining < nr_pages)
+		return 0;
+
+	return __get_user_pages_fast(hva, nr_pages, 1, pages);
+}
+EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
+
+/*
+ * Return the struct page backing @gfn.
+ *
+ * When the pfn turns out to be an MMIO pfn there is no page to return:
+ * a warning is emitted and a referenced bad_page is handed back instead.
+ */
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+{
+	pfn_t pfn = gfn_to_pfn(kvm, gfn);
+
+	if (kvm_is_mmio_pfn(pfn)) {
+		WARN_ON(kvm_is_mmio_pfn(pfn));
+
+		get_page(bad_page);
+		return bad_page;
+	}
+
+	return pfn_to_page(pfn);
+}
+
+EXPORT_SYMBOL_GPL(gfn_to_page);
+
+/* Page-based front end for kvm_release_pfn_clean(). */
+void kvm_release_page_clean(struct page *page)
+{
+	pfn_t pfn = page_to_pfn(page);
+
+	kvm_release_pfn_clean(pfn);
+}
+EXPORT_SYMBOL_GPL(kvm_release_page_clean);
+
+/*
+ * Drop the page reference held on @pfn.  MMIO pfns are skipped: no
+ * put_page() is done for them.
+ */
+void kvm_release_pfn_clean(pfn_t pfn)
+{
+	if (kvm_is_mmio_pfn(pfn))
+		return;
+
+	put_page(pfn_to_page(pfn));
+}
+EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
+
+/* Page-based front end for kvm_release_pfn_dirty(). */
+void kvm_release_page_dirty(struct page *page)
+{
+	pfn_t pfn = page_to_pfn(page);
+
+	kvm_release_pfn_dirty(pfn);
+}
+EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
+
+/*
+ * Mark @pfn dirty (kvm_set_pfn_dirty()) and then drop the reference held
+ * on it (kvm_release_pfn_clean()).
+ */
+void kvm_release_pfn_dirty(pfn_t pfn)
+{
+	kvm_set_pfn_dirty(pfn);
+	kvm_release_pfn_clean(pfn);
+}
+EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
+
+/* Page-based front end for kvm_set_pfn_dirty(). */
+void kvm_set_page_dirty(struct page *page)
+{
+	pfn_t pfn = page_to_pfn(page);
+
+	kvm_set_pfn_dirty(pfn);
+}
+EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
+
+/*
+ * Set the dirty flag on the page behind @pfn.  Nothing is done for MMIO
+ * pfns or for pages marked reserved.
+ */
+void kvm_set_pfn_dirty(pfn_t pfn)
+{
+	struct page *page;
+
+	if (kvm_is_mmio_pfn(pfn))
+		return;
+
+	page = pfn_to_page(pfn);
+	if (PageReserved(page))
+		return;
+
+	SetPageDirty(page);
+}
+EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
+
+/* Call mark_page_accessed() on the page behind @pfn; MMIO pfns are skipped. */
+void kvm_set_pfn_accessed(pfn_t pfn)
+{
+	if (kvm_is_mmio_pfn(pfn))
+		return;
+
+	mark_page_accessed(pfn_to_page(pfn));
+}
+EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
+
+/* Take a page reference on @pfn; MMIO pfns are skipped. */
+void kvm_get_pfn(pfn_t pfn)
+{
+	if (kvm_is_mmio_pfn(pfn))
+		return;
+
+	get_page(pfn_to_page(pfn));
+}
+EXPORT_SYMBOL_GPL(kvm_get_pfn);
+
+/*
+ * Size of the next chunk of a @len byte transfer that starts @offset bytes
+ * into a page: the smaller of @len and the bytes left in that page.
+ * Returns 0 once @len is 0, which ends the callers' copy loops.
+ */
+static int next_segment(unsigned long len, int offset)
+{
+	if (len <= PAGE_SIZE - offset)
+		return len;
+
+	return PAGE_SIZE - offset;
+}
+
+/*
+ * Copy @len bytes from guest frame @gfn, starting @offset bytes into the
+ * page, to the kernel buffer @data.
+ *
+ * Returns 0 on success or -EFAULT when @gfn has no valid host address or
+ * the copy from user memory fails.
+ */
+int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+			int len)
+{
+	unsigned long hva = gfn_to_hva(kvm, gfn);
+	int uncopied;
+
+	if (kvm_is_error_hva(hva))
+		return -EFAULT;
+
+	uncopied = __copy_from_user(data, (void __user *)hva + offset, len);
+
+	return uncopied ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_page);
+
+/*
+ * Copy @len bytes from guest physical address @gpa to @data, splitting
+ * the transfer at guest page boundaries.
+ *
+ * Returns 0 on success or the first negative error from
+ * kvm_read_guest_page(); earlier chunks may already have been copied.
+ */
+int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int offset = offset_in_page(gpa);
+	int chunk;
+	int err;
+
+	for (;;) {
+		chunk = next_segment(len, offset);
+		if (chunk == 0)
+			break;
+
+		err = kvm_read_guest_page(kvm, gfn, data, offset, chunk);
+		if (err < 0)
+			return err;
+
+		/* Only the first page can start at a non-zero offset. */
+		offset = 0;
+		len -= chunk;
+		data += chunk;
+		++gfn;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest);
+
+/*
+ * Copy @len bytes from guest physical address @gpa to @data with page
+ * faults disabled, using __copy_from_user_inatomic().
+ *
+ * Only the host address of the first guest page is looked up; the copy
+ * is a single call and is not split at page boundaries.
+ *
+ * Returns 0 on success or -EFAULT.
+ */
+int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+			  unsigned long len)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int offset = offset_in_page(gpa);
+	unsigned long hva;
+	int uncopied;
+
+	hva = gfn_to_hva(kvm, gfn);
+	if (kvm_is_error_hva(hva))
+		return -EFAULT;
+
+	pagefault_disable();
+	uncopied = __copy_from_user_inatomic(data, (void __user *)hva + offset,
+					     len);
+	pagefault_enable();
+
+	return uncopied ? -EFAULT : 0;
+}
+EXPORT_SYMBOL(kvm_read_guest_atomic);
+
+/*
+ * Copy @len bytes from @data into guest frame @gfn, starting @offset
+ * bytes into the page, and mark the frame dirty on success.
+ *
+ * Returns 0 on success or -EFAULT when @gfn has no valid host address or
+ * the copy to user memory fails.
+ */
+int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
+			 int offset, int len)
+{
+	unsigned long hva = gfn_to_hva(kvm, gfn);
+	int uncopied;
+
+	if (kvm_is_error_hva(hva))
+		return -EFAULT;
+
+	uncopied = copy_to_user((void __user *)hva + offset, data, len);
+	if (uncopied)
+		return -EFAULT;
+
+	mark_page_dirty(kvm, gfn);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_guest_page);
+
+/*
+ * Copy @len bytes from @data to guest physical address @gpa, splitting
+ * the transfer at guest page boundaries.
+ *
+ * Returns 0 on success or the first negative error from
+ * kvm_write_guest_page(); earlier chunks may already have been written.
+ */
+int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
+		    unsigned long len)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int offset = offset_in_page(gpa);
+	int chunk;
+	int err;
+
+	for (;;) {
+		chunk = next_segment(len, offset);
+		if (chunk == 0)
+			break;
+
+		err = kvm_write_guest_page(kvm, gfn, data, offset, chunk);
+		if (err < 0)
+			return err;
+
+		/* Only the first page can start at a non-zero offset. */
+		offset = 0;
+		len -= chunk;
+		data += chunk;
+		++gfn;
+	}
+
+	return 0;
+}
+
+/*
+ * Fill @ghc with the translation of guest physical address @gpa: the
+ * memslot generation it was computed against, the memslot, and the host
+ * virtual address (including the in-page offset).
+ *
+ * Returns 0 on success, or -EFAULT when @gpa has no valid host address,
+ * in which case ghc->hva is left holding the bad_hva() sentinel.
+ */
+int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+			      gpa_t gpa)
+{
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int page_off = offset_in_page(gpa);
+
+	ghc->gpa = gpa;
+	ghc->generation = slots->generation;
+	ghc->memslot = __gfn_to_memslot(slots, gfn);
+	ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
+
+	if (kvm_is_error_hva(ghc->hva))
+		return -EFAULT;
+
+	ghc->hva += page_off;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
+
+/*
+ * Write @len bytes from @data to the guest location cached in @ghc.
+ *
+ * The cache is refreshed first when the memslot generation has changed
+ * since it was filled.  On success the target frame is marked dirty in
+ * the cached memslot.
+ *
+ * Returns 0 on success or -EFAULT when the cached address is invalid or
+ * the copy to user memory fails.
+ */
+int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+			   void *data, unsigned long len)
+{
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	int uncopied;
+
+	/* Stale cache: redo the lookup; failure shows up in ghc->hva. */
+	if (ghc->generation != slots->generation)
+		kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
+
+	if (kvm_is_error_hva(ghc->hva))
+		return -EFAULT;
+
+	uncopied = copy_to_user((void __user *)ghc->hva, data, len);
+	if (uncopied)
+		return -EFAULT;
+
+	mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
+
+/*
+ * Zero @len bytes of guest frame @gfn starting at @offset by writing from
+ * empty_zero_page.  Returns the kvm_write_guest_page() result.
+ */
+int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
+{
+	return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
+				    offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
+
+/*
+ * Zero @len bytes of guest memory starting at guest physical address
+ * @gpa, one guest page at a time.
+ *
+ * Returns 0 on success or the first negative error from
+ * kvm_clear_guest_page(); earlier pages may already have been cleared.
+ */
+int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int offset = offset_in_page(gpa);
+	int chunk;
+	int err;
+
+	for (;;) {
+		chunk = next_segment(len, offset);
+		if (chunk == 0)
+			break;
+
+		err = kvm_clear_guest_page(kvm, gfn, offset, chunk);
+		if (err < 0)
+			return err;
+
+		/* Only the first page can start at a non-zero offset. */
+		offset = 0;
+		len -= chunk;
+		++gfn;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_clear_guest);
+
+/*
+ * Set the dirty bit for @gfn in @memslot's dirty bitmap.  Does nothing
+ * when @memslot is NULL or has no dirty bitmap.  @kvm is not used.
+ */
+void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			     gfn_t gfn)
+{
+	unsigned long rel_gfn;
+
+	if (!memslot || !memslot->dirty_bitmap)
+		return;
+
+	/* Bitmap index is the frame number relative to the slot start. */
+	rel_gfn = gfn - memslot->base_gfn;
+	__set_bit_le(rel_gfn, memslot->dirty_bitmap);
+}
+
+/*
+ * Look up the memslot containing @gfn and set its dirty bit there, if
+ * the slot exists and has a dirty bitmap.
+ */
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+
+	mark_page_dirty_in_slot(kvm, slot, gfn);
+}
+
+/*
+ * The vCPU has executed a HLT instruction with in-kernel mode enabled.
+ *
+ * Sleep interruptibly on vcpu->wq until the vCPU becomes runnable (in
+ * which case KVM_REQ_UNHALT is requested), a timer is pending for it, or
+ * the current task has a signal pending.
+ */
+void kvm_vcpu_block(struct kvm_vcpu *vcpu)
+{
+	DEFINE_WAIT(wait);
+
+	while (1) {
+		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+
+		if (kvm_arch_vcpu_runnable(vcpu)) {
+			kvm_make_request(KVM_REQ_UNHALT, vcpu);
+			break;
+		}
+		if (kvm_cpu_has_pending_timer(vcpu) || signal_pending(current))
+			break;
+
+		schedule();
+	}
+
+	finish_wait(&vcpu->wq, &wait);
+}
+
+/* Call cond_resched() when need_resched() says so.  @vcpu is not used. */
+void kvm_resched(struct kvm_vcpu *vcpu)
+{
+	if (need_resched())
+		cond_resched();
+}
+EXPORT_SYMBOL_GPL(kvm_resched);
+
+/*
+ * Called for a vCPU (@me) that is spinning: try to yield the CPU to
+ * another vCPU of the same VM.
+ *
+ * Candidates are scanned in two passes, starting after the vCPU that was
+ * boosted last time, and the scan stops at the first successful
+ * yield_to().  kvm->last_boosted_vcpu is updated on success.
+ */
+void kvm_vcpu_on_spin(struct kvm_vcpu *me)
+{
+	struct kvm *kvm = me->kvm;
+	struct kvm_vcpu *vcpu;
+	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
+	int yielded = 0;
+	int pass;
+	int i;
+
+	/*
+	 * We boost the priority of a VCPU that is runnable but not
+	 * currently running, because it got preempted by something
+	 * else and called schedule in __vcpu_run.  Hopefully that
+	 * VCPU is holding the lock that we need and will release it.
+	 * We approximate round-robin by starting at the last boosted VCPU.
+	 */
+	for (pass = 0; pass < 2 && !yielded; pass++) {
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			struct task_struct *task = NULL;
+			struct pid *pid;
+			/* Pass 0 skips ahead to just past the last boosted vCPU. */
+			if (!pass && i < last_boosted_vcpu) {
+				i = last_boosted_vcpu;
+				continue;
+			} else if (pass && i > last_boosted_vcpu)
+				break;
+			if (vcpu == me)
+				continue;
+			/* Someone is waiting on this vCPU's queue: skip it. */
+			if (waitqueue_active(&vcpu->wq))
+				continue;
+			rcu_read_lock();
+			pid = rcu_dereference(vcpu->pid);
+			/*
+			 * Use the pointer obtained through rcu_dereference();
+			 * re-reading vcpu->pid here could observe a different
+			 * (or NULL) value than the one just checked.
+			 */
+			if (pid)
+				task = get_pid_task(pid, PIDTYPE_PID);
+			rcu_read_unlock();
+			if (!task)
+				continue;
+			/* Skip tasks that have PF_VCPU set. */
+			if (task->flags & PF_VCPU) {
+				put_task_struct(task);
+				continue;
+			}
+			if (yield_to(task, 1)) {
+				put_task_struct(task);
+				kvm->last_boosted_vcpu = i;
+				yielded = 1;
+				break;
+			}
+			put_task_struct(task);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
+
+/*
+ * Page-fault handler for mappings of a vCPU file (see kvm_vcpu_vm_ops).
+ *
+ * Picks the kernel page to map from the faulting page offset: offset 0 is
+ * vcpu->run; KVM_PIO_PAGE_OFFSET (x86 only) is the PIO data page;
+ * KVM_COALESCED_MMIO_PAGE_OFFSET (when defined) is the coalesced MMIO
+ * ring.  Any other offset yields VM_FAULT_SIGBUS.
+ *
+ * On success a reference is taken on the page, it is stored in vmf->page
+ * and 0 is returned.
+ */
+static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
+	struct page *page;
+
+	if (vmf->pgoff == 0)
+		page = virt_to_page(vcpu->run);
+#ifdef CONFIG_X86
+	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
+		page = virt_to_page(vcpu->arch.pio_data);
+#endif
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
+		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
+#endif
+	else
+		return VM_FAULT_SIGBUS;
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+/* VMA operations installed by kvm_vcpu_mmap(); only a fault handler is provided. */
+static const struct vm_operations_struct kvm_vcpu_vm_ops = {
+	.fault = kvm_vcpu_fault,
+};
+
+/*
+ * mmap handler for vCPU files: installs kvm_vcpu_vm_ops on @vma so pages
+ * are supplied by kvm_vcpu_fault().  Always returns 0.  @file is not used.
+ */
+static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_vcpu_vm_ops;
+	return 0;
+}
+
+/*
+ * Release handler for vCPU files: drops the reference on the owning VM
+ * that kvm_vm_ioctl_create_vcpu() took before creating the fd.
+ * Always returns 0.  @inode is not used.
+ */
+static int kvm_vcpu_release(struct inode *inode, struct file *filp)
+{
+	struct kvm_vcpu *vcpu = filp->private_data;
+	struct kvm *owner = vcpu->kvm;
+
+	kvm_put_kvm(owner);
+	return 0;
+}
+
+/*
+ * File operations for the anonymous-inode file created per vCPU by
+ * create_vcpu_fd().  The same handler serves native and compat ioctls.
+ */
+static struct file_operations kvm_vcpu_fops = {
+	.release        = kvm_vcpu_release,
+	.unlocked_ioctl = kvm_vcpu_ioctl,
+	.compat_ioctl   = kvm_vcpu_ioctl,
+	.mmap           = kvm_vcpu_mmap,
+	.llseek		= noop_llseek,
+};
+
+/*
+ * Allocates an inode for the vcpu.
+ *
+ * Creates an anonymous-inode file named "kvm-vcpu" with kvm_vcpu_fops and
+ * @vcpu as private data, opened O_RDWR.  Returns the anon_inode_getfd()
+ * result: the new file descriptor, or a negative value on failure.
+ */
+static int create_vcpu_fd(struct kvm_vcpu *vcpu)
+{
+	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
+}
+
+/*
+ * Creates some virtual cpus.  Good luck creating more than one.
+ *
+ * Creates the vCPU with id @id for @kvm, publishes it in kvm->vcpus[] and
+ * returns a new file descriptor for it.
+ *
+ * Returns the fd (>= 0) on success; otherwise a negative error: the
+ * PTR_ERR() of kvm_arch_vcpu_create(), the kvm_arch_vcpu_setup() error,
+ * -EINVAL when KVM_MAX_VCPUS vCPUs are already online, -EEXIST when @id
+ * is already in use, or the create_vcpu_fd() error.
+ */
+static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
+{
+	int r;
+	struct kvm_vcpu *vcpu, *v;
+
+	vcpu = kvm_arch_vcpu_create(kvm, id);
+	if (IS_ERR(vcpu))
+		return PTR_ERR(vcpu);
+
+	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
+
+	/*
+	 * NOTE(review): this failure path returns without calling
+	 * kvm_arch_vcpu_destroy(); presumably kvm_arch_vcpu_setup() frees
+	 * the vcpu itself on error -- confirm against the arch code.
+	 */
+	r = kvm_arch_vcpu_setup(vcpu);
+	if (r)
+		return r;
+
+	mutex_lock(&kvm->lock);
+	if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
+		r = -EINVAL;
+		goto vcpu_destroy;
+	}
+
+	/* r doubles as the loop index here; it is overwritten below. */
+	kvm_for_each_vcpu(r, v, kvm)
+		if (v->vcpu_id == id) {
+			r = -EEXIST;
+			goto vcpu_destroy;
+		}
+
+	BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
+
+	/* Now it's all set up, let userspace reach it */
+	/* The fd holds a VM reference, dropped in kvm_vcpu_release(). */
+	kvm_get_kvm(kvm);
+	r = create_vcpu_fd(vcpu);
+	if (r < 0) {
+		kvm_put_kvm(kvm);
+		goto vcpu_destroy;
+	}
+
+	/* Store the pointer before bumping the count that publishes it. */
+	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
+	smp_wmb();
+	atomic_inc(&kvm->online_vcpus);
+
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
+	if (kvm->bsp_vcpu_id == id)
+		kvm->bsp_vcpu = vcpu;
+#endif
+	mutex_unlock(&kvm->lock);
+	return r;
+
+vcpu_destroy:
+	mutex_unlock(&kvm->lock);
+	kvm_arch_vcpu_destroy(vcpu);
+	return r;
+}
+
+/*
+ * Install or clear the vCPU's signal mask.
+ *
+ * A NULL @sigset deactivates the mask.  Otherwise SIGKILL and SIGSTOP are
+ * removed from *@sigset (the caller's set is modified), the result is
+ * copied into the vCPU and the mask is activated.  Always returns 0.
+ */
+static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
+{
+	if (!sigset) {
+		vcpu->sigset_active = 0;
+		return 0;
+	}
+
+	sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
+	vcpu->sigset_active = 1;
+	vcpu->sigset = *sigset;
+
+	return 0;
+}
+
+/*
+ * ioctl entry point for a vcpu file (filp->private_data is the vcpu).
+ *
+ * Fails with -EIO when the calling task's mm is not the mm recorded in
+ * vcpu->kvm.  The commands listed in the switch are handled here between
+ * vcpu_load() and vcpu_put(); every other command is forwarded to
+ * kvm_arch_vcpu_ioctl().
+ *
+ * Returns 0 or a command-specific value on success and a negative errno
+ * on failure (-ENOMEM for a failed allocation, -EFAULT for a failed user
+ * copy, -EINVAL for a bad argument, or the arch helper's error).
+ */
+static long kvm_vcpu_ioctl(struct file *filp,
+			   unsigned int ioctl, unsigned long arg)
+{
+	struct kvm_vcpu *vcpu = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	int r;
+	/* Both stay NULL unless a case allocates them; freed at "out". */
+	struct kvm_fpu *fpu = NULL;
+	struct kvm_sregs *kvm_sregs = NULL;
+
+	if (vcpu->kvm->mm != current->mm)
+		return -EIO;
+
+#if defined(CONFIG_S390) || defined(CONFIG_PPC)
+	/*
+	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
+	 * so vcpu_load() would break it.
+	 */
+	if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
+		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
+#endif
+
+
+	vcpu_load(vcpu);
+	switch (ioctl) {
+	case KVM_RUN:
+		/* KVM_RUN takes no argument; a non-zero arg is rejected. */
+		r = -EINVAL;
+		if (arg)
+			goto out;
+		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
+		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
+		break;
+	case KVM_GET_REGS: {
+		struct kvm_regs *kvm_regs;
+
+		r = -ENOMEM;
+		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
+		if (!kvm_regs)
+			goto out;
+		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
+		if (r)
+			goto out_free1;
+		r = -EFAULT;
+		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
+			goto out_free1;
+		r = 0;
+out_free1:
+		/* kvm_regs is local to this case, so it is freed here. */
+		kfree(kvm_regs);
+		break;
+	}
+	case KVM_SET_REGS: {
+		struct kvm_regs *kvm_regs;
+
+		r = -ENOMEM;
+		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
+		if (!kvm_regs)
+			goto out;
+		r = -EFAULT;
+		if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
+			goto out_free2;
+		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
+		if (r)
+			goto out_free2;
+		r = 0;
+out_free2:
+		/* kvm_regs is local to this case, so it is freed here. */
+		kfree(kvm_regs);
+		break;
+	}
+	case KVM_GET_SREGS: {
+		/* Zeroed buffer; released by the kfree() at "out". */
+		kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!kvm_sregs)
+			goto out;
+		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
+		if (r)
+			goto out;
+		r = -EFAULT;
+		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_SET_SREGS: {
+		/* Fully overwritten by copy_from_user() before use. */
+		kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!kvm_sregs)
+			goto out;
+		r = -EFAULT;
+		if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
+			goto out;
+		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_GET_MP_STATE: {
+		struct kvm_mp_state mp_state;
+
+		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
+		if (r)
+			goto out;
+		r = -EFAULT;
+		if (copy_to_user(argp, &mp_state, sizeof mp_state))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_SET_MP_STATE: {
+		struct kvm_mp_state mp_state;
+
+		r = -EFAULT;
+		if (copy_from_user(&mp_state, argp, sizeof mp_state))
+			goto out;
+		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_TRANSLATE: {
+		struct kvm_translation tr;
+
+		/* In/out argument: read from user, filled in, written back. */
+		r = -EFAULT;
+		if (copy_from_user(&tr, argp, sizeof tr))
+			goto out;
+		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+		if (r)
+			goto out;
+		r = -EFAULT;
+		if (copy_to_user(argp, &tr, sizeof tr))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_SET_GUEST_DEBUG: {
+		struct kvm_guest_debug dbg;
+
+		r = -EFAULT;
+		if (copy_from_user(&dbg, argp, sizeof dbg))
+			goto out;
+		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_SET_SIGNAL_MASK: {
+		struct kvm_signal_mask __user *sigmask_arg = argp;
+		struct kvm_signal_mask kvm_sigmask;
+		sigset_t sigset, *p;
+
+		/* A NULL argp leaves p NULL, which clears the stored mask. */
+		p = NULL;
+		if (argp) {
+			r = -EFAULT;
+			if (copy_from_user(&kvm_sigmask, argp,
+					   sizeof kvm_sigmask))
+				goto out;
+			/* The user-supplied length must match sigset_t exactly. */
+			r = -EINVAL;
+			if (kvm_sigmask.len != sizeof sigset)
+				goto out;
+			r = -EFAULT;
+			if (copy_from_user(&sigset, sigmask_arg->sigset,
+					   sizeof sigset))
+				goto out;
+			p = &sigset;
+		}
+		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
+		break;
+	}
+	case KVM_GET_FPU: {
+		/* Zeroed buffer; released by the kfree() at "out". */
+		fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!fpu)
+			goto out;
+		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
+		if (r)
+			goto out;
+		r = -EFAULT;
+		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_SET_FPU: {
+		/* Fully overwritten by copy_from_user() before use. */
+		fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!fpu)
+			goto out;
+		r = -EFAULT;
+		if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
+			goto out;
+		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
+	default:
+		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
+	}
+out:
+	/* Common exit for every path taken after vcpu_load(). */
+	vcpu_put(vcpu);
+	kfree(fpu);
+	kfree(kvm_sregs);
+	return r;
+}
+
+/*
+ * ioctl entry point for a VM file (filp->private_data is the struct kvm).
+ *
+ * Fails with -EIO when the calling task's mm is not kvm->mm.  Commands
+ * listed in the switch are handled here; anything else goes to
+ * kvm_arch_vm_ioctl() and, if that answers -ENOTTY, to
+ * kvm_vm_ioctl_assigned_device().
+ *
+ * Returns the handler's result: a negative errno on failure (-EFAULT
+ * when the argument cannot be copied from userspace).
+ */
+static long kvm_vm_ioctl(struct file *filp,
+			   unsigned int ioctl, unsigned long arg)
+{
+	struct kvm *kvm = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	int r;
+
+	if (kvm->mm != current->mm)
+		return -EIO;
+	switch (ioctl) {
+	case KVM_CREATE_VCPU:
+		/* arg is the requested vcpu id; success yields the new fd. */
+		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
+		if (r < 0)
+			goto out;
+		break;
+	case KVM_SET_USER_MEMORY_REGION: {
+		struct kvm_userspace_memory_region kvm_userspace_mem;
+
+		r = -EFAULT;
+		if (copy_from_user(&kvm_userspace_mem, argp,
+						sizeof kvm_userspace_mem))
+			goto out;
+
+		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
+		if (r)
+			goto out;
+		break;
+	}
+	case KVM_GET_DIRTY_LOG: {
+		struct kvm_dirty_log log;
+
+		r = -EFAULT;
+		if (copy_from_user(&log, argp, sizeof log))
+			goto out;
+		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
+		if (r)
+			goto out;
+		break;
+	}
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+	case KVM_REGISTER_COALESCED_MMIO: {
+		struct kvm_coalesced_mmio_zone zone;
+		r = -EFAULT;
+		if (copy_from_user(&zone, argp, sizeof zone))
+			goto out;
+		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_UNREGISTER_COALESCED_MMIO: {
+		struct kvm_coalesced_mmio_zone zone;
+		r = -EFAULT;
+		if (copy_from_user(&zone, argp, sizeof zone))
+			goto out;
+		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
+#endif
+	case KVM_IRQFD: {
+		struct kvm_irqfd data;
+
+		r = -EFAULT;
+		if (copy_from_user(&data, argp, sizeof data))
+			goto out;
+		r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
+		break;
+	}
+	case KVM_IOEVENTFD: {
+		struct kvm_ioeventfd data;
+
+		r = -EFAULT;
+		if (copy_from_user(&data, argp, sizeof data))
+			goto out;
+		r = kvm_ioeventfd(kvm, &data);
+		break;
+	}
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
+	case KVM_SET_BOOT_CPU_ID:
+		/* Only accepted while no vcpu is online; -EBUSY afterwards. */
+		r = 0;
+		mutex_lock(&kvm->lock);
+		if (atomic_read(&kvm->online_vcpus) != 0)
+			r = -EBUSY;
+		else
+			kvm->bsp_vcpu_id = arg;
+		mutex_unlock(&kvm->lock);
+		break;
+#endif
+	default:
+		/* Arch code first; -ENOTTY hands over to device assignment. */
+		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
+		if (r == -ENOTTY)
+			r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
+	}
+out:
+	return r;
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Layout of the KVM_GET_DIRTY_LOG argument as read by
+ * kvm_vm_compat_ioctl(): the bitmap pointer is a compat_uptr_t that
+ * shares its storage with a 64-bit padding word.
+ */
+struct compat_kvm_dirty_log {
+	__u32 slot;
+	__u32 padding1;
+	union {
+		compat_uptr_t dirty_bitmap; /* one bit per page */
+		__u64 padding2;
+	};
+};
+
+/*
+ * Compat ioctl entry point for a VM file.
+ *
+ * Returns -EIO when the caller's mm is not kvm->mm.  KVM_GET_DIRTY_LOG is
+ * converted from struct compat_kvm_dirty_log to struct kvm_dirty_log and
+ * passed to kvm_vm_ioctl_get_dirty_log(); every other command is handed
+ * to kvm_vm_ioctl() unchanged.
+ */
+static long kvm_vm_compat_ioctl(struct file *filp,
+			   unsigned int ioctl, unsigned long arg)
+{
+	struct kvm *kvm = filp->private_data;
+	struct compat_kvm_dirty_log compat_log;
+	struct kvm_dirty_log log;
+
+	if (kvm->mm != current->mm)
+		return -EIO;
+
+	/* Only the dirty-log request needs translating. */
+	if (ioctl != KVM_GET_DIRTY_LOG)
+		return kvm_vm_ioctl(filp, ioctl, arg);
+
+	if (copy_from_user(&compat_log, (void __user *)arg,
+			   sizeof(compat_log)))
+		return -EFAULT;
+
+	log.slot	 = compat_log.slot;
+	log.padding1	 = compat_log.padding1;
+	log.padding2	 = compat_log.padding2;
+	log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
+
+	return kvm_vm_ioctl_get_dirty_log(kvm, &log);
+}
+#endif
+
+/*
+ * Fault handler for mappings of the VM file.
+ *
+ * The faulting page offset is treated as a guest frame number, turned
+ * into a host virtual address with gfn_to_hva(), and exactly one page is
+ * requested for it through get_user_pages().  Returns VM_FAULT_SIGBUS if
+ * the address is an error hva or the page cannot be obtained; otherwise
+ * stores the page in vmf->page and returns 0.
+ */
+static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvm *kvm = vma->vm_file->private_data;
+	struct page *pinned;
+	unsigned long hva;
+
+	hva = gfn_to_hva(kvm, vmf->pgoff);
+	if (kvm_is_error_hva(hva))
+		return VM_FAULT_SIGBUS;
+
+	if (unlikely(get_user_pages(current, current->mm, hva, 1, 1, 0,
+				    &pinned, NULL) != 1))
+		return VM_FAULT_SIGBUS;
+
+	vmf->page = pinned;
+	return 0;
+}
+
+/* vm_operations installed by kvm_vm_mmap(); only a fault handler is set. */
+static const struct vm_operations_struct kvm_vm_vm_ops = {
+	.fault = kvm_vm_fault,
+};
+
+/*
+ * mmap handler for the VM file: points the vma at kvm_vm_vm_ops, whose
+ * fault handler is kvm_vm_fault(), and reports success.
+ */
+static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_vm_vm_ops;
+	return 0;
+}
+
+/*
+ * File operations for the per-VM file created in
+ * kvm_dev_ioctl_create_vm().  Not const because kvm_init() fills in
+ * .owner.
+ */
+static struct file_operations kvm_vm_fops = {
+	.release        = kvm_vm_release,
+	.unlocked_ioctl = kvm_vm_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = kvm_vm_compat_ioctl,
+#endif
+	.mmap           = kvm_vm_mmap,
+	.llseek		= noop_llseek,
+};
+
+/*
+ * Create a new VM and return a file descriptor ("kvm-vm", O_RDWR) for it.
+ *
+ * Returns the fd on success or a negative errno.  If coalesced-MMIO setup
+ * or fd creation fails, the VM obtained from kvm_create_vm() is released
+ * again with kvm_put_kvm().
+ */
+static int kvm_dev_ioctl_create_vm(void)
+{
+	int r;
+	struct kvm *kvm;
+
+	kvm = kvm_create_vm();
+	if (IS_ERR(kvm))
+		return PTR_ERR(kvm);
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+	r = kvm_coalesced_mmio_init(kvm);
+	if (r < 0) {
+		kvm_put_kvm(kvm);
+		return r;
+	}
+#endif
+	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
+	/* No fd means nothing will release the VM later, so drop it now. */
+	if (r < 0)
+		kvm_put_kvm(kvm);
+
+	return r;
+}
+
+/*
+ * Answer KVM_CHECK_EXTENSION for the capabilities handled here: 1 for the
+ * plain capabilities listed below, KVM_MAX_IRQ_ROUTES for
+ * KVM_CAP_IRQ_ROUTING when CONFIG_HAVE_KVM_IRQCHIP is set.  Any other
+ * capability is passed on to kvm_dev_ioctl_check_extension().
+ */
+static long kvm_dev_ioctl_check_extension_generic(long arg)
+{
+	switch (arg) {
+	case KVM_CAP_USER_MEMORY:
+	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
+	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
+#ifdef CONFIG_KVM_APIC_ARCHITECTURE
+	case KVM_CAP_SET_BOOT_CPU_ID:
+#endif
+	case KVM_CAP_INTERNAL_ERROR_DATA:
+		return 1;
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+	case KVM_CAP_IRQ_ROUTING:
+		return KVM_MAX_IRQ_ROUTES;
+#endif
+	default:
+		break;
+	}
+	return kvm_dev_ioctl_check_extension(arg);
+}
+
+/*
+ * ioctl entry point for the kvm misc device.
+ *
+ * KVM_GET_API_VERSION, KVM_CREATE_VM and KVM_GET_VCPU_MMAP_SIZE take no
+ * argument and answer -EINVAL when one is supplied.  The KVM_TRACE_*
+ * commands always answer -EOPNOTSUPP.  Commands not listed here are
+ * forwarded to kvm_arch_dev_ioctl().
+ */
+static long kvm_dev_ioctl(struct file *filp,
+			  unsigned int ioctl, unsigned long arg)
+{
+	long mmap_size;
+
+	switch (ioctl) {
+	case KVM_GET_API_VERSION:
+		if (arg)
+			return -EINVAL;
+		return KVM_API_VERSION;
+	case KVM_CREATE_VM:
+		if (arg)
+			return -EINVAL;
+		return kvm_dev_ioctl_create_vm();
+	case KVM_CHECK_EXTENSION:
+		return kvm_dev_ioctl_check_extension_generic(arg);
+	case KVM_GET_VCPU_MMAP_SIZE:
+		if (arg)
+			return -EINVAL;
+		mmap_size = PAGE_SIZE;     /* struct kvm_run */
+#ifdef CONFIG_X86
+		mmap_size += PAGE_SIZE;    /* pio data page */
+#endif
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+		mmap_size += PAGE_SIZE;    /* coalesced mmio ring page */
+#endif
+		return mmap_size;
+	case KVM_TRACE_ENABLE:
+	case KVM_TRACE_PAUSE:
+	case KVM_TRACE_DISABLE:
+		return -EOPNOTSUPP;
+	default:
+		return kvm_arch_dev_ioctl(filp, ioctl, arg);
+	}
+}
+
+/*
+ * File operations for the kvm misc device (kvm_dev).  The same handler
+ * serves native and compat ioctls.  Not const because kvm_init() fills
+ * in .owner.
+ */
+static struct file_operations kvm_chardev_ops = {
+	.unlocked_ioctl = kvm_dev_ioctl,
+	.compat_ioctl   = kvm_dev_ioctl,
+	.llseek		= noop_llseek,
+};
+
+/* The "kvm" misc device; registered in kvm_init(), removed in kvm_exit(). */
+static struct miscdevice kvm_dev = {
+	.minor	= KVM_MINOR,
+	.name	= "kvm",
+	.fops	= &kvm_chardev_ops,
+};
+
+/*
+ * Enable hardware virtualization on the cpu this runs on, unless that cpu
+ * is already marked in cpus_hardware_enabled.  The cpu is marked before
+ * kvm_arch_hardware_enable() is called; on failure the mark is removed,
+ * hardware_enable_failed is incremented and a message is logged.
+ * @junk is unused.
+ */
+static void hardware_enable_nolock(void *junk)
+{
+	const int this_cpu = raw_smp_processor_id();
+
+	if (cpumask_test_cpu(this_cpu, cpus_hardware_enabled))
+		return;
+
+	cpumask_set_cpu(this_cpu, cpus_hardware_enabled);
+
+	if (!kvm_arch_hardware_enable(NULL))
+		return;
+
+	/* Enabling failed: undo the mark and record the failure. */
+	cpumask_clear_cpu(this_cpu, cpus_hardware_enabled);
+	atomic_inc(&hardware_enable_failed);
+	printk(KERN_INFO "kvm: enabling virtualization on "
+			 "CPU%d failed\n", this_cpu);
+}
+
+/* Locked wrapper: runs hardware_enable_nolock() with kvm_lock held. */
+static void hardware_enable(void *junk)
+{
+	raw_spin_lock(&kvm_lock);
+	hardware_enable_nolock(junk);
+	raw_spin_unlock(&kvm_lock);
+}
+
+/*
+ * Disable hardware virtualization on the cpu this runs on, but only if
+ * that cpu is currently marked in cpus_hardware_enabled.  The mark is
+ * cleared before kvm_arch_hardware_disable() is called.  @junk is unused.
+ */
+static void hardware_disable_nolock(void *junk)
+{
+	const int this_cpu = raw_smp_processor_id();
+
+	if (cpumask_test_cpu(this_cpu, cpus_hardware_enabled)) {
+		cpumask_clear_cpu(this_cpu, cpus_hardware_enabled);
+		kvm_arch_hardware_disable(NULL);
+	}
+}
+
+/* Locked wrapper: runs hardware_disable_nolock() with kvm_lock held. */
+static void hardware_disable(void *junk)
+{
+	raw_spin_lock(&kvm_lock);
+	hardware_disable_nolock(junk);
+	raw_spin_unlock(&kvm_lock);
+}
+
+/*
+ * Drop one reference from kvm_usage_count and, when it reaches zero, run
+ * hardware_disable_nolock() on every cpu.  Calling this with a count of
+ * zero is a BUG.
+ */
+static void hardware_disable_all_nolock(void)
+{
+	BUG_ON(kvm_usage_count == 0);
+
+	if (--kvm_usage_count == 0)
+		on_each_cpu(hardware_disable_nolock, NULL, 1);
+}
+
+/* Locked wrapper: runs hardware_disable_all_nolock() with kvm_lock held. */
+static void hardware_disable_all(void)
+{
+	raw_spin_lock(&kvm_lock);
+	hardware_disable_all_nolock();
+	raw_spin_unlock(&kvm_lock);
+}
+
+/*
+ * Take one reference on kvm_usage_count under kvm_lock.  The first
+ * reference runs hardware_enable_nolock() on every cpu; if any cpu
+ * recorded a failure, the reference is dropped again through
+ * hardware_disable_all_nolock() and -EBUSY is returned.  Returns 0
+ * otherwise.
+ */
+static int hardware_enable_all(void)
+{
+	int ret = 0;
+
+	raw_spin_lock(&kvm_lock);
+
+	/* Only the first user has to switch the hardware on. */
+	if (++kvm_usage_count != 1)
+		goto unlock;
+
+	atomic_set(&hardware_enable_failed, 0);
+	on_each_cpu(hardware_enable_nolock, NULL, 1);
+
+	if (atomic_read(&hardware_enable_failed)) {
+		hardware_disable_all_nolock();
+		ret = -EBUSY;
+	}
+
+unlock:
+	raw_spin_unlock(&kvm_lock);
+
+	return ret;
+}
+
+/*
+ * cpu hotplug notifier callback.  Does nothing while kvm_usage_count is
+ * zero.  Otherwise, ignoring the CPU_TASKS_FROZEN bit, CPU_DYING disables
+ * and CPU_STARTING enables hardware virtualization through the locked
+ * helpers.  @v carries the cpu number, used only for the log message.
+ * Always returns NOTIFY_OK.
+ */
+static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
+			   void *v)
+{
+	int cpu = (long)v;
+	unsigned long action = val & ~CPU_TASKS_FROZEN;
+
+	if (!kvm_usage_count)
+		return NOTIFY_OK;
+
+	if (action == CPU_DYING) {
+		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
+		       cpu);
+		hardware_disable(NULL);
+	} else if (action == CPU_STARTING) {
+		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
+		       cpu);
+		hardware_enable(NULL);
+	}
+
+	return NOTIFY_OK;
+}
+
+
+/*
+ * Exported handler that unconditionally hits BUG() so that a trace is
+ * produced for the fault that led here.
+ */
+asmlinkage void kvm_spurious_fault(void)
+{
+	/* Fault while not rebooting.  We want the trace. */
+	BUG();
+}
+EXPORT_SYMBOL_GPL(kvm_spurious_fault);
+
+/*
+ * Reboot notifier callback: logs a message, sets kvm_rebooting and runs
+ * hardware_disable_nolock() on every cpu.  Always returns NOTIFY_OK.
+ */
+static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
+		      void *v)
+{
+	/*
+	 * Some (well, at least mine) BIOSes hang on reboot if
+	 * in vmx root mode.
+	 *
+	 * Intel TXT also requires VMX to be off on all cpus at system shutdown.
+	 */
+	printk(KERN_INFO "kvm: exiting hardware virtualization\n");
+	kvm_rebooting = true;
+	on_each_cpu(hardware_disable_nolock, NULL, 1);
+	return NOTIFY_OK;
+}
+
+/* Reboot notifier routing to kvm_reboot(); registered in kvm_init(). */
+static struct notifier_block kvm_reboot_notifier = {
+	.notifier_call = kvm_reboot,
+	.priority = 0,
+};
+
+/*
+ * Run kvm_iodevice_destructor() on each of the first dev_count devices of
+ * @bus, then free the bus itself.
+ */
+static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
+{
+	int idx;
+
+	for (idx = 0; idx < bus->dev_count; idx++)
+		kvm_iodevice_destructor(bus->devs[idx]);
+
+	kfree(bus);
+}
+
+/*
+ * kvm_io_bus_write - offer a write to each device on a bus in turn.
+ *
+ * Returns 0 as soon as one device's kvm_iodevice_write() returns 0, or
+ * -EOPNOTSUPP when none does.
+ *
+ * NOTE(review): the previous comment said "called under kvm->slots_lock",
+ * but the bus pointer is read with srcu_dereference() on kvm->srcu, so
+ * callers presumably need to be inside an SRCU read-side section -- confirm.
+ */
+int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+		     int len, const void *val)
+{
+	struct kvm_io_bus *bus;
+	int idx;
+
+	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+	for (idx = 0; idx < bus->dev_count; idx++) {
+		if (kvm_iodevice_write(bus->devs[idx], addr, len, val) == 0)
+			return 0;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+/*
+ * kvm_io_bus_read - offer a read to each device on a bus in turn.
+ *
+ * Returns 0 as soon as one device's kvm_iodevice_read() returns 0, or
+ * -EOPNOTSUPP when none does.
+ *
+ * NOTE(review): the previous comment said "called under kvm->slots_lock",
+ * but the bus pointer is read with srcu_dereference() on kvm->srcu, so
+ * callers presumably need to be inside an SRCU read-side section -- confirm.
+ */
+int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+		    int len, void *val)
+{
+	struct kvm_io_bus *bus;
+	int idx;
+
+	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+	for (idx = 0; idx < bus->dev_count; idx++) {
+		if (kvm_iodevice_read(bus->devs[idx], addr, len, val) == 0)
+			return 0;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+/*
+ * Add @dev to the bus selected by @bus_idx.  Caller must hold slots_lock.
+ *
+ * The bus is never modified in place: a copy with @dev appended is
+ * published with rcu_assign_pointer(), and the old bus is freed only
+ * after synchronize_srcu_expedited() has returned.
+ *
+ * Returns 0 on success, -ENOSPC when the bus already holds NR_IOBUS_DEVS
+ * devices, or -ENOMEM when the copy cannot be allocated.
+ */
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+			    struct kvm_io_device *dev)
+{
+	struct kvm_io_bus *old_bus = kvm->buses[bus_idx];
+	struct kvm_io_bus *new_bus;
+
+	if (old_bus->dev_count >= NR_IOBUS_DEVS)
+		return -ENOSPC;
+
+	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
+	if (!new_bus)
+		return -ENOMEM;
+
+	memcpy(new_bus, old_bus, sizeof(struct kvm_io_bus));
+	new_bus->devs[new_bus->dev_count] = dev;
+	new_bus->dev_count++;
+
+	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+	synchronize_srcu_expedited(&kvm->srcu);
+	kfree(old_bus);
+
+	return 0;
+}
+
+/*
+ * Remove @dev from the bus selected by @bus_idx.  Caller must hold
+ * slots_lock.
+ *
+ * Works on a copy of the bus: the slot holding @dev is overwritten with
+ * the last device and dev_count is reduced by one.  The copy is published
+ * with rcu_assign_pointer(), and the old bus is freed only after
+ * synchronize_srcu_expedited() has returned.
+ *
+ * Returns 0 on success, -ENOMEM when the copy cannot be allocated, or
+ * -ENOENT when @dev is not on the bus (the bus is then left untouched).
+ */
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+			      struct kvm_io_device *dev)
+{
+	struct kvm_io_bus *old_bus, *new_bus;
+	int idx;
+
+	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
+	if (!new_bus)
+		return -ENOMEM;
+
+	old_bus = kvm->buses[bus_idx];
+	memcpy(new_bus, old_bus, sizeof(struct kvm_io_bus));
+
+	/* Locate the slot that holds @dev, if any. */
+	for (idx = 0; idx < new_bus->dev_count; idx++) {
+		if (new_bus->devs[idx] == dev)
+			break;
+	}
+
+	if (idx == new_bus->dev_count) {
+		kfree(new_bus);
+		return -ENOENT;
+	}
+
+	/* Fill the hole with the last device and shrink the bus by one. */
+	new_bus->dev_count--;
+	new_bus->devs[idx] = new_bus->devs[new_bus->dev_count];
+
+	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+	synchronize_srcu_expedited(&kvm->srcu);
+	kfree(old_bus);
+	return 0;
+}
+
+/* cpu notifier routing to kvm_cpu_hotplug(); registered in kvm_init(). */
+static struct notifier_block kvm_cpu_notifier = {
+	.notifier_call = kvm_cpu_hotplug,
+};
+
+/*
+ * Getter for a per-VM statistic: with kvm_lock held, sums the u32 found
+ * at byte offset @_offset inside every struct kvm on vm_list and stores
+ * the total in *@val.  Always returns 0.
+ */
+static int vm_stat_get(void *_offset, u64 *val)
+{
+	const unsigned offset = (long)_offset;
+	struct kvm *kvm;
+	u64 total = 0;
+
+	raw_spin_lock(&kvm_lock);
+	list_for_each_entry(kvm, &vm_list, vm_list) {
+		total += *(u32 *)((void *)kvm + offset);
+	}
+	*val = total;
+	raw_spin_unlock(&kvm_lock);
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
+
+/*
+ * Getter for a per-vcpu statistic: with kvm_lock held, sums the u32 found
+ * at byte offset @_offset inside every vcpu of every VM on vm_list and
+ * stores the total in *@val.  Always returns 0.
+ */
+static int vcpu_stat_get(void *_offset, u64 *val)
+{
+	const unsigned offset = (long)_offset;
+	struct kvm_vcpu *vcpu;
+	struct kvm *kvm;
+	u64 total = 0;
+	int idx;
+
+	raw_spin_lock(&kvm_lock);
+	list_for_each_entry(kvm, &vm_list, vm_list) {
+		kvm_for_each_vcpu(idx, vcpu, kvm)
+			total += *(u32 *)((void *)vcpu + offset);
+	}
+	*val = total;
+	raw_spin_unlock(&kvm_lock);
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
+
+/*
+ * debugfs file operations indexed by a stats entry's kind; used as
+ * stat_fops[p->kind] in kvm_init_debug().
+ */
+static const struct file_operations *stat_fops[] = {
+	[KVM_STAT_VCPU] = &vcpu_stat_fops,
+	[KVM_STAT_VM]   = &vm_stat_fops,
+};
+
+/*
+ * Create the "kvm" debugfs directory and one read-only (0444) file per
+ * debugfs_entries element, stopping at the entry whose name is NULL.
+ * Each file receives the entry's offset as its data pointer and the fops
+ * matching its kind.  Results of the debugfs calls are not checked.
+ */
+static void kvm_init_debug(void)
+{
+	struct kvm_stats_debugfs_item *p;
+
+	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
+	for (p = debugfs_entries; p->name; ++p)
+		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
+						(void *)(long)p->offset,
+						stat_fops[p->kind]);
+}
+
+/*
+ * Remove the dentry recorded in each debugfs_entries element, then the
+ * "kvm" debugfs directory itself.
+ */
+static void kvm_exit_debug(void)
+{
+	struct kvm_stats_debugfs_item *p;
+
+	for (p = debugfs_entries; p->name; ++p)
+		debugfs_remove(p->dentry);
+	debugfs_remove(kvm_debugfs_dir);
+}
+
+/*
+ * syscore suspend hook: when kvm is in use (kvm_usage_count != 0),
+ * disable hardware virtualization on the current cpu without taking
+ * kvm_lock.  Always returns 0.
+ */
+static int kvm_suspend(void)
+{
+	if (!kvm_usage_count)
+		return 0;
+
+	hardware_disable_nolock(NULL);
+	return 0;
+}
+
+/*
+ * syscore resume hook: when kvm is in use (kvm_usage_count != 0), warn if
+ * kvm_lock is found locked and re-enable hardware virtualization on the
+ * current cpu without taking the lock.
+ */
+static void kvm_resume(void)
+{
+	if (!kvm_usage_count)
+		return;
+
+	WARN_ON(raw_spin_is_locked(&kvm_lock));
+	hardware_enable_nolock(NULL);
+}
+
+/* Suspend/resume hooks; registered with register_syscore_ops() in kvm_init(). */
+static struct syscore_ops kvm_syscore_ops = {
+	.suspend = kvm_suspend,
+	.resume = kvm_resume,
+};
+
+/*
+ * Zeroed page allocated in kvm_init() and freed in kvm_exit(), together
+ * with its pfn (page_to_pfn(bad_page)).
+ */
+struct page *bad_page;
+pfn_t bad_pfn;
+
+/* Map a preempt_notifier back to the struct kvm_vcpu that embeds it. */
+static inline
+struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
+{
+	return container_of(pn, struct kvm_vcpu, preempt_notifier);
+}
+
+/*
+ * sched_in hook of kvm_preempt_ops: forwards to kvm_arch_vcpu_load() for
+ * the vcpu that embeds @pn, passing @cpu along.
+ */
+static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
+{
+	kvm_arch_vcpu_load(preempt_notifier_to_vcpu(pn), cpu);
+}
+
+/*
+ * sched_out hook of kvm_preempt_ops: forwards to kvm_arch_vcpu_put() for
+ * the vcpu that embeds @pn.  @next is unused.
+ */
+static void kvm_sched_out(struct preempt_notifier *pn,
+			  struct task_struct *next)
+{
+	kvm_arch_vcpu_put(preempt_notifier_to_vcpu(pn));
+}
+
+/*
+ * Module-level initialisation.
+ *
+ * @opaque:     passed through to kvm_arch_init().
+ * @vcpu_size:  object size for the "kvm_vcpu" kmem cache.
+ * @vcpu_align: alignment for that cache; 0 selects
+ *              __alignof__(struct kvm_vcpu).
+ * @module:     stored as .owner of the chardev, VM and vcpu file_operations.
+ *
+ * Returns 0 on success.  On failure returns a negative errno after
+ * unwinding the steps already completed through the out_* labels at the
+ * bottom, which run in reverse order of setup.
+ */
+int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
+		  struct module *module)
+{
+	int r;
+	int cpu;
+
+	r = kvm_arch_init(opaque);
+	if (r)
+		goto out_fail;
+
+	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+	if (bad_page == NULL) {
+		r = -ENOMEM;
+		goto out;
+	}
+
+	bad_pfn = page_to_pfn(bad_page);
+
+	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+	if (hwpoison_page == NULL) {
+		r = -ENOMEM;
+		goto out_free_0;
+	}
+
+	hwpoison_pfn = page_to_pfn(hwpoison_page);
+
+	fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+	if (fault_page == NULL) {
+		r = -ENOMEM;
+		goto out_free_0;
+	}
+
+	fault_pfn = page_to_pfn(fault_page);
+
+	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
+		r = -ENOMEM;
+		goto out_free_0;
+	}
+
+	r = kvm_arch_hardware_setup();
+	if (r < 0)
+		goto out_free_0a;
+
+	/*
+	 * Run the compatibility check on every online cpu; the callback is
+	 * handed &r, and a negative r afterwards aborts initialisation.
+	 */
+	for_each_online_cpu(cpu) {
+		smp_call_function_single(cpu,
+				kvm_arch_check_processor_compat,
+				&r, 1);
+		if (r < 0)
+			goto out_free_1;
+	}
+
+	r = register_cpu_notifier(&kvm_cpu_notifier);
+	if (r)
+		goto out_free_2;
+	register_reboot_notifier(&kvm_reboot_notifier);
+
+	/* A kmem cache lets us meet the alignment requirements of fx_save. */
+	if (!vcpu_align)
+		vcpu_align = __alignof__(struct kvm_vcpu);
+	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
+					   0, NULL);
+	if (!kvm_vcpu_cache) {
+		r = -ENOMEM;
+		goto out_free_3;
+	}
+
+	r = kvm_async_pf_init();
+	if (r)
+		goto out_free;
+
+	kvm_chardev_ops.owner = module;
+	kvm_vm_fops.owner = module;
+	kvm_vcpu_fops.owner = module;
+
+	r = misc_register(&kvm_dev);
+	if (r) {
+		printk(KERN_ERR "kvm: misc device register failed\n");
+		goto out_unreg;
+	}
+
+	register_syscore_ops(&kvm_syscore_ops);
+
+	kvm_preempt_ops.sched_in = kvm_sched_in;
+	kvm_preempt_ops.sched_out = kvm_sched_out;
+
+	kvm_init_debug();
+
+	return 0;
+
+	/* Error unwinding: each label undoes one step and falls through. */
+out_unreg:
+	kvm_async_pf_deinit();
+out_free:
+	kmem_cache_destroy(kvm_vcpu_cache);
+out_free_3:
+	unregister_reboot_notifier(&kvm_reboot_notifier);
+	unregister_cpu_notifier(&kvm_cpu_notifier);
+out_free_2:
+out_free_1:
+	kvm_arch_hardware_unsetup();
+out_free_0a:
+	free_cpumask_var(cpus_hardware_enabled);
+out_free_0:
+	/*
+	 * Reached from several allocation failures, so fault_page and
+	 * hwpoison_page may not have been allocated yet; bad_page always
+	 * has been by this point.
+	 */
+	if (fault_page)
+		__free_page(fault_page);
+	if (hwpoison_page)
+		__free_page(hwpoison_page);
+	__free_page(bad_page);
+out:
+	kvm_arch_exit();
+out_fail:
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvm_init);
+
+/*
+ * Tear down what kvm_init() set up: the debugfs entries, the misc device,
+ * the vcpu cache, async-pf state, the syscore/reboot/cpu notifiers,
+ * hardware virtualization on every cpu, arch state, the cpumask and the
+ * three pages allocated during initialisation.
+ */
+void kvm_exit(void)
+{
+	kvm_exit_debug();
+	misc_deregister(&kvm_dev);
+	kmem_cache_destroy(kvm_vcpu_cache);
+	kvm_async_pf_deinit();
+	unregister_syscore_ops(&kvm_syscore_ops);
+	unregister_reboot_notifier(&kvm_reboot_notifier);
+	unregister_cpu_notifier(&kvm_cpu_notifier);
+	on_each_cpu(hardware_disable_nolock, NULL, 1);
+	kvm_arch_hardware_unsetup();
+	kvm_arch_exit();
+	free_cpumask_var(cpus_hardware_enabled);
+	/*
+	 * kvm_init() allocates fault_page alongside hwpoison_page and
+	 * bad_page; release all three so no page is leaked on unload.
+	 */
+	__free_page(fault_page);
+	__free_page(hwpoison_page);
+	__free_page(bad_page);
+}
+EXPORT_SYMBOL_GPL(kvm_exit);
author	root <root@artemis.panaceas.org>	2015-12-25 04:40:36 +0000
committer	root <root@artemis.panaceas.org>	2015-12-25 04:40:36 +0000
commit	849369d6c66d3054688672f97d31fceb8e8230fb (patch)
tree	6135abc790ca67dedbe07c39806591e70eda81ce /virt
download	linux-3.0.35-kobo-849369d6c66d3054688672f97d31fceb8e8230fb.tar.gz linux-3.0.35-kobo-849369d6c66d3054688672f97d31fceb8e8230fb.tar.bz2 linux-3.0.35-kobo-849369d6c66d3054688672f97d31fceb8e8230fb.zip