From 849369d6c66d3054688672f97d31fceb8e8230fb Mon Sep 17 00:00:00 2001 From: root Date: Fri, 25 Dec 2015 04:40:36 +0000 Subject: initial_commit --- virt/kvm/Kconfig | 20 + virt/kvm/assigned-dev.c | 797 ++++++++++++++ virt/kvm/async_pf.c | 216 ++++ virt/kvm/async_pf.h | 36 + virt/kvm/coalesced_mmio.c | 191 ++++ virt/kvm/coalesced_mmio.h | 39 + virt/kvm/eventfd.c | 656 +++++++++++ virt/kvm/ioapic.c | 441 ++++++++ virt/kvm/ioapic.h | 83 ++ virt/kvm/iodev.h | 70 ++ virt/kvm/iommu.c | 320 ++++++ virt/kvm/irq_comm.c | 474 ++++++++ virt/kvm/kvm_main.c | 2635 +++++++++++++++++++++++++++++++++++++++++++++ 13 files changed, 5978 insertions(+) create mode 100644 virt/kvm/Kconfig create mode 100644 virt/kvm/assigned-dev.c create mode 100644 virt/kvm/async_pf.c create mode 100644 virt/kvm/async_pf.h create mode 100644 virt/kvm/coalesced_mmio.c create mode 100644 virt/kvm/coalesced_mmio.h create mode 100644 virt/kvm/eventfd.c create mode 100644 virt/kvm/ioapic.c create mode 100644 virt/kvm/ioapic.h create mode 100644 virt/kvm/iodev.h create mode 100644 virt/kvm/iommu.c create mode 100644 virt/kvm/irq_comm.c create mode 100644 virt/kvm/kvm_main.c (limited to 'virt') diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig new file mode 100644 index 00000000..f63ccb0a --- /dev/null +++ b/virt/kvm/Kconfig @@ -0,0 +1,20 @@ +# KVM common configuration items and defaults + +config HAVE_KVM + bool + +config HAVE_KVM_IRQCHIP + bool + +config HAVE_KVM_EVENTFD + bool + select EVENTFD + +config KVM_APIC_ARCHITECTURE + bool + +config KVM_MMIO + bool + +config KVM_ASYNC_PF + bool diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c new file mode 100644 index 00000000..6cc4b97e --- /dev/null +++ b/virt/kvm/assigned-dev.c @@ -0,0 +1,797 @@ +/* + * Kernel-based Virtual Machine - device assignment support + * + * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "irq.h" + +static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, + int assigned_dev_id) +{ + struct list_head *ptr; + struct kvm_assigned_dev_kernel *match; + + list_for_each(ptr, head) { + match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); + if (match->assigned_dev_id == assigned_dev_id) + return match; + } + return NULL; +} + +static int find_index_from_host_irq(struct kvm_assigned_dev_kernel + *assigned_dev, int irq) +{ + int i, index; + struct msix_entry *host_msix_entries; + + host_msix_entries = assigned_dev->host_msix_entries; + + index = -1; + for (i = 0; i < assigned_dev->entries_nr; i++) + if (irq == host_msix_entries[i].vector) { + index = i; + break; + } + if (index < 0) { + printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); + return 0; + } + + return index; +} + +static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + u32 vector; + int index; + + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { + spin_lock(&assigned_dev->intx_lock); + disable_irq_nosync(irq); + assigned_dev->host_irq_disabled = true; + spin_unlock(&assigned_dev->intx_lock); + } + + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { + index = find_index_from_host_irq(assigned_dev, irq); + if (index >= 0) { + vector = assigned_dev-> + guest_msix_entries[index].vector; + kvm_set_irq(assigned_dev->kvm, + assigned_dev->irq_source_id, vector, 1); + } + } else + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + assigned_dev->guest_irq, 1); + + return IRQ_HANDLED; +} + +/* Ack the irq line for an assigned device */ +static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) +{ + struct kvm_assigned_dev_kernel *dev; + + if (kian->gsi == -1) + return; + + dev = container_of(kian, struct kvm_assigned_dev_kernel, + ack_notifier); + + kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); + + /* The guest irq may be shared so this ack may be + * from another device. + */ + spin_lock(&dev->intx_lock); + if (dev->host_irq_disabled) { + enable_irq(dev->host_irq); + dev->host_irq_disabled = false; + } + spin_unlock(&dev->intx_lock); +} + +static void deassign_guest_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); + assigned_dev->ack_notifier.gsi = -1; + + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + assigned_dev->guest_irq, 0); + + if (assigned_dev->irq_source_id != -1) + kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); + assigned_dev->irq_source_id = -1; + assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); +} + +/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ +static void deassign_host_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + /* + * We disable irq here to prevent further events. + * + * Notice this maybe result in nested disable if the interrupt type is + * INTx, but it's OK for we are going to free it. + * + * If this function is a part of VM destroy, please ensure that till + * now, the kvm state is still legal for probably we also have to wait + * on a currently running IRQ handler. + */ + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { + int i; + for (i = 0; i < assigned_dev->entries_nr; i++) + disable_irq(assigned_dev->host_msix_entries[i].vector); + + for (i = 0; i < assigned_dev->entries_nr; i++) + free_irq(assigned_dev->host_msix_entries[i].vector, + (void *)assigned_dev); + + assigned_dev->entries_nr = 0; + kfree(assigned_dev->host_msix_entries); + kfree(assigned_dev->guest_msix_entries); + pci_disable_msix(assigned_dev->dev); + } else { + /* Deal with MSI and INTx */ + disable_irq(assigned_dev->host_irq); + + free_irq(assigned_dev->host_irq, (void *)assigned_dev); + + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) + pci_disable_msi(assigned_dev->dev); + } + + assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); +} + +static int kvm_deassign_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev, + unsigned long irq_requested_type) +{ + unsigned long guest_irq_type, host_irq_type; + + if (!irqchip_in_kernel(kvm)) + return -EINVAL; + /* no irq assignment to deassign */ + if (!assigned_dev->irq_requested_type) + return -ENXIO; + + host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; + guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; + + if (host_irq_type) + deassign_host_irq(kvm, assigned_dev); + if (guest_irq_type) + deassign_guest_irq(kvm, assigned_dev); + + return 0; +} + +static void kvm_free_assigned_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); +} + +static void kvm_free_assigned_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel + *assigned_dev) +{ + kvm_free_assigned_irq(kvm, assigned_dev); + + pci_reset_function(assigned_dev->dev); + if (pci_load_and_free_saved_state(assigned_dev->dev, + &assigned_dev->pci_saved_state)) + printk(KERN_INFO "%s: Couldn't reload %s saved state\n", + __func__, dev_name(&assigned_dev->dev->dev)); + else + pci_restore_state(assigned_dev->dev); + + pci_release_regions(assigned_dev->dev); + pci_disable_device(assigned_dev->dev); + pci_dev_put(assigned_dev->dev); + + list_del(&assigned_dev->list); + kfree(assigned_dev); +} + +void kvm_free_all_assigned_devices(struct kvm *kvm) +{ + struct list_head *ptr, *ptr2; + struct kvm_assigned_dev_kernel *assigned_dev; + + list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { + assigned_dev = list_entry(ptr, + struct kvm_assigned_dev_kernel, + list); + + kvm_free_assigned_device(kvm, assigned_dev); + } +} + +static int assigned_device_enable_host_intx(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + dev->host_irq = dev->dev->irq; + /* Even though this is PCI, we don't want to use shared + * interrupts. Sharing host devices with guest-assigned devices + * on the same interrupt line is not a happy situation: there + * are going to be long delays in accepting, acking, etc. + */ + if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, + IRQF_ONESHOT, dev->irq_name, (void *)dev)) + return -EIO; + return 0; +} + +#ifdef __KVM_HAVE_MSI +static int assigned_device_enable_host_msi(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + int r; + + if (!dev->dev->msi_enabled) { + r = pci_enable_msi(dev->dev); + if (r) + return r; + } + + dev->host_irq = dev->dev->irq; + if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, + 0, dev->irq_name, (void *)dev)) { + pci_disable_msi(dev->dev); + return -EIO; + } + + return 0; +} +#endif + +#ifdef __KVM_HAVE_MSIX +static int assigned_device_enable_host_msix(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + int i, r = -EINVAL; + + /* host_msix_entries and guest_msix_entries should have been + * initialized */ + if (dev->entries_nr == 0) + return r; + + r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr); + if (r) + return r; + + for (i = 0; i < dev->entries_nr; i++) { + r = request_threaded_irq(dev->host_msix_entries[i].vector, + NULL, kvm_assigned_dev_thread, + 0, dev->irq_name, (void *)dev); + if (r) + goto err; + } + + return 0; +err: + for (i -= 1; i >= 0; i--) + free_irq(dev->host_msix_entries[i].vector, (void *)dev); + pci_disable_msix(dev->dev); + return r; +} + +#endif + +static int assigned_device_enable_guest_intx(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = irq->guest_irq; + return 0; +} + +#ifdef __KVM_HAVE_MSI +static int assigned_device_enable_guest_msi(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = -1; + dev->host_irq_disabled = false; + return 0; +} +#endif + +#ifdef __KVM_HAVE_MSIX +static int assigned_device_enable_guest_msix(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = -1; + dev->host_irq_disabled = false; + return 0; +} +#endif + +static int assign_host_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + __u32 host_irq_type) +{ + int r = -EEXIST; + + if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) + return r; + + snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", + pci_name(dev->dev)); + + switch (host_irq_type) { + case KVM_DEV_IRQ_HOST_INTX: + r = assigned_device_enable_host_intx(kvm, dev); + break; +#ifdef __KVM_HAVE_MSI + case KVM_DEV_IRQ_HOST_MSI: + r = assigned_device_enable_host_msi(kvm, dev); + break; +#endif +#ifdef __KVM_HAVE_MSIX + case KVM_DEV_IRQ_HOST_MSIX: + r = assigned_device_enable_host_msix(kvm, dev); + break; +#endif + default: + r = -EINVAL; + } + + if (!r) + dev->irq_requested_type |= host_irq_type; + + return r; +} + +static int assign_guest_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq, + unsigned long guest_irq_type) +{ + int id; + int r = -EEXIST; + + if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) + return r; + + id = kvm_request_irq_source_id(kvm); + if (id < 0) + return id; + + dev->irq_source_id = id; + + switch (guest_irq_type) { + case KVM_DEV_IRQ_GUEST_INTX: + r = assigned_device_enable_guest_intx(kvm, dev, irq); + break; +#ifdef __KVM_HAVE_MSI + case KVM_DEV_IRQ_GUEST_MSI: + r = assigned_device_enable_guest_msi(kvm, dev, irq); + break; +#endif +#ifdef __KVM_HAVE_MSIX + case KVM_DEV_IRQ_GUEST_MSIX: + r = assigned_device_enable_guest_msix(kvm, dev, irq); + break; +#endif + default: + r = -EINVAL; + } + + if (!r) { + dev->irq_requested_type |= guest_irq_type; + kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); + } else + kvm_free_irq_source_id(kvm, dev->irq_source_id); + + return r; +} + +/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ +static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, + struct kvm_assigned_irq *assigned_irq) +{ + int r = -EINVAL; + struct kvm_assigned_dev_kernel *match; + unsigned long host_irq_type, guest_irq_type; + + if (!irqchip_in_kernel(kvm)) + return r; + + mutex_lock(&kvm->lock); + r = -ENODEV; + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_irq->assigned_dev_id); + if (!match) + goto out; + + host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); + guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); + + r = -EINVAL; + /* can only assign one type at a time */ + if (hweight_long(host_irq_type) > 1) + goto out; + if (hweight_long(guest_irq_type) > 1) + goto out; + if (host_irq_type == 0 && guest_irq_type == 0) + goto out; + + r = 0; + if (host_irq_type) + r = assign_host_irq(kvm, match, host_irq_type); + if (r) + goto out; + + if (guest_irq_type) + r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); +out: + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, + struct kvm_assigned_irq + *assigned_irq) +{ + int r = -ENODEV; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_irq->assigned_dev_id); + if (!match) + goto out; + + r = kvm_deassign_irq(kvm, match, assigned_irq->flags); +out: + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_assign_device(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0, idx; + struct kvm_assigned_dev_kernel *match; + struct pci_dev *dev; + + mutex_lock(&kvm->lock); + idx = srcu_read_lock(&kvm->srcu); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (match) { + /* device already assigned */ + r = -EEXIST; + goto out; + } + + match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); + if (match == NULL) { + printk(KERN_INFO "%s: Couldn't allocate memory\n", + __func__); + r = -ENOMEM; + goto out; + } + dev = pci_get_domain_bus_and_slot(assigned_dev->segnr, + assigned_dev->busnr, + assigned_dev->devfn); + if (!dev) { + printk(KERN_INFO "%s: host device not found\n", __func__); + r = -EINVAL; + goto out_free; + } + if (pci_enable_device(dev)) { + printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); + r = -EBUSY; + goto out_put; + } + r = pci_request_regions(dev, "kvm_assigned_device"); + if (r) { + printk(KERN_INFO "%s: Could not get access to device regions\n", + __func__); + goto out_disable; + } + + pci_reset_function(dev); + pci_save_state(dev); + match->pci_saved_state = pci_store_saved_state(dev); + if (!match->pci_saved_state) + printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", + __func__, dev_name(&dev->dev)); + match->assigned_dev_id = assigned_dev->assigned_dev_id; + match->host_segnr = assigned_dev->segnr; + match->host_busnr = assigned_dev->busnr; + match->host_devfn = assigned_dev->devfn; + match->flags = assigned_dev->flags; + match->dev = dev; + spin_lock_init(&match->intx_lock); + match->irq_source_id = -1; + match->kvm = kvm; + match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; + + list_add(&match->list, &kvm->arch.assigned_dev_head); + + if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { + if (!kvm->arch.iommu_domain) { + r = kvm_iommu_map_guest(kvm); + if (r) + goto out_list_del; + } + r = kvm_assign_device(kvm, match); + if (r) + goto out_list_del; + } + +out: + srcu_read_unlock(&kvm->srcu, idx); + mutex_unlock(&kvm->lock); + return r; +out_list_del: + if (pci_load_and_free_saved_state(dev, &match->pci_saved_state)) + printk(KERN_INFO "%s: Couldn't reload %s saved state\n", + __func__, dev_name(&dev->dev)); + list_del(&match->list); + pci_release_regions(dev); +out_disable: + pci_disable_device(dev); +out_put: + pci_dev_put(dev); +out_free: + kfree(match); + srcu_read_unlock(&kvm->srcu, idx); + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (!match) { + printk(KERN_INFO "%s: device hasn't been assigned before, " + "so cannot be deassigned\n", __func__); + r = -EINVAL; + goto out; + } + + if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) + kvm_deassign_device(kvm, match); + + kvm_free_assigned_device(kvm, match); + +out: + mutex_unlock(&kvm->lock); + return r; +} + + +#ifdef __KVM_HAVE_MSIX +static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, + struct kvm_assigned_msix_nr *entry_nr) +{ + int r = 0; + struct kvm_assigned_dev_kernel *adev; + + mutex_lock(&kvm->lock); + + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + entry_nr->assigned_dev_id); + if (!adev) { + r = -EINVAL; + goto msix_nr_out; + } + + if (adev->entries_nr == 0) { + adev->entries_nr = entry_nr->entry_nr; + if (adev->entries_nr == 0 || + adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) { + r = -EINVAL; + goto msix_nr_out; + } + + adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * + entry_nr->entry_nr, + GFP_KERNEL); + if (!adev->host_msix_entries) { + r = -ENOMEM; + goto msix_nr_out; + } + adev->guest_msix_entries = + kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, + GFP_KERNEL); + if (!adev->guest_msix_entries) { + kfree(adev->host_msix_entries); + r = -ENOMEM; + goto msix_nr_out; + } + } else /* Not allowed set MSI-X number twice */ + r = -EINVAL; +msix_nr_out: + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, + struct kvm_assigned_msix_entry *entry) +{ + int r = 0, i; + struct kvm_assigned_dev_kernel *adev; + + mutex_lock(&kvm->lock); + + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + entry->assigned_dev_id); + + if (!adev) { + r = -EINVAL; + goto msix_entry_out; + } + + for (i = 0; i < adev->entries_nr; i++) + if (adev->guest_msix_entries[i].vector == 0 || + adev->guest_msix_entries[i].entry == entry->entry) { + adev->guest_msix_entries[i].entry = entry->entry; + adev->guest_msix_entries[i].vector = entry->gsi; + adev->host_msix_entries[i].entry = entry->entry; + break; + } + if (i == adev->entries_nr) { + r = -ENOSPC; + goto msix_entry_out; + } + +msix_entry_out: + mutex_unlock(&kvm->lock); + + return r; +} +#endif + +long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + int r; + + switch (ioctl) { + case KVM_ASSIGN_PCI_DEVICE: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); + if (r) + goto out; + break; + } + case KVM_ASSIGN_IRQ: { + r = -EOPNOTSUPP; + break; + } + case KVM_ASSIGN_DEV_IRQ: { + struct kvm_assigned_irq assigned_irq; + + r = -EFAULT; + if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) + goto out; + r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); + if (r) + goto out; + break; + } + case KVM_DEASSIGN_DEV_IRQ: { + struct kvm_assigned_irq assigned_irq; + + r = -EFAULT; + if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) + goto out; + r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); + if (r) + goto out; + break; + } + case KVM_DEASSIGN_PCI_DEVICE: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); + if (r) + goto out; + break; + } +#ifdef KVM_CAP_IRQ_ROUTING + case KVM_SET_GSI_ROUTING: { + struct kvm_irq_routing routing; + struct kvm_irq_routing __user *urouting; + struct kvm_irq_routing_entry *entries; + + r = -EFAULT; + if (copy_from_user(&routing, argp, sizeof(routing))) + goto out; + r = -EINVAL; + if (routing.nr >= KVM_MAX_IRQ_ROUTES) + goto out; + if (routing.flags) + goto out; + r = -ENOMEM; + entries = vmalloc(routing.nr * sizeof(*entries)); + if (!entries) + goto out; + r = -EFAULT; + urouting = argp; + if (copy_from_user(entries, urouting->entries, + routing.nr * sizeof(*entries))) + goto out_free_irq_routing; + r = kvm_set_irq_routing(kvm, entries, routing.nr, + routing.flags); + out_free_irq_routing: + vfree(entries); + break; + } +#endif /* KVM_CAP_IRQ_ROUTING */ +#ifdef __KVM_HAVE_MSIX + case KVM_ASSIGN_SET_MSIX_NR: { + struct kvm_assigned_msix_nr entry_nr; + r = -EFAULT; + if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) + goto out; + r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); + if (r) + goto out; + break; + } + case KVM_ASSIGN_SET_MSIX_ENTRY: { + struct kvm_assigned_msix_entry entry; + r = -EFAULT; + if (copy_from_user(&entry, argp, sizeof entry)) + goto out; + r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); + if (r) + goto out; + break; + } +#endif + default: + r = -ENOTTY; + break; + } +out: + return r; +} + diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c new file mode 100644 index 00000000..74268b4c --- /dev/null +++ b/virt/kvm/async_pf.c @@ -0,0 +1,216 @@ +/* + * kvm asynchronous fault support + * + * Copyright 2010 Red Hat, Inc. + * + * Author: + * Gleb Natapov + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include +#include +#include +#include + +#include "async_pf.h" +#include + +static struct kmem_cache *async_pf_cache; + +int kvm_async_pf_init(void) +{ + async_pf_cache = KMEM_CACHE(kvm_async_pf, 0); + + if (!async_pf_cache) + return -ENOMEM; + + return 0; +} + +void kvm_async_pf_deinit(void) +{ + if (async_pf_cache) + kmem_cache_destroy(async_pf_cache); + async_pf_cache = NULL; +} + +void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu) +{ + INIT_LIST_HEAD(&vcpu->async_pf.done); + INIT_LIST_HEAD(&vcpu->async_pf.queue); + spin_lock_init(&vcpu->async_pf.lock); +} + +static void async_pf_execute(struct work_struct *work) +{ + struct page *page = NULL; + struct kvm_async_pf *apf = + container_of(work, struct kvm_async_pf, work); + struct mm_struct *mm = apf->mm; + struct kvm_vcpu *vcpu = apf->vcpu; + unsigned long addr = apf->addr; + gva_t gva = apf->gva; + + might_sleep(); + + use_mm(mm); + down_read(&mm->mmap_sem); + get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL); + up_read(&mm->mmap_sem); + unuse_mm(mm); + + spin_lock(&vcpu->async_pf.lock); + list_add_tail(&apf->link, &vcpu->async_pf.done); + apf->page = page; + apf->done = true; + spin_unlock(&vcpu->async_pf.lock); + + /* + * apf may be freed by kvm_check_async_pf_completion() after + * this point + */ + + trace_kvm_async_pf_completed(addr, page, gva); + + if (waitqueue_active(&vcpu->wq)) + wake_up_interruptible(&vcpu->wq); + + mmdrop(mm); + kvm_put_kvm(vcpu->kvm); +} + +void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) +{ + /* cancel outstanding work queue item */ + while (!list_empty(&vcpu->async_pf.queue)) { + struct kvm_async_pf *work = + list_entry(vcpu->async_pf.queue.next, + typeof(*work), queue); + cancel_work_sync(&work->work); + list_del(&work->queue); + if (!work->done) /* work was canceled */ + kmem_cache_free(async_pf_cache, work); + } + + spin_lock(&vcpu->async_pf.lock); + while (!list_empty(&vcpu->async_pf.done)) { + struct kvm_async_pf *work = + list_entry(vcpu->async_pf.done.next, + typeof(*work), link); + list_del(&work->link); + if (work->page) + put_page(work->page); + kmem_cache_free(async_pf_cache, work); + } + spin_unlock(&vcpu->async_pf.lock); + + vcpu->async_pf.queued = 0; +} + +void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) +{ + struct kvm_async_pf *work; + + while (!list_empty_careful(&vcpu->async_pf.done) && + kvm_arch_can_inject_async_page_present(vcpu)) { + spin_lock(&vcpu->async_pf.lock); + work = list_first_entry(&vcpu->async_pf.done, typeof(*work), + link); + list_del(&work->link); + spin_unlock(&vcpu->async_pf.lock); + + if (work->page) + kvm_arch_async_page_ready(vcpu, work); + kvm_arch_async_page_present(vcpu, work); + + list_del(&work->queue); + vcpu->async_pf.queued--; + if (work->page) + put_page(work->page); + kmem_cache_free(async_pf_cache, work); + } +} + +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, + struct kvm_arch_async_pf *arch) +{ + struct kvm_async_pf *work; + + if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU) + return 0; + + /* setup delayed work */ + + /* + * do alloc nowait since if we are going to sleep anyway we + * may as well sleep faulting in page + */ + work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT); + if (!work) + return 0; + + work->page = NULL; + work->done = false; + work->vcpu = vcpu; + work->gva = gva; + work->addr = gfn_to_hva(vcpu->kvm, gfn); + work->arch = *arch; + work->mm = current->mm; + atomic_inc(&work->mm->mm_count); + kvm_get_kvm(work->vcpu->kvm); + + /* this can't really happen otherwise gfn_to_pfn_async + would succeed */ + if (unlikely(kvm_is_error_hva(work->addr))) + goto retry_sync; + + INIT_WORK(&work->work, async_pf_execute); + if (!schedule_work(&work->work)) + goto retry_sync; + + list_add_tail(&work->queue, &vcpu->async_pf.queue); + vcpu->async_pf.queued++; + kvm_arch_async_page_not_present(vcpu, work); + return 1; +retry_sync: + kvm_put_kvm(work->vcpu->kvm); + mmdrop(work->mm); + kmem_cache_free(async_pf_cache, work); + return 0; +} + +int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) +{ + struct kvm_async_pf *work; + + if (!list_empty_careful(&vcpu->async_pf.done)) + return 0; + + work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC); + if (!work) + return -ENOMEM; + + work->page = bad_page; + get_page(bad_page); + INIT_LIST_HEAD(&work->queue); /* for list_del to work */ + + spin_lock(&vcpu->async_pf.lock); + list_add_tail(&work->link, &vcpu->async_pf.done); + spin_unlock(&vcpu->async_pf.lock); + + vcpu->async_pf.queued++; + return 0; +} diff --git a/virt/kvm/async_pf.h b/virt/kvm/async_pf.h new file mode 100644 index 00000000..e7ef6447 --- /dev/null +++ b/virt/kvm/async_pf.h @@ -0,0 +1,36 @@ +/* + * kvm asynchronous fault support + * + * Copyright 2010 Red Hat, Inc. + * + * Author: + * Gleb Natapov + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __KVM_ASYNC_PF_H__ +#define __KVM_ASYNC_PF_H__ + +#ifdef CONFIG_KVM_ASYNC_PF +int kvm_async_pf_init(void); +void kvm_async_pf_deinit(void); +void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu); +#else +#define kvm_async_pf_init() (0) +#define kvm_async_pf_deinit() do{}while(0) +#define kvm_async_pf_vcpu_init(C) do{}while(0) +#endif + +#endif diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c new file mode 100644 index 00000000..fc848756 --- /dev/null +++ b/virt/kvm/coalesced_mmio.c @@ -0,0 +1,191 @@ +/* + * KVM coalesced MMIO + * + * Copyright (c) 2008 Bull S.A.S. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. + * + * Author: Laurent Vivier + * + */ + +#include "iodev.h" + +#include +#include +#include + +#include "coalesced_mmio.h" + +static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_coalesced_mmio_dev, dev); +} + +static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, + gpa_t addr, int len) +{ + struct kvm_coalesced_mmio_zone *zone; + struct kvm_coalesced_mmio_ring *ring; + unsigned avail; + int i; + + /* Are we able to batch it ? */ + + /* last is the first free entry + * check if we don't meet the first used entry + * there is always one unused entry in the buffer + */ + ring = dev->kvm->coalesced_mmio_ring; + avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX; + if (avail < KVM_MAX_VCPUS) { + /* full */ + return 0; + } + + /* is it in a batchable area ? */ + + for (i = 0; i < dev->nb_zones; i++) { + zone = &dev->zone[i]; + + /* (addr,len) is fully included in + * (zone->addr, zone->size) + */ + + if (zone->addr <= addr && + addr + len <= zone->addr + zone->size) + return 1; + } + return 0; +} + +static int coalesced_mmio_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *val) +{ + struct kvm_coalesced_mmio_dev *dev = to_mmio(this); + struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; + if (!coalesced_mmio_in_range(dev, addr, len)) + return -EOPNOTSUPP; + + spin_lock(&dev->lock); + + /* copy data in first free entry of the ring */ + + ring->coalesced_mmio[ring->last].phys_addr = addr; + ring->coalesced_mmio[ring->last].len = len; + memcpy(ring->coalesced_mmio[ring->last].data, val, len); + smp_wmb(); + ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; + spin_unlock(&dev->lock); + return 0; +} + +static void coalesced_mmio_destructor(struct kvm_io_device *this) +{ + struct kvm_coalesced_mmio_dev *dev = to_mmio(this); + + kfree(dev); +} + +static const struct kvm_io_device_ops coalesced_mmio_ops = { + .write = coalesced_mmio_write, + .destructor = coalesced_mmio_destructor, +}; + +int kvm_coalesced_mmio_init(struct kvm *kvm) +{ + struct kvm_coalesced_mmio_dev *dev; + struct page *page; + int ret; + + ret = -ENOMEM; + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto out_err; + kvm->coalesced_mmio_ring = page_address(page); + + ret = -ENOMEM; + dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); + if (!dev) + goto out_free_page; + spin_lock_init(&dev->lock); + kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); + dev->kvm = kvm; + kvm->coalesced_mmio_dev = dev; + + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev); + mutex_unlock(&kvm->slots_lock); + if (ret < 0) + goto out_free_dev; + + return ret; + +out_free_dev: + kvm->coalesced_mmio_dev = NULL; + kfree(dev); +out_free_page: + kvm->coalesced_mmio_ring = NULL; + __free_page(page); +out_err: + return ret; +} + +void kvm_coalesced_mmio_free(struct kvm *kvm) +{ + if (kvm->coalesced_mmio_ring) + free_page((unsigned long)kvm->coalesced_mmio_ring); +} + +int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, + struct kvm_coalesced_mmio_zone *zone) +{ + struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev; + + if (dev == NULL) + return -ENXIO; + + mutex_lock(&kvm->slots_lock); + if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) { + mutex_unlock(&kvm->slots_lock); + return -ENOBUFS; + } + + dev->zone[dev->nb_zones] = *zone; + dev->nb_zones++; + + mutex_unlock(&kvm->slots_lock); + return 0; +} + +int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, + struct kvm_coalesced_mmio_zone *zone) +{ + int i; + struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev; + struct kvm_coalesced_mmio_zone *z; + + if (dev == NULL) + return -ENXIO; + + mutex_lock(&kvm->slots_lock); + + i = dev->nb_zones; + while (i) { + z = &dev->zone[i - 1]; + + /* unregister all zones + * included in (zone->addr, zone->size) + */ + + if (zone->addr <= z->addr && + z->addr + z->size <= zone->addr + zone->size) { + dev->nb_zones--; + *z = dev->zone[dev->nb_zones]; + } + i--; + } + + mutex_unlock(&kvm->slots_lock); + + return 0; +} diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h new file mode 100644 index 00000000..8a5959e3 --- /dev/null +++ b/virt/kvm/coalesced_mmio.h @@ -0,0 +1,39 @@ +#ifndef __KVM_COALESCED_MMIO_H__ +#define __KVM_COALESCED_MMIO_H__ + +/* + * KVM coalesced MMIO + * + * Copyright (c) 2008 Bull S.A.S. + * + * Author: Laurent Vivier + * + */ + +#ifdef CONFIG_KVM_MMIO + +#define KVM_COALESCED_MMIO_ZONE_MAX 100 + +struct kvm_coalesced_mmio_dev { + struct kvm_io_device dev; + struct kvm *kvm; + spinlock_t lock; + int nb_zones; + struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX]; +}; + +int kvm_coalesced_mmio_init(struct kvm *kvm); +void kvm_coalesced_mmio_free(struct kvm *kvm); +int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, + struct kvm_coalesced_mmio_zone *zone); +int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, + struct kvm_coalesced_mmio_zone *zone); + +#else + +static inline int kvm_coalesced_mmio_init(struct kvm *kvm) { return 0; } +static inline void kvm_coalesced_mmio_free(struct kvm *kvm) { } + +#endif + +#endif diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c new file mode 100644 index 00000000..73358d25 --- /dev/null +++ b/virt/kvm/eventfd.c @@ -0,0 +1,656 @@ +/* + * kvm eventfd support - use eventfd objects to signal various KVM events + * + * Copyright 2009 Novell. All Rights Reserved. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Author: + * Gregory Haskins + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iodev.h" + +/* + * -------------------------------------------------------------------- + * irqfd: Allows an fd to be used to inject an interrupt to the guest + * + * Credit goes to Avi Kivity for the original idea. + * -------------------------------------------------------------------- + */ + +struct _irqfd { + /* Used for MSI fast-path */ + struct kvm *kvm; + wait_queue_t wait; + /* Update side is protected by irqfds.lock */ + struct kvm_kernel_irq_routing_entry __rcu *irq_entry; + /* Used for level IRQ fast-path */ + int gsi; + struct work_struct inject; + /* Used for setup/shutdown */ + struct eventfd_ctx *eventfd; + struct list_head list; + poll_table pt; + struct work_struct shutdown; +}; + +static struct workqueue_struct *irqfd_cleanup_wq; + +static void +irqfd_inject(struct work_struct *work) +{ + struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); + struct kvm *kvm = irqfd->kvm; + + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); +} + +/* + * Race-free decouple logic (ordering is critical) + */ +static void +irqfd_shutdown(struct work_struct *work) +{ + struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown); + u64 cnt; + + /* + * Synchronize with the wait-queue and unhook ourselves to prevent + * further events. + */ + eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt); + + /* + * We know no new events will be scheduled at this point, so block + * until all previously outstanding events have completed + */ + flush_work_sync(&irqfd->inject); + + /* + * It is now safe to release the object's resources + */ + eventfd_ctx_put(irqfd->eventfd); + kfree(irqfd); +} + + +/* assumes kvm->irqfds.lock is held */ +static bool +irqfd_is_active(struct _irqfd *irqfd) +{ + return list_empty(&irqfd->list) ? false : true; +} + +/* + * Mark the irqfd as inactive and schedule it for removal + * + * assumes kvm->irqfds.lock is held + */ +static void +irqfd_deactivate(struct _irqfd *irqfd) +{ + BUG_ON(!irqfd_is_active(irqfd)); + + list_del_init(&irqfd->list); + + queue_work(irqfd_cleanup_wq, &irqfd->shutdown); +} + +/* + * Called with wqh->lock held and interrupts disabled + */ +static int +irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); + unsigned long flags = (unsigned long)key; + struct kvm_kernel_irq_routing_entry *irq; + struct kvm *kvm = irqfd->kvm; + + if (flags & POLLIN) { + rcu_read_lock(); + irq = rcu_dereference(irqfd->irq_entry); + /* An event has been signaled, inject an interrupt */ + if (irq) + kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1); + else + schedule_work(&irqfd->inject); + rcu_read_unlock(); + } + + if (flags & POLLHUP) { + /* The eventfd is closing, detach from KVM */ + unsigned long flags; + + spin_lock_irqsave(&kvm->irqfds.lock, flags); + + /* + * We must check if someone deactivated the irqfd before + * we could acquire the irqfds.lock since the item is + * deactivated from the KVM side before it is unhooked from + * the wait-queue. If it is already deactivated, we can + * simply return knowing the other side will cleanup for us. + * We cannot race against the irqfd going away since the + * other side is required to acquire wqh->lock, which we hold + */ + if (irqfd_is_active(irqfd)) + irqfd_deactivate(irqfd); + + spin_unlock_irqrestore(&kvm->irqfds.lock, flags); + } + + return 0; +} + +static void +irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) +{ + struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt); + add_wait_queue(wqh, &irqfd->wait); +} + +/* Must be called under irqfds.lock */ +static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd, + struct kvm_irq_routing_table *irq_rt) +{ + struct kvm_kernel_irq_routing_entry *e; + struct hlist_node *n; + + if (irqfd->gsi >= irq_rt->nr_rt_entries) { + rcu_assign_pointer(irqfd->irq_entry, NULL); + return; + } + + hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) { + /* Only fast-path MSI. */ + if (e->type == KVM_IRQ_ROUTING_MSI) + rcu_assign_pointer(irqfd->irq_entry, e); + else + rcu_assign_pointer(irqfd->irq_entry, NULL); + } +} + +static int +kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) +{ + struct kvm_irq_routing_table *irq_rt; + struct _irqfd *irqfd, *tmp; + struct file *file = NULL; + struct eventfd_ctx *eventfd = NULL; + int ret; + unsigned int events; + + irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); + if (!irqfd) + return -ENOMEM; + + irqfd->kvm = kvm; + irqfd->gsi = gsi; + INIT_LIST_HEAD(&irqfd->list); + INIT_WORK(&irqfd->inject, irqfd_inject); + INIT_WORK(&irqfd->shutdown, irqfd_shutdown); + + file = eventfd_fget(fd); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto fail; + } + + eventfd = eventfd_ctx_fileget(file); + if (IS_ERR(eventfd)) { + ret = PTR_ERR(eventfd); + goto fail; + } + + irqfd->eventfd = eventfd; + + /* + * Install our own custom wake-up handling so we are notified via + * a callback whenever someone signals the underlying eventfd + */ + init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); + init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc); + + spin_lock_irq(&kvm->irqfds.lock); + + ret = 0; + list_for_each_entry(tmp, &kvm->irqfds.items, list) { + if (irqfd->eventfd != tmp->eventfd) + continue; + /* This fd is used for another irq already. */ + ret = -EBUSY; + spin_unlock_irq(&kvm->irqfds.lock); + goto fail; + } + + irq_rt = rcu_dereference_protected(kvm->irq_routing, + lockdep_is_held(&kvm->irqfds.lock)); + irqfd_update(kvm, irqfd, irq_rt); + + events = file->f_op->poll(file, &irqfd->pt); + + list_add_tail(&irqfd->list, &kvm->irqfds.items); + + /* + * Check if there was an event already pending on the eventfd + * before we registered, and trigger it as if we didn't miss it. + */ + if (events & POLLIN) + schedule_work(&irqfd->inject); + + spin_unlock_irq(&kvm->irqfds.lock); + + /* + * do not drop the file until the irqfd is fully initialized, otherwise + * we might race against the POLLHUP + */ + fput(file); + + return 0; + +fail: + if (eventfd && !IS_ERR(eventfd)) + eventfd_ctx_put(eventfd); + + if (!IS_ERR(file)) + fput(file); + + kfree(irqfd); + return ret; +} + +void +kvm_eventfd_init(struct kvm *kvm) +{ + spin_lock_init(&kvm->irqfds.lock); + INIT_LIST_HEAD(&kvm->irqfds.items); + INIT_LIST_HEAD(&kvm->ioeventfds); +} + +/* + * shutdown any irqfd's that match fd+gsi + */ +static int +kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) +{ + struct _irqfd *irqfd, *tmp; + struct eventfd_ctx *eventfd; + + eventfd = eventfd_ctx_fdget(fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + spin_lock_irq(&kvm->irqfds.lock); + + list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { + if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) { + /* + * This rcu_assign_pointer is needed for when + * another thread calls kvm_irq_routing_update before + * we flush workqueue below (we synchronize with + * kvm_irq_routing_update using irqfds.lock). + * It is paired with synchronize_rcu done by caller + * of that function. + */ + rcu_assign_pointer(irqfd->irq_entry, NULL); + irqfd_deactivate(irqfd); + } + } + + spin_unlock_irq(&kvm->irqfds.lock); + eventfd_ctx_put(eventfd); + + /* + * Block until we know all outstanding shutdown jobs have completed + * so that we guarantee there will not be any more interrupts on this + * gsi once this deassign function returns. + */ + flush_workqueue(irqfd_cleanup_wq); + + return 0; +} + +int +kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) +{ + if (flags & KVM_IRQFD_FLAG_DEASSIGN) + return kvm_irqfd_deassign(kvm, fd, gsi); + + return kvm_irqfd_assign(kvm, fd, gsi); +} + +/* + * This function is called as the kvm VM fd is being released. Shutdown all + * irqfds that still remain open + */ +void +kvm_irqfd_release(struct kvm *kvm) +{ + struct _irqfd *irqfd, *tmp; + + spin_lock_irq(&kvm->irqfds.lock); + + list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) + irqfd_deactivate(irqfd); + + spin_unlock_irq(&kvm->irqfds.lock); + + /* + * Block until we know all outstanding shutdown jobs have completed + * since we do not take a kvm* reference. + */ + flush_workqueue(irqfd_cleanup_wq); + +} + +/* + * Change irq_routing and irqfd. + * Caller must invoke synchronize_rcu afterwards. + */ +void kvm_irq_routing_update(struct kvm *kvm, + struct kvm_irq_routing_table *irq_rt) +{ + struct _irqfd *irqfd; + + spin_lock_irq(&kvm->irqfds.lock); + + rcu_assign_pointer(kvm->irq_routing, irq_rt); + + list_for_each_entry(irqfd, &kvm->irqfds.items, list) + irqfd_update(kvm, irqfd, irq_rt); + + spin_unlock_irq(&kvm->irqfds.lock); +} + +/* + * create a host-wide workqueue for issuing deferred shutdown requests + * aggregated from all vm* instances. We need our own isolated single-thread + * queue to prevent deadlock against flushing the normal work-queue. + */ +static int __init irqfd_module_init(void) +{ + irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup"); + if (!irqfd_cleanup_wq) + return -ENOMEM; + + return 0; +} + +static void __exit irqfd_module_exit(void) +{ + destroy_workqueue(irqfd_cleanup_wq); +} + +module_init(irqfd_module_init); +module_exit(irqfd_module_exit); + +/* + * -------------------------------------------------------------------- + * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal. + * + * userspace can register a PIO/MMIO address with an eventfd for receiving + * notification when the memory has been touched. + * -------------------------------------------------------------------- + */ + +struct _ioeventfd { + struct list_head list; + u64 addr; + int length; + struct eventfd_ctx *eventfd; + u64 datamatch; + struct kvm_io_device dev; + bool wildcard; +}; + +static inline struct _ioeventfd * +to_ioeventfd(struct kvm_io_device *dev) +{ + return container_of(dev, struct _ioeventfd, dev); +} + +static void +ioeventfd_release(struct _ioeventfd *p) +{ + eventfd_ctx_put(p->eventfd); + list_del(&p->list); + kfree(p); +} + +static bool +ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val) +{ + u64 _val; + + if (!(addr == p->addr && len == p->length)) + /* address-range must be precise for a hit */ + return false; + + if (p->wildcard) + /* all else equal, wildcard is always a hit */ + return true; + + /* otherwise, we have to actually compare the data */ + + BUG_ON(!IS_ALIGNED((unsigned long)val, len)); + + switch (len) { + case 1: + _val = *(u8 *)val; + break; + case 2: + _val = *(u16 *)val; + break; + case 4: + _val = *(u32 *)val; + break; + case 8: + _val = *(u64 *)val; + break; + default: + return false; + } + + return _val == p->datamatch ? true : false; +} + +/* MMIO/PIO writes trigger an event if the addr/val match */ +static int +ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len, + const void *val) +{ + struct _ioeventfd *p = to_ioeventfd(this); + + if (!ioeventfd_in_range(p, addr, len, val)) + return -EOPNOTSUPP; + + eventfd_signal(p->eventfd, 1); + return 0; +} + +/* + * This function is called as KVM is completely shutting down. We do not + * need to worry about locking just nuke anything we have as quickly as possible + */ +static void +ioeventfd_destructor(struct kvm_io_device *this) +{ + struct _ioeventfd *p = to_ioeventfd(this); + + ioeventfd_release(p); +} + +static const struct kvm_io_device_ops ioeventfd_ops = { + .write = ioeventfd_write, + .destructor = ioeventfd_destructor, +}; + +/* assumes kvm->slots_lock held */ +static bool +ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p) +{ + struct _ioeventfd *_p; + + list_for_each_entry(_p, &kvm->ioeventfds, list) + if (_p->addr == p->addr && _p->length == p->length && + (_p->wildcard || p->wildcard || + _p->datamatch == p->datamatch)) + return true; + + return false; +} + +static int +kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) +{ + int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; + enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS; + struct _ioeventfd *p; + struct eventfd_ctx *eventfd; + int ret; + + /* must be natural-word sized */ + switch (args->len) { + case 1: + case 2: + case 4: + case 8: + break; + default: + return -EINVAL; + } + + /* check for range overflow */ + if (args->addr + args->len < args->addr) + return -EINVAL; + + /* check for extra flags that we don't understand */ + if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK) + return -EINVAL; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + ret = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&p->list); + p->addr = args->addr; + p->length = args->len; + p->eventfd = eventfd; + + /* The datamatch feature is optional, otherwise this is a wildcard */ + if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH) + p->datamatch = args->datamatch; + else + p->wildcard = true; + + mutex_lock(&kvm->slots_lock); + + /* Verify that there isn't a match already */ + if (ioeventfd_check_collision(kvm, p)) { + ret = -EEXIST; + goto unlock_fail; + } + + kvm_iodevice_init(&p->dev, &ioeventfd_ops); + + ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev); + if (ret < 0) + goto unlock_fail; + + list_add_tail(&p->list, &kvm->ioeventfds); + + mutex_unlock(&kvm->slots_lock); + + return 0; + +unlock_fail: + mutex_unlock(&kvm->slots_lock); + +fail: + kfree(p); + eventfd_ctx_put(eventfd); + + return ret; +} + +static int +kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) +{ + int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; + enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS; + struct _ioeventfd *p, *tmp; + struct eventfd_ctx *eventfd; + int ret = -ENOENT; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + mutex_lock(&kvm->slots_lock); + + list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) { + bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); + + if (p->eventfd != eventfd || + p->addr != args->addr || + p->length != args->len || + p->wildcard != wildcard) + continue; + + if (!p->wildcard && p->datamatch != args->datamatch) + continue; + + kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); + ioeventfd_release(p); + ret = 0; + break; + } + + mutex_unlock(&kvm->slots_lock); + + eventfd_ctx_put(eventfd); + + return ret; +} + +int +kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) +{ + if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN) + return kvm_deassign_ioeventfd(kvm, args); + + return kvm_assign_ioeventfd(kvm, args); +} diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c new file mode 100644 index 00000000..8df1ca10 --- /dev/null +++ b/virt/kvm/ioapic.c @@ -0,0 +1,441 @@ +/* + * Copyright (C) 2001 MandrakeSoft S.A. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * MandrakeSoft S.A. + * 43, rue d'Aboukir + * 75002 Paris - France + * http://www.linux-mandrake.com/ + * http://www.mandrakesoft.com/ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Yunhong Jiang + * Yaozu (Eddie) Dong + * Based on Xen 3.1 code. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ioapic.h" +#include "lapic.h" +#include "irq.h" + +#if 0 +#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) +#else +#define ioapic_debug(fmt, arg...) +#endif +static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq); + +static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, + unsigned long addr, + unsigned long length) +{ + unsigned long result = 0; + + switch (ioapic->ioregsel) { + case IOAPIC_REG_VERSION: + result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16) + | (IOAPIC_VERSION_ID & 0xff)); + break; + + case IOAPIC_REG_APIC_ID: + case IOAPIC_REG_ARB_ID: + result = ((ioapic->id & 0xf) << 24); + break; + + default: + { + u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; + u64 redir_content; + + ASSERT(redir_index < IOAPIC_NUM_PINS); + + redir_content = ioapic->redirtbl[redir_index].bits; + result = (ioapic->ioregsel & 0x1) ? + (redir_content >> 32) & 0xffffffff : + redir_content & 0xffffffff; + break; + } + } + + return result; +} + +static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) +{ + union kvm_ioapic_redirect_entry *pent; + int injected = -1; + + pent = &ioapic->redirtbl[idx]; + + if (!pent->fields.mask) { + injected = ioapic_deliver(ioapic, idx); + if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG) + pent->fields.remote_irr = 1; + } + + return injected; +} + +static void update_handled_vectors(struct kvm_ioapic *ioapic) +{ + DECLARE_BITMAP(handled_vectors, 256); + int i; + + memset(handled_vectors, 0, sizeof(handled_vectors)); + for (i = 0; i < IOAPIC_NUM_PINS; ++i) + __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors); + memcpy(ioapic->handled_vectors, handled_vectors, + sizeof(handled_vectors)); + smp_wmb(); +} + +static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) +{ + unsigned index; + bool mask_before, mask_after; + union kvm_ioapic_redirect_entry *e; + + switch (ioapic->ioregsel) { + case IOAPIC_REG_VERSION: + /* Writes are ignored. */ + break; + + case IOAPIC_REG_APIC_ID: + ioapic->id = (val >> 24) & 0xf; + break; + + case IOAPIC_REG_ARB_ID: + break; + + default: + index = (ioapic->ioregsel - 0x10) >> 1; + + ioapic_debug("change redir index %x val %x\n", index, val); + if (index >= IOAPIC_NUM_PINS) + return; + e = &ioapic->redirtbl[index]; + mask_before = e->fields.mask; + if (ioapic->ioregsel & 1) { + e->bits &= 0xffffffff; + e->bits |= (u64) val << 32; + } else { + e->bits &= ~0xffffffffULL; + e->bits |= (u32) val; + e->fields.remote_irr = 0; + } + update_handled_vectors(ioapic); + mask_after = e->fields.mask; + if (mask_before != mask_after) + kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); + if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG + && ioapic->irr & (1 << index)) + ioapic_service(ioapic, index); + break; + } +} + +static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) +{ + union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq]; + struct kvm_lapic_irq irqe; + + ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " + "vector=%x trig_mode=%x\n", + entry->fields.dest_id, entry->fields.dest_mode, + entry->fields.delivery_mode, entry->fields.vector, + entry->fields.trig_mode); + + irqe.dest_id = entry->fields.dest_id; + irqe.vector = entry->fields.vector; + irqe.dest_mode = entry->fields.dest_mode; + irqe.trig_mode = entry->fields.trig_mode; + irqe.delivery_mode = entry->fields.delivery_mode << 8; + irqe.level = 1; + irqe.shorthand = 0; + +#ifdef CONFIG_X86 + /* Always delivery PIT interrupt to vcpu 0 */ + if (irq == 0) { + irqe.dest_mode = 0; /* Physical mode. */ + /* need to read apic_id from apic regiest since + * it can be rewritten */ + irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id; + } +#endif + return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); +} + +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) +{ + u32 old_irr; + u32 mask = 1 << irq; + union kvm_ioapic_redirect_entry entry; + int ret = 1; + + spin_lock(&ioapic->lock); + old_irr = ioapic->irr; + if (irq >= 0 && irq < IOAPIC_NUM_PINS) { + entry = ioapic->redirtbl[irq]; + level ^= entry.fields.polarity; + if (!level) + ioapic->irr &= ~mask; + else { + int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); + ioapic->irr |= mask; + if ((edge && old_irr != ioapic->irr) || + (!edge && !entry.fields.remote_irr)) + ret = ioapic_service(ioapic, irq); + else + ret = 0; /* report coalesced interrupt */ + } + trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); + } + spin_unlock(&ioapic->lock); + + return ret; +} + +static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector, + int trigger_mode) +{ + int i; + + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; + + if (ent->fields.vector != vector) + continue; + + /* + * We are dropping lock while calling ack notifiers because ack + * notifier callbacks for assigned devices call into IOAPIC + * recursively. Since remote_irr is cleared only after call + * to notifiers if the same vector will be delivered while lock + * is dropped it will be put into irr and will be delivered + * after ack notifier returns. + */ + spin_unlock(&ioapic->lock); + kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i); + spin_lock(&ioapic->lock); + + if (trigger_mode != IOAPIC_LEVEL_TRIG) + continue; + + ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); + ent->fields.remote_irr = 0; + if (!ent->fields.mask && (ioapic->irr & (1 << i))) + ioapic_service(ioapic, i); + } +} + +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + + smp_rmb(); + if (!test_bit(vector, ioapic->handled_vectors)) + return; + spin_lock(&ioapic->lock); + __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode); + spin_unlock(&ioapic->lock); +} + +static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_ioapic, dev); +} + +static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr) +{ + return ((addr >= ioapic->base_address && + (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); +} + +static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, + void *val) +{ + struct kvm_ioapic *ioapic = to_ioapic(this); + u32 result; + if (!ioapic_in_range(ioapic, addr)) + return -EOPNOTSUPP; + + ioapic_debug("addr %lx\n", (unsigned long)addr); + ASSERT(!(addr & 0xf)); /* check alignment */ + + addr &= 0xff; + spin_lock(&ioapic->lock); + switch (addr) { + case IOAPIC_REG_SELECT: + result = ioapic->ioregsel; + break; + + case IOAPIC_REG_WINDOW: + result = ioapic_read_indirect(ioapic, addr, len); + break; + + default: + result = 0; + break; + } + spin_unlock(&ioapic->lock); + + switch (len) { + case 8: + *(u64 *) val = result; + break; + case 1: + case 2: + case 4: + memcpy(val, (char *)&result, len); + break; + default: + printk(KERN_WARNING "ioapic: wrong length %d\n", len); + } + return 0; +} + +static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, + const void *val) +{ + struct kvm_ioapic *ioapic = to_ioapic(this); + u32 data; + if (!ioapic_in_range(ioapic, addr)) + return -EOPNOTSUPP; + + ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", + (void*)addr, len, val); + ASSERT(!(addr & 0xf)); /* check alignment */ + + if (len == 4 || len == 8) + data = *(u32 *) val; + else { + printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); + return 0; + } + + addr &= 0xff; + spin_lock(&ioapic->lock); + switch (addr) { + case IOAPIC_REG_SELECT: + ioapic->ioregsel = data; + break; + + case IOAPIC_REG_WINDOW: + ioapic_write_indirect(ioapic, data); + break; +#ifdef CONFIG_IA64 + case IOAPIC_REG_EOI: + __kvm_ioapic_update_eoi(ioapic, data, IOAPIC_LEVEL_TRIG); + break; +#endif + + default: + break; + } + spin_unlock(&ioapic->lock); + return 0; +} + +void kvm_ioapic_reset(struct kvm_ioapic *ioapic) +{ + int i; + + for (i = 0; i < IOAPIC_NUM_PINS; i++) + ioapic->redirtbl[i].fields.mask = 1; + ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; + ioapic->ioregsel = 0; + ioapic->irr = 0; + ioapic->id = 0; + update_handled_vectors(ioapic); +} + +static const struct kvm_io_device_ops ioapic_mmio_ops = { + .read = ioapic_mmio_read, + .write = ioapic_mmio_write, +}; + +int kvm_ioapic_init(struct kvm *kvm) +{ + struct kvm_ioapic *ioapic; + int ret; + + ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); + if (!ioapic) + return -ENOMEM; + spin_lock_init(&ioapic->lock); + kvm->arch.vioapic = ioapic; + kvm_ioapic_reset(ioapic); + kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); + ioapic->kvm = kvm; + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + mutex_unlock(&kvm->slots_lock); + if (ret < 0) { + kvm->arch.vioapic = NULL; + kfree(ioapic); + } + + return ret; +} + +void kvm_ioapic_destroy(struct kvm *kvm) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + + if (ioapic) { + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + kvm->arch.vioapic = NULL; + kfree(ioapic); + } +} + +int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +{ + struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); + if (!ioapic) + return -EINVAL; + + spin_lock(&ioapic->lock); + memcpy(state, ioapic, sizeof(struct kvm_ioapic_state)); + spin_unlock(&ioapic->lock); + return 0; +} + +int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +{ + struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); + if (!ioapic) + return -EINVAL; + + spin_lock(&ioapic->lock); + memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); + update_handled_vectors(ioapic); + spin_unlock(&ioapic->lock); + return 0; +} diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h new file mode 100644 index 00000000..0b190c34 --- /dev/null +++ b/virt/kvm/ioapic.h @@ -0,0 +1,83 @@ +#ifndef __KVM_IO_APIC_H +#define __KVM_IO_APIC_H + +#include + +#include "iodev.h" + +struct kvm; +struct kvm_vcpu; + +#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS +#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ +#define IOAPIC_EDGE_TRIG 0 +#define IOAPIC_LEVEL_TRIG 1 + +#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 +#define IOAPIC_MEM_LENGTH 0x100 + +/* Direct registers. */ +#define IOAPIC_REG_SELECT 0x00 +#define IOAPIC_REG_WINDOW 0x10 +#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ + +/* Indirect registers. */ +#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ +#define IOAPIC_REG_VERSION 0x01 +#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ + +/*ioapic delivery mode*/ +#define IOAPIC_FIXED 0x0 +#define IOAPIC_LOWEST_PRIORITY 0x1 +#define IOAPIC_PMI 0x2 +#define IOAPIC_NMI 0x4 +#define IOAPIC_INIT 0x5 +#define IOAPIC_EXTINT 0x7 + +struct kvm_ioapic { + u64 base_address; + u32 ioregsel; + u32 id; + u32 irr; + u32 pad; + union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS]; + unsigned long irq_states[IOAPIC_NUM_PINS]; + struct kvm_io_device dev; + struct kvm *kvm; + void (*ack_notifier)(void *opaque, int irq); + spinlock_t lock; + DECLARE_BITMAP(handled_vectors, 256); +}; + +#ifdef DEBUG +#define ASSERT(x) \ +do { \ + if (!(x)) { \ + printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ + __FILE__, __LINE__, #x); \ + BUG(); \ + } \ +} while (0) +#else +#define ASSERT(x) do { } while (0) +#endif + +static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) +{ + return kvm->arch.vioapic; +} + +int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, + int short_hand, int dest, int dest_mode); +int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); +int kvm_ioapic_init(struct kvm *kvm); +void kvm_ioapic_destroy(struct kvm *kvm); +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); +void kvm_ioapic_reset(struct kvm_ioapic *ioapic); +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq); +int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); + +#endif diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h new file mode 100644 index 00000000..12fd3caf --- /dev/null +++ b/virt/kvm/iodev.h @@ -0,0 +1,70 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __KVM_IODEV_H__ +#define __KVM_IODEV_H__ + +#include +#include + +struct kvm_io_device; + +/** + * kvm_io_device_ops are called under kvm slots_lock. + * read and write handlers return 0 if the transaction has been handled, + * or non-zero to have it passed to the next device. + **/ +struct kvm_io_device_ops { + int (*read)(struct kvm_io_device *this, + gpa_t addr, + int len, + void *val); + int (*write)(struct kvm_io_device *this, + gpa_t addr, + int len, + const void *val); + void (*destructor)(struct kvm_io_device *this); +}; + + +struct kvm_io_device { + const struct kvm_io_device_ops *ops; +}; + +static inline void kvm_iodevice_init(struct kvm_io_device *dev, + const struct kvm_io_device_ops *ops) +{ + dev->ops = ops; +} + +static inline int kvm_iodevice_read(struct kvm_io_device *dev, + gpa_t addr, int l, void *v) +{ + return dev->ops->read ? dev->ops->read(dev, addr, l, v) : -EOPNOTSUPP; +} + +static inline int kvm_iodevice_write(struct kvm_io_device *dev, + gpa_t addr, int l, const void *v) +{ + return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP; +} + +static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) +{ + if (dev->ops->destructor) + dev->ops->destructor(dev); +} + +#endif /* __KVM_IODEV_H__ */ diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c new file mode 100644 index 00000000..fb0f6e46 --- /dev/null +++ b/virt/kvm/iommu.c @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) 2006-2008 Intel Corporation + * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Author: Allen M. Kay + * Author: Weidong Han + * Author: Ben-Ami Yassour + */ + +#include +#include +#include +#include +#include +#include + +static int kvm_iommu_unmap_memslots(struct kvm *kvm); +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages); + +static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, unsigned long size) +{ + gfn_t end_gfn; + pfn_t pfn; + + pfn = gfn_to_pfn_memslot(kvm, slot, gfn); + end_gfn = gfn + (size >> PAGE_SHIFT); + gfn += 1; + + if (is_error_pfn(pfn)) + return pfn; + + while (gfn < end_gfn) + gfn_to_pfn_memslot(kvm, slot, gfn++); + + return pfn; +} + +int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + gfn_t gfn, end_gfn; + pfn_t pfn; + int r = 0; + struct iommu_domain *domain = kvm->arch.iommu_domain; + int flags; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + gfn = slot->base_gfn; + end_gfn = gfn + slot->npages; + + flags = IOMMU_READ | IOMMU_WRITE; + if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY) + flags |= IOMMU_CACHE; + + + while (gfn < end_gfn) { + unsigned long page_size; + + /* Check if already mapped */ + if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) { + gfn += 1; + continue; + } + + /* Get the page size we could use to map */ + page_size = kvm_host_page_size(kvm, gfn); + + /* Make sure the page_size does not exceed the memslot */ + while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn) + page_size >>= 1; + + /* Make sure gfn is aligned to the page size we want to map */ + while ((gfn << PAGE_SHIFT) & (page_size - 1)) + page_size >>= 1; + + /* + * Pin all pages we are about to map in memory. This is + * important because we unmap and unpin in 4kb steps later. + */ + pfn = kvm_pin_pages(kvm, slot, gfn, page_size); + if (is_error_pfn(pfn)) { + gfn += 1; + continue; + } + + /* Map into IO address space */ + r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), + get_order(page_size), flags); + if (r) { + printk(KERN_ERR "kvm_iommu_map_address:" + "iommu failed to map pfn=%llx\n", pfn); + goto unmap_pages; + } + + gfn += page_size >> PAGE_SHIFT; + + + } + + return 0; + +unmap_pages: + kvm_iommu_put_pages(kvm, slot->base_gfn, gfn); + return r; +} + +static int kvm_iommu_map_memslots(struct kvm *kvm) +{ + int i, idx, r = 0; + struct kvm_memslots *slots; + + idx = srcu_read_lock(&kvm->srcu); + slots = kvm_memslots(kvm); + + for (i = 0; i < slots->nmemslots; i++) { + r = kvm_iommu_map_pages(kvm, &slots->memslots[i]); + if (r) + break; + } + srcu_read_unlock(&kvm->srcu, idx); + + return r; +} + +int kvm_assign_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + struct pci_dev *pdev = NULL; + struct iommu_domain *domain = kvm->arch.iommu_domain; + int r, last_flags; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + pdev = assigned_dev->dev; + if (pdev == NULL) + return -ENODEV; + + r = iommu_attach_device(domain, &pdev->dev); + if (r) { + printk(KERN_ERR "assign device %x:%x:%x.%x failed", + pci_domain_nr(pdev->bus), + pdev->bus->number, + PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn)); + return r; + } + + last_flags = kvm->arch.iommu_flags; + if (iommu_domain_has_cap(kvm->arch.iommu_domain, + IOMMU_CAP_CACHE_COHERENCY)) + kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY; + + /* Check if need to update IOMMU page table for guest memory */ + if ((last_flags ^ kvm->arch.iommu_flags) == + KVM_IOMMU_CACHE_COHERENCY) { + kvm_iommu_unmap_memslots(kvm); + r = kvm_iommu_map_memslots(kvm); + if (r) + goto out_unmap; + } + + printk(KERN_DEBUG "assign device %x:%x:%x.%x\n", + assigned_dev->host_segnr, + assigned_dev->host_busnr, + PCI_SLOT(assigned_dev->host_devfn), + PCI_FUNC(assigned_dev->host_devfn)); + + return 0; +out_unmap: + kvm_iommu_unmap_memslots(kvm); + return r; +} + +int kvm_deassign_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + struct iommu_domain *domain = kvm->arch.iommu_domain; + struct pci_dev *pdev = NULL; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + pdev = assigned_dev->dev; + if (pdev == NULL) + return -ENODEV; + + iommu_detach_device(domain, &pdev->dev); + + printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n", + assigned_dev->host_segnr, + assigned_dev->host_busnr, + PCI_SLOT(assigned_dev->host_devfn), + PCI_FUNC(assigned_dev->host_devfn)); + + return 0; +} + +int kvm_iommu_map_guest(struct kvm *kvm) +{ + int r; + + if (!iommu_found()) { + printk(KERN_ERR "%s: iommu not found\n", __func__); + return -ENODEV; + } + + kvm->arch.iommu_domain = iommu_domain_alloc(); + if (!kvm->arch.iommu_domain) + return -ENOMEM; + + r = kvm_iommu_map_memslots(kvm); + if (r) + goto out_unmap; + + return 0; + +out_unmap: + kvm_iommu_unmap_memslots(kvm); + return r; +} + +static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages) +{ + unsigned long i; + + for (i = 0; i < npages; ++i) + kvm_release_pfn_clean(pfn + i); +} + +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages) +{ + struct iommu_domain *domain; + gfn_t end_gfn, gfn; + pfn_t pfn; + u64 phys; + + domain = kvm->arch.iommu_domain; + end_gfn = base_gfn + npages; + gfn = base_gfn; + + /* check if iommu exists and in use */ + if (!domain) + return; + + while (gfn < end_gfn) { + unsigned long unmap_pages; + int order; + + /* Get physical address */ + phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); + pfn = phys >> PAGE_SHIFT; + + /* Unmap address from IO address space */ + order = iommu_unmap(domain, gfn_to_gpa(gfn), 0); + unmap_pages = 1ULL << order; + + /* Unpin all pages we just unmapped to not leak any memory */ + kvm_unpin_pages(kvm, pfn, unmap_pages); + + gfn += unmap_pages; + } +} + +void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages); +} + +static int kvm_iommu_unmap_memslots(struct kvm *kvm) +{ + int i, idx; + struct kvm_memslots *slots; + + idx = srcu_read_lock(&kvm->srcu); + slots = kvm_memslots(kvm); + + for (i = 0; i < slots->nmemslots; i++) + kvm_iommu_unmap_pages(kvm, &slots->memslots[i]); + + srcu_read_unlock(&kvm->srcu, idx); + + return 0; +} + +int kvm_iommu_unmap_guest(struct kvm *kvm) +{ + struct iommu_domain *domain = kvm->arch.iommu_domain; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + kvm_iommu_unmap_memslots(kvm); + iommu_domain_free(domain); + return 0; +} diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c new file mode 100644 index 00000000..9f614b4e --- /dev/null +++ b/virt/kvm/irq_comm.c @@ -0,0 +1,474 @@ +/* + * irq_comm.c: Common API for in kernel interrupt controller + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * Authors: + * Yaozu (Eddie) Dong + * + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + */ + +#include +#include +#include + +#include +#ifdef CONFIG_IA64 +#include +#endif + +#include "irq.h" + +#include "ioapic.h" + +static inline int kvm_irq_line_state(unsigned long *irq_state, + int irq_source_id, int level) +{ + /* Logical OR for level trig interrupt */ + if (level) + set_bit(irq_source_id, irq_state); + else + clear_bit(irq_source_id, irq_state); + + return !!(*irq_state); +} + +static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level) +{ +#ifdef CONFIG_X86 + struct kvm_pic *pic = pic_irqchip(kvm); + level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin], + irq_source_id, level); + return kvm_pic_set_irq(pic, e->irqchip.pin, level); +#else + return -1; +#endif +} + +static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin], + irq_source_id, level); + + return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level); +} + +inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) +{ +#ifdef CONFIG_IA64 + return irq->delivery_mode == + (IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT); +#else + return irq->delivery_mode == APIC_DM_LOWEST; +#endif +} + +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq) +{ + int i, r = -1; + struct kvm_vcpu *vcpu, *lowest = NULL; + + if (irq->dest_mode == 0 && irq->dest_id == 0xff && + kvm_is_dm_lowest_prio(irq)) + printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (!kvm_is_dm_lowest_prio(irq)) { + if (r < 0) + r = 0; + r += kvm_apic_set_irq(vcpu, irq); + } else if (kvm_lapic_enabled(vcpu)) { + if (!lowest) + lowest = vcpu; + else if (kvm_apic_compare_prio(vcpu, lowest) < 0) + lowest = vcpu; + } + } + + if (lowest) + r = kvm_apic_set_irq(lowest, irq); + + return r; +} + +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level) +{ + struct kvm_lapic_irq irq; + + if (!level) + return -1; + + trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); + + irq.dest_id = (e->msi.address_lo & + MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; + irq.vector = (e->msi.data & + MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; + irq.dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; + irq.trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; + irq.delivery_mode = e->msi.data & 0x700; + irq.level = 1; + irq.shorthand = 0; + + /* TODO Deal with RH bit of MSI message address */ + return kvm_irq_delivery_to_apic(kvm, NULL, &irq); +} + +/* + * Return value: + * < 0 Interrupt was ignored (masked or not delivered for other reasons) + * = 0 Interrupt was coalesced (previous irq is still pending) + * > 0 Number of CPUs interrupt was delivered to + */ +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level) +{ + struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS]; + int ret = -1, i = 0; + struct kvm_irq_routing_table *irq_rt; + struct hlist_node *n; + + trace_kvm_set_irq(irq, level, irq_source_id); + + /* Not possible to detect if the guest uses the PIC or the + * IOAPIC. So set the bit in both. The guest will ignore + * writes to the unused one. + */ + rcu_read_lock(); + irq_rt = rcu_dereference(kvm->irq_routing); + if (irq < irq_rt->nr_rt_entries) + hlist_for_each_entry(e, n, &irq_rt->map[irq], link) + irq_set[i++] = *e; + rcu_read_unlock(); + + while(i--) { + int r; + r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level); + if (r < 0) + continue; + + ret = r + ((ret < 0) ? 0 : ret); + } + + return ret; +} + +void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) +{ + struct kvm_irq_ack_notifier *kian; + struct hlist_node *n; + int gsi; + + trace_kvm_ack_irq(irqchip, pin); + + rcu_read_lock(); + gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; + if (gsi != -1) + hlist_for_each_entry_rcu(kian, n, &kvm->irq_ack_notifier_list, + link) + if (kian->gsi == gsi) + kian->irq_acked(kian); + rcu_read_unlock(); +} + +void kvm_register_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + mutex_lock(&kvm->irq_lock); + hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); + mutex_unlock(&kvm->irq_lock); +} + +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + mutex_lock(&kvm->irq_lock); + hlist_del_init_rcu(&kian->link); + mutex_unlock(&kvm->irq_lock); + synchronize_rcu(); +} + +int kvm_request_irq_source_id(struct kvm *kvm) +{ + unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; + int irq_source_id; + + mutex_lock(&kvm->irq_lock); + irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG); + + if (irq_source_id >= BITS_PER_LONG) { + printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n"); + irq_source_id = -EFAULT; + goto unlock; + } + + ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); + set_bit(irq_source_id, bitmap); +unlock: + mutex_unlock(&kvm->irq_lock); + + return irq_source_id; +} + +void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) +{ + int i; + + ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); + + mutex_lock(&kvm->irq_lock); + if (irq_source_id < 0 || + irq_source_id >= BITS_PER_LONG) { + printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); + goto unlock; + } + clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); + if (!irqchip_in_kernel(kvm)) + goto unlock; + + for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) { + clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]); + if (i >= 16) + continue; +#ifdef CONFIG_X86 + clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]); +#endif + } +unlock: + mutex_unlock(&kvm->irq_lock); +} + +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn) +{ + mutex_lock(&kvm->irq_lock); + kimn->irq = irq; + hlist_add_head_rcu(&kimn->link, &kvm->mask_notifier_list); + mutex_unlock(&kvm->irq_lock); +} + +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn) +{ + mutex_lock(&kvm->irq_lock); + hlist_del_rcu(&kimn->link); + mutex_unlock(&kvm->irq_lock); + synchronize_rcu(); +} + +void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, + bool mask) +{ + struct kvm_irq_mask_notifier *kimn; + struct hlist_node *n; + int gsi; + + rcu_read_lock(); + gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; + if (gsi != -1) + hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link) + if (kimn->irq == gsi) + kimn->func(kimn, mask); + rcu_read_unlock(); +} + +void kvm_free_irq_routing(struct kvm *kvm) +{ + /* Called only during vm destruction. Nobody can use the pointer + at this stage */ + kfree(kvm->irq_routing); +} + +static int setup_routing_entry(struct kvm_irq_routing_table *rt, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + int r = -EINVAL; + int delta; + unsigned max_pin; + struct kvm_kernel_irq_routing_entry *ei; + struct hlist_node *n; + + /* + * Do not allow GSI to be mapped to the same irqchip more than once. + * Allow only one to one mapping between GSI and MSI. + */ + hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link) + if (ei->type == KVM_IRQ_ROUTING_MSI || + ue->u.irqchip.irqchip == ei->irqchip.irqchip) + return r; + + e->gsi = ue->gsi; + e->type = ue->type; + switch (ue->type) { + case KVM_IRQ_ROUTING_IRQCHIP: + delta = 0; + switch (ue->u.irqchip.irqchip) { + case KVM_IRQCHIP_PIC_MASTER: + e->set = kvm_set_pic_irq; + max_pin = 16; + break; + case KVM_IRQCHIP_PIC_SLAVE: + e->set = kvm_set_pic_irq; + max_pin = 16; + delta = 8; + break; + case KVM_IRQCHIP_IOAPIC: + max_pin = KVM_IOAPIC_NUM_PINS; + e->set = kvm_set_ioapic_irq; + break; + default: + goto out; + } + e->irqchip.irqchip = ue->u.irqchip.irqchip; + e->irqchip.pin = ue->u.irqchip.pin + delta; + if (e->irqchip.pin >= max_pin) + goto out; + rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi; + break; + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + break; + default: + goto out; + } + + hlist_add_head(&e->link, &rt->map[e->gsi]); + r = 0; +out: + return r; +} + + +int kvm_set_irq_routing(struct kvm *kvm, + const struct kvm_irq_routing_entry *ue, + unsigned nr, + unsigned flags) +{ + struct kvm_irq_routing_table *new, *old; + u32 i, j, nr_rt_entries = 0; + int r; + + for (i = 0; i < nr; ++i) { + if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES) + return -EINVAL; + nr_rt_entries = max(nr_rt_entries, ue[i].gsi); + } + + nr_rt_entries += 1; + + new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)) + + (nr * sizeof(struct kvm_kernel_irq_routing_entry)), + GFP_KERNEL); + + if (!new) + return -ENOMEM; + + new->rt_entries = (void *)&new->map[nr_rt_entries]; + + new->nr_rt_entries = nr_rt_entries; + for (i = 0; i < 3; i++) + for (j = 0; j < KVM_IOAPIC_NUM_PINS; j++) + new->chip[i][j] = -1; + + for (i = 0; i < nr; ++i) { + r = -EINVAL; + if (ue->flags) + goto out; + r = setup_routing_entry(new, &new->rt_entries[i], ue); + if (r) + goto out; + ++ue; + } + + mutex_lock(&kvm->irq_lock); + old = kvm->irq_routing; + kvm_irq_routing_update(kvm, new); + mutex_unlock(&kvm->irq_lock); + + synchronize_rcu(); + + new = old; + r = 0; + +out: + kfree(new); + return r; +} + +#define IOAPIC_ROUTING_ENTRY(irq) \ + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC, .u.irqchip.pin = (irq) } +#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq) + +#ifdef CONFIG_X86 +# define PIC_ROUTING_ENTRY(irq) \ + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip.irqchip = SELECT_PIC(irq), .u.irqchip.pin = (irq) % 8 } +# define ROUTING_ENTRY2(irq) \ + IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq) +#else +# define ROUTING_ENTRY2(irq) \ + IOAPIC_ROUTING_ENTRY(irq) +#endif + +static const struct kvm_irq_routing_entry default_routing[] = { + ROUTING_ENTRY2(0), ROUTING_ENTRY2(1), + ROUTING_ENTRY2(2), ROUTING_ENTRY2(3), + ROUTING_ENTRY2(4), ROUTING_ENTRY2(5), + ROUTING_ENTRY2(6), ROUTING_ENTRY2(7), + ROUTING_ENTRY2(8), ROUTING_ENTRY2(9), + ROUTING_ENTRY2(10), ROUTING_ENTRY2(11), + ROUTING_ENTRY2(12), ROUTING_ENTRY2(13), + ROUTING_ENTRY2(14), ROUTING_ENTRY2(15), + ROUTING_ENTRY1(16), ROUTING_ENTRY1(17), + ROUTING_ENTRY1(18), ROUTING_ENTRY1(19), + ROUTING_ENTRY1(20), ROUTING_ENTRY1(21), + ROUTING_ENTRY1(22), ROUTING_ENTRY1(23), +#ifdef CONFIG_IA64 + ROUTING_ENTRY1(24), ROUTING_ENTRY1(25), + ROUTING_ENTRY1(26), ROUTING_ENTRY1(27), + ROUTING_ENTRY1(28), ROUTING_ENTRY1(29), + ROUTING_ENTRY1(30), ROUTING_ENTRY1(31), + ROUTING_ENTRY1(32), ROUTING_ENTRY1(33), + ROUTING_ENTRY1(34), ROUTING_ENTRY1(35), + ROUTING_ENTRY1(36), ROUTING_ENTRY1(37), + ROUTING_ENTRY1(38), ROUTING_ENTRY1(39), + ROUTING_ENTRY1(40), ROUTING_ENTRY1(41), + ROUTING_ENTRY1(42), ROUTING_ENTRY1(43), + ROUTING_ENTRY1(44), ROUTING_ENTRY1(45), + ROUTING_ENTRY1(46), ROUTING_ENTRY1(47), +#endif +}; + +int kvm_setup_default_irq_routing(struct kvm *kvm) +{ + return kvm_set_irq_routing(kvm, default_routing, + ARRAY_SIZE(default_routing), 0); +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c new file mode 100644 index 00000000..6b39ba95 --- /dev/null +++ b/virt/kvm/kvm_main.c @@ -0,0 +1,2635 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Avi Kivity + * Yaniv Kamay + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "iodev.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "coalesced_mmio.h" +#include "async_pf.h" + +#define CREATE_TRACE_POINTS +#include + +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +/* + * Ordering of locks: + * + * kvm->lock --> kvm->slots_lock --> kvm->irq_lock + */ + +DEFINE_RAW_SPINLOCK(kvm_lock); +LIST_HEAD(vm_list); + +static cpumask_var_t cpus_hardware_enabled; +static int kvm_usage_count = 0; +static atomic_t hardware_enable_failed; + +struct kmem_cache *kvm_vcpu_cache; +EXPORT_SYMBOL_GPL(kvm_vcpu_cache); + +static __read_mostly struct preempt_ops kvm_preempt_ops; + +struct dentry *kvm_debugfs_dir; + +static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, + unsigned long arg); +static int hardware_enable_all(void); +static void hardware_disable_all(void); + +static void kvm_io_bus_destroy(struct kvm_io_bus *bus); + +bool kvm_rebooting; +EXPORT_SYMBOL_GPL(kvm_rebooting); + +static bool largepages_enabled = true; + +static struct page *hwpoison_page; +static pfn_t hwpoison_pfn; + +static struct page *fault_page; +static pfn_t fault_pfn; + +inline int kvm_is_mmio_pfn(pfn_t pfn) +{ + if (pfn_valid(pfn)) { + int reserved; + struct page *tail = pfn_to_page(pfn); + struct page *head = compound_trans_head(tail); + reserved = PageReserved(head); + if (head != tail) { + /* + * "head" is not a dangling pointer + * (compound_trans_head takes care of that) + * but the hugepage may have been splitted + * from under us (and we may not hold a + * reference count on the head page so it can + * be reused before we run PageReferenced), so + * we've to check PageTail before returning + * what we just read. + */ + smp_rmb(); + if (PageTail(tail)) + return reserved; + } + return PageReserved(tail); + } + + return true; +} + +/* + * Switches to specified vcpu, until a matching vcpu_put() + */ +void vcpu_load(struct kvm_vcpu *vcpu) +{ + int cpu; + + mutex_lock(&vcpu->mutex); + if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { + /* The thread running this VCPU changed. */ + struct pid *oldpid = vcpu->pid; + struct pid *newpid = get_task_pid(current, PIDTYPE_PID); + rcu_assign_pointer(vcpu->pid, newpid); + synchronize_rcu(); + put_pid(oldpid); + } + cpu = get_cpu(); + preempt_notifier_register(&vcpu->preempt_notifier); + kvm_arch_vcpu_load(vcpu, cpu); + put_cpu(); +} + +void vcpu_put(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + kvm_arch_vcpu_put(vcpu); + preempt_notifier_unregister(&vcpu->preempt_notifier); + preempt_enable(); + mutex_unlock(&vcpu->mutex); +} + +static void ack_flush(void *_completed) +{ +} + +static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) +{ + int i, cpu, me; + cpumask_var_t cpus; + bool called = true; + struct kvm_vcpu *vcpu; + + zalloc_cpumask_var(&cpus, GFP_ATOMIC); + + me = get_cpu(); + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_make_request(req, vcpu); + cpu = vcpu->cpu; + + /* Set ->requests bit before we read ->mode */ + smp_mb(); + + if (cpus != NULL && cpu != -1 && cpu != me && + kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) + cpumask_set_cpu(cpu, cpus); + } + if (unlikely(cpus == NULL)) + smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); + else if (!cpumask_empty(cpus)) + smp_call_function_many(cpus, ack_flush, NULL, 1); + else + called = false; + put_cpu(); + free_cpumask_var(cpus); + return called; +} + +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + int dirty_count = kvm->tlbs_dirty; + + smp_mb(); + if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) + ++kvm->stat.remote_tlb_flush; + cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); +} + +void kvm_reload_remote_mmus(struct kvm *kvm) +{ + make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); +} + +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) +{ + struct page *page; + int r; + + mutex_init(&vcpu->mutex); + vcpu->cpu = -1; + vcpu->kvm = kvm; + vcpu->vcpu_id = id; + vcpu->pid = NULL; + init_waitqueue_head(&vcpu->wq); + kvm_async_pf_vcpu_init(vcpu); + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r = -ENOMEM; + goto fail; + } + vcpu->run = page_address(page); + + r = kvm_arch_vcpu_init(vcpu); + if (r < 0) + goto fail_free_run; + return 0; + +fail_free_run: + free_page((unsigned long)vcpu->run); +fail: + return r; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_init); + +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + put_pid(vcpu->pid); + kvm_arch_vcpu_uninit(vcpu); + free_page((unsigned long)vcpu->run); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); + +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) +{ + return container_of(mn, struct kvm, mmu_notifier); +} + +static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int need_tlb_flush, idx; + + /* + * When ->invalidate_page runs, the linux pte has been zapped + * already but the page is still allocated until + * ->invalidate_page returns. So if we increase the sequence + * here the kvm page fault will notice if the spte can't be + * established because the page is going to be freed. If + * instead the kvm page fault establishes the spte before + * ->invalidate_page runs, kvm_unmap_hva will release it + * before returning. + * + * The sequence increase only need to be seen at spin_unlock + * time, and not at spin_lock time. + * + * Increasing the sequence after the spin_unlock would be + * unsafe because the kvm page fault could then establish the + * pte after kvm_unmap_hva returned, without noticing the page + * is going to be freed. + */ + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + kvm->mmu_notifier_seq++; + need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + /* we've to flush the tlb before the pages can be freed */ + if (need_tlb_flush) + kvm_flush_remote_tlbs(kvm); + +} + +static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address, + pte_t pte) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + kvm->mmu_notifier_seq++; + kvm_set_spte_hva(kvm, address, pte); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); +} + +static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int need_tlb_flush = 0, idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + /* + * The count increase must become visible at unlock time as no + * spte can be established without taking the mmu_lock and + * count is also read inside the mmu_lock critical section. + */ + kvm->mmu_notifier_count++; + for (; start < end; start += PAGE_SIZE) + need_tlb_flush |= kvm_unmap_hva(kvm, start); + need_tlb_flush |= kvm->tlbs_dirty; + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + /* we've to flush the tlb before the pages can be freed */ + if (need_tlb_flush) + kvm_flush_remote_tlbs(kvm); +} + +static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + + spin_lock(&kvm->mmu_lock); + /* + * This sequence increase will notify the kvm page fault that + * the page that is going to be mapped in the spte could have + * been freed. + */ + kvm->mmu_notifier_seq++; + /* + * The above sequence increase must be visible before the + * below count decrease but both values are read by the kvm + * page fault under mmu_lock spinlock so we don't need to add + * a smb_wmb() here in between the two. + */ + kvm->mmu_notifier_count--; + spin_unlock(&kvm->mmu_lock); + + BUG_ON(kvm->mmu_notifier_count < 0); +} + +static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int young, idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + young = kvm_age_hva(kvm, address); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + if (young) + kvm_flush_remote_tlbs(kvm); + + return young; +} + +static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int young, idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + young = kvm_test_age_hva(kvm, address); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + return young; +} + +static void kvm_mmu_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int idx; + + idx = srcu_read_lock(&kvm->srcu); + kvm_arch_flush_shadow(kvm); + srcu_read_unlock(&kvm->srcu, idx); +} + +static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { + .invalidate_page = kvm_mmu_notifier_invalidate_page, + .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, + .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, + .clear_flush_young = kvm_mmu_notifier_clear_flush_young, + .test_young = kvm_mmu_notifier_test_young, + .change_pte = kvm_mmu_notifier_change_pte, + .release = kvm_mmu_notifier_release, +}; + +static int kvm_init_mmu_notifier(struct kvm *kvm) +{ + kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; + return mmu_notifier_register(&kvm->mmu_notifier, current->mm); +} + +#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ + +static int kvm_init_mmu_notifier(struct kvm *kvm) +{ + return 0; +} + +#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ + +static struct kvm *kvm_create_vm(void) +{ + int r, i; + struct kvm *kvm = kvm_arch_alloc_vm(); + + if (!kvm) + return ERR_PTR(-ENOMEM); + + r = kvm_arch_init_vm(kvm); + if (r) + goto out_err_nodisable; + + r = hardware_enable_all(); + if (r) + goto out_err_nodisable; + +#ifdef CONFIG_HAVE_KVM_IRQCHIP + INIT_HLIST_HEAD(&kvm->mask_notifier_list); + INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); +#endif + + r = -ENOMEM; + kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!kvm->memslots) + goto out_err_nosrcu; + if (init_srcu_struct(&kvm->srcu)) + goto out_err_nosrcu; + for (i = 0; i < KVM_NR_BUSES; i++) { + kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), + GFP_KERNEL); + if (!kvm->buses[i]) + goto out_err; + } + + spin_lock_init(&kvm->mmu_lock); + kvm->mm = current->mm; + atomic_inc(&kvm->mm->mm_count); + kvm_eventfd_init(kvm); + mutex_init(&kvm->lock); + mutex_init(&kvm->irq_lock); + mutex_init(&kvm->slots_lock); + atomic_set(&kvm->users_count, 1); + + r = kvm_init_mmu_notifier(kvm); + if (r) + goto out_err; + + raw_spin_lock(&kvm_lock); + list_add(&kvm->vm_list, &vm_list); + raw_spin_unlock(&kvm_lock); + + return kvm; + +out_err: + cleanup_srcu_struct(&kvm->srcu); +out_err_nosrcu: + hardware_disable_all(); +out_err_nodisable: + for (i = 0; i < KVM_NR_BUSES; i++) + kfree(kvm->buses[i]); + kfree(kvm->memslots); + kvm_arch_free_vm(kvm); + return ERR_PTR(r); +} + +static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) +{ + if (!memslot->dirty_bitmap) + return; + + if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE) + vfree(memslot->dirty_bitmap_head); + else + kfree(memslot->dirty_bitmap_head); + + memslot->dirty_bitmap = NULL; + memslot->dirty_bitmap_head = NULL; +} + +/* + * Free any memory in @free but not in @dont. + */ +static void kvm_free_physmem_slot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + int i; + + if (!dont || free->rmap != dont->rmap) + vfree(free->rmap); + + if (!dont || free->dirty_bitmap != dont->dirty_bitmap) + kvm_destroy_dirty_bitmap(free); + + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { + vfree(free->lpage_info[i]); + free->lpage_info[i] = NULL; + } + } + + free->npages = 0; + free->rmap = NULL; +} + +void kvm_free_physmem(struct kvm *kvm) +{ + int i; + struct kvm_memslots *slots = kvm->memslots; + + for (i = 0; i < slots->nmemslots; ++i) + kvm_free_physmem_slot(&slots->memslots[i], NULL); + + kfree(kvm->memslots); +} + +static void kvm_destroy_vm(struct kvm *kvm) +{ + int i; + struct mm_struct *mm = kvm->mm; + + kvm_arch_sync_events(kvm); + raw_spin_lock(&kvm_lock); + list_del(&kvm->vm_list); + raw_spin_unlock(&kvm_lock); + kvm_free_irq_routing(kvm); + for (i = 0; i < KVM_NR_BUSES; i++) + kvm_io_bus_destroy(kvm->buses[i]); + kvm_coalesced_mmio_free(kvm); +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) + mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); +#else + kvm_arch_flush_shadow(kvm); +#endif + kvm_arch_destroy_vm(kvm); + kvm_free_physmem(kvm); + cleanup_srcu_struct(&kvm->srcu); + kvm_arch_free_vm(kvm); + hardware_disable_all(); + mmdrop(mm); +} + +void kvm_get_kvm(struct kvm *kvm) +{ + atomic_inc(&kvm->users_count); +} +EXPORT_SYMBOL_GPL(kvm_get_kvm); + +void kvm_put_kvm(struct kvm *kvm) +{ + if (atomic_dec_and_test(&kvm->users_count)) + kvm_destroy_vm(kvm); +} +EXPORT_SYMBOL_GPL(kvm_put_kvm); + + +static int kvm_vm_release(struct inode *inode, struct file *filp) +{ + struct kvm *kvm = filp->private_data; + + kvm_irqfd_release(kvm); + + kvm_put_kvm(kvm); + return 0; +} + +#ifndef CONFIG_S390 +/* + * Allocation size is twice as large as the actual dirty bitmap size. + * This makes it possible to do double buffering: see x86's + * kvm_vm_ioctl_get_dirty_log(). + */ +static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) +{ + unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); + + if (dirty_bytes > PAGE_SIZE) + memslot->dirty_bitmap = vzalloc(dirty_bytes); + else + memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL); + + if (!memslot->dirty_bitmap) + return -ENOMEM; + + memslot->dirty_bitmap_head = memslot->dirty_bitmap; + return 0; +} +#endif /* !CONFIG_S390 */ + +/* + * Allocate some memory and give it an address in the guest physical address + * space. + * + * Discontiguous memory is allowed, mostly for framebuffers. + * + * Must be called holding mmap_sem for write. + */ +int __kvm_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + int user_alloc) +{ + int r; + gfn_t base_gfn; + unsigned long npages; + unsigned long i; + struct kvm_memory_slot *memslot; + struct kvm_memory_slot old, new; + struct kvm_memslots *slots, *old_memslots; + + r = -EINVAL; + /* General sanity checks */ + if (mem->memory_size & (PAGE_SIZE - 1)) + goto out; + if (mem->guest_phys_addr & (PAGE_SIZE - 1)) + goto out; + /* We can read the guest memory with __xxx_user() later on. */ + if (user_alloc && + ((mem->userspace_addr & (PAGE_SIZE - 1)) || + !access_ok(VERIFY_WRITE, + (void __user *)(unsigned long)mem->userspace_addr, + mem->memory_size))) + goto out; + if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) + goto out; + if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) + goto out; + + memslot = &kvm->memslots->memslots[mem->slot]; + base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; + npages = mem->memory_size >> PAGE_SHIFT; + + r = -EINVAL; + if (npages > KVM_MEM_MAX_NR_PAGES) + goto out; + + if (!npages) + mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; + + new = old = *memslot; + + new.id = mem->slot; + new.base_gfn = base_gfn; + new.npages = npages; + new.flags = mem->flags; + + /* Disallow changing a memory slot's size. */ + r = -EINVAL; + if (npages && old.npages && npages != old.npages) + goto out_free; + + /* Check for overlaps */ + r = -EEXIST; + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *s = &kvm->memslots->memslots[i]; + + if (s == memslot || !s->npages) + continue; + if (!((base_gfn + npages <= s->base_gfn) || + (base_gfn >= s->base_gfn + s->npages))) + goto out_free; + } + + /* Free page dirty bitmap if unneeded */ + if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) + new.dirty_bitmap = NULL; + + r = -ENOMEM; + + /* Allocate if a slot is being created */ +#ifndef CONFIG_S390 + if (npages && !new.rmap) { + new.rmap = vzalloc(npages * sizeof(*new.rmap)); + + if (!new.rmap) + goto out_free; + + new.user_alloc = user_alloc; + new.userspace_addr = mem->userspace_addr; + } + if (!npages) + goto skip_lpage; + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + unsigned long ugfn; + unsigned long j; + int lpages; + int level = i + 2; + + /* Avoid unused variable warning if no large pages */ + (void)level; + + if (new.lpage_info[i]) + continue; + + lpages = 1 + ((base_gfn + npages - 1) + >> KVM_HPAGE_GFN_SHIFT(level)); + lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); + + new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i])); + + if (!new.lpage_info[i]) + goto out_free; + + if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) + new.lpage_info[i][0].write_count = 1; + if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) + new.lpage_info[i][lpages - 1].write_count = 1; + ugfn = new.userspace_addr >> PAGE_SHIFT; + /* + * If the gfn and userspace address are not aligned wrt each + * other, or if explicitly asked to, disable large page + * support for this slot + */ + if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || + !largepages_enabled) + for (j = 0; j < lpages; ++j) + new.lpage_info[i][j].write_count = 1; + } + +skip_lpage: + + /* Allocate page dirty bitmap if needed */ + if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { + if (kvm_create_dirty_bitmap(&new) < 0) + goto out_free; + /* destroy any largepage mappings for dirty tracking */ + } +#else /* not defined CONFIG_S390 */ + new.user_alloc = user_alloc; + if (user_alloc) + new.userspace_addr = mem->userspace_addr; +#endif /* not defined CONFIG_S390 */ + + if (!npages) { + r = -ENOMEM; + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + if (mem->slot >= slots->nmemslots) + slots->nmemslots = mem->slot + 1; + slots->generation++; + slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; + + old_memslots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + synchronize_srcu_expedited(&kvm->srcu); + /* From this point no new shadow pages pointing to a deleted + * memslot will be created. + * + * validation of sp->gfn happens in: + * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) + * - kvm_is_visible_gfn (mmu_check_roots) + */ + kvm_arch_flush_shadow(kvm); + kfree(old_memslots); + } + + r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); + if (r) + goto out_free; + + /* map/unmap the pages in iommu page table */ + if (npages) { + r = kvm_iommu_map_pages(kvm, &new); + if (r) + goto out_free; + } else + kvm_iommu_unmap_pages(kvm, &old); + + r = -ENOMEM; + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + if (mem->slot >= slots->nmemslots) + slots->nmemslots = mem->slot + 1; + slots->generation++; + + /* actual memory is freed via old in kvm_free_physmem_slot below */ + if (!npages) { + new.rmap = NULL; + new.dirty_bitmap = NULL; + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) + new.lpage_info[i] = NULL; + } + + slots->memslots[mem->slot] = new; + old_memslots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + synchronize_srcu_expedited(&kvm->srcu); + + kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); + + kvm_free_physmem_slot(&old, &new); + kfree(old_memslots); + + return 0; + +out_free: + kvm_free_physmem_slot(&new, &old); +out: + return r; + +} +EXPORT_SYMBOL_GPL(__kvm_set_memory_region); + +int kvm_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + int user_alloc) +{ + int r; + + mutex_lock(&kvm->slots_lock); + r = __kvm_set_memory_region(kvm, mem, user_alloc); + mutex_unlock(&kvm->slots_lock); + return r; +} +EXPORT_SYMBOL_GPL(kvm_set_memory_region); + +int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, + struct + kvm_userspace_memory_region *mem, + int user_alloc) +{ + if (mem->slot >= KVM_MEMORY_SLOTS) + return -EINVAL; + return kvm_set_memory_region(kvm, mem, user_alloc); +} + +int kvm_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log, int *is_dirty) +{ + struct kvm_memory_slot *memslot; + int r, i; + unsigned long n; + unsigned long any = 0; + + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots->memslots[log->slot]; + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + + for (i = 0; !any && i < n/sizeof(long); ++i) + any = memslot->dirty_bitmap[i]; + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + goto out; + + if (any) + *is_dirty = 1; + + r = 0; +out: + return r; +} + +void kvm_disable_largepages(void) +{ + largepages_enabled = false; +} +EXPORT_SYMBOL_GPL(kvm_disable_largepages); + +int is_error_page(struct page *page) +{ + return page == bad_page || page == hwpoison_page || page == fault_page; +} +EXPORT_SYMBOL_GPL(is_error_page); + +int is_error_pfn(pfn_t pfn) +{ + return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn; +} +EXPORT_SYMBOL_GPL(is_error_pfn); + +int is_hwpoison_pfn(pfn_t pfn) +{ + return pfn == hwpoison_pfn; +} +EXPORT_SYMBOL_GPL(is_hwpoison_pfn); + +int is_fault_pfn(pfn_t pfn) +{ + return pfn == fault_pfn; +} +EXPORT_SYMBOL_GPL(is_fault_pfn); + +static inline unsigned long bad_hva(void) +{ + return PAGE_OFFSET; +} + +int kvm_is_error_hva(unsigned long addr) +{ + return addr == bad_hva(); +} +EXPORT_SYMBOL_GPL(kvm_is_error_hva); + +static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots, + gfn_t gfn) +{ + int i; + + for (i = 0; i < slots->nmemslots; ++i) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + return memslot; + } + return NULL; +} + +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +{ + return __gfn_to_memslot(kvm_memslots(kvm), gfn); +} +EXPORT_SYMBOL_GPL(gfn_to_memslot); + +int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_memslots *slots = kvm_memslots(kvm); + + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; + + if (memslot->flags & KVM_MEMSLOT_INVALID) + continue; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); + +unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) +{ + struct vm_area_struct *vma; + unsigned long addr, size; + + size = PAGE_SIZE; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return PAGE_SIZE; + + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, addr); + if (!vma) + goto out; + + size = vma_kernel_pagesize(vma); + +out: + up_read(¤t->mm->mmap_sem); + + return size; +} + +static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, + gfn_t *nr_pages) +{ + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return bad_hva(); + + if (nr_pages) + *nr_pages = slot->npages - (gfn - slot->base_gfn); + + return gfn_to_hva_memslot(slot, gfn); +} + +unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) +{ + return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); +} +EXPORT_SYMBOL_GPL(gfn_to_hva); + +static pfn_t get_fault_pfn(void) +{ + get_page(fault_page); + return fault_pfn; +} + +int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int write, struct page **page) +{ + int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET; + + if (write) + flags |= FOLL_WRITE; + + return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); +} + +static inline int check_user_page_hwpoison(unsigned long addr) +{ + int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; + + rc = __get_user_pages(current, current->mm, addr, 1, + flags, NULL, NULL, NULL); + return rc == -EHWPOISON; +} + +static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, + bool *async, bool write_fault, bool *writable) +{ + struct page *page[1]; + int npages = 0; + pfn_t pfn; + + /* we can do it either atomically or asynchronously, not both */ + BUG_ON(atomic && async); + + BUG_ON(!write_fault && !writable); + + if (writable) + *writable = true; + + if (atomic || async) + npages = __get_user_pages_fast(addr, 1, 1, page); + + if (unlikely(npages != 1) && !atomic) { + might_sleep(); + + if (writable) + *writable = write_fault; + + if (async) { + down_read(¤t->mm->mmap_sem); + npages = get_user_page_nowait(current, current->mm, + addr, write_fault, page); + up_read(¤t->mm->mmap_sem); + } else + npages = get_user_pages_fast(addr, 1, write_fault, + page); + + /* map read fault as writable if possible */ + if (unlikely(!write_fault) && npages == 1) { + struct page *wpage[1]; + + npages = __get_user_pages_fast(addr, 1, 1, wpage); + if (npages == 1) { + *writable = true; + put_page(page[0]); + page[0] = wpage[0]; + } + npages = 1; + } + } + + if (unlikely(npages != 1)) { + struct vm_area_struct *vma; + + if (atomic) + return get_fault_pfn(); + + down_read(¤t->mm->mmap_sem); + if (npages == -EHWPOISON || + (!async && check_user_page_hwpoison(addr))) { + up_read(¤t->mm->mmap_sem); + get_page(hwpoison_page); + return page_to_pfn(hwpoison_page); + } + + vma = find_vma_intersection(current->mm, addr, addr+1); + + if (vma == NULL) + pfn = get_fault_pfn(); + else if ((vma->vm_flags & VM_PFNMAP)) { + pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; + BUG_ON(!kvm_is_mmio_pfn(pfn)); + } else { + if (async && (vma->vm_flags & VM_WRITE)) + *async = true; + pfn = get_fault_pfn(); + } + up_read(¤t->mm->mmap_sem); + } else + pfn = page_to_pfn(page[0]); + + return pfn; +} + +pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) +{ + return hva_to_pfn(kvm, addr, true, NULL, true, NULL); +} +EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); + +static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, + bool write_fault, bool *writable) +{ + unsigned long addr; + + if (async) + *async = false; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) { + get_page(bad_page); + return page_to_pfn(bad_page); + } + + return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); +} + +pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) +{ + return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); + +pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, + bool write_fault, bool *writable) +{ + return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_async); + +pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +{ + return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn); + +pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, + bool *writable) +{ + return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); + +pfn_t gfn_to_pfn_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn) +{ + unsigned long addr = gfn_to_hva_memslot(slot, gfn); + return hva_to_pfn(kvm, addr, false, NULL, true, NULL); +} + +int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, + int nr_pages) +{ + unsigned long addr; + gfn_t entry; + + addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry); + if (kvm_is_error_hva(addr)) + return -1; + + if (entry < nr_pages) + return 0; + + return __get_user_pages_fast(addr, nr_pages, 1, pages); +} +EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); + +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) +{ + pfn_t pfn; + + pfn = gfn_to_pfn(kvm, gfn); + if (!kvm_is_mmio_pfn(pfn)) + return pfn_to_page(pfn); + + WARN_ON(kvm_is_mmio_pfn(pfn)); + + get_page(bad_page); + return bad_page; +} + +EXPORT_SYMBOL_GPL(gfn_to_page); + +void kvm_release_page_clean(struct page *page) +{ + kvm_release_pfn_clean(page_to_pfn(page)); +} +EXPORT_SYMBOL_GPL(kvm_release_page_clean); + +void kvm_release_pfn_clean(pfn_t pfn) +{ + if (!kvm_is_mmio_pfn(pfn)) + put_page(pfn_to_page(pfn)); +} +EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); + +void kvm_release_page_dirty(struct page *page) +{ + kvm_release_pfn_dirty(page_to_pfn(page)); +} +EXPORT_SYMBOL_GPL(kvm_release_page_dirty); + +void kvm_release_pfn_dirty(pfn_t pfn) +{ + kvm_set_pfn_dirty(pfn); + kvm_release_pfn_clean(pfn); +} +EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); + +void kvm_set_page_dirty(struct page *page) +{ + kvm_set_pfn_dirty(page_to_pfn(page)); +} +EXPORT_SYMBOL_GPL(kvm_set_page_dirty); + +void kvm_set_pfn_dirty(pfn_t pfn) +{ + if (!kvm_is_mmio_pfn(pfn)) { + struct page *page = pfn_to_page(pfn); + if (!PageReserved(page)) + SetPageDirty(page); + } +} +EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); + +void kvm_set_pfn_accessed(pfn_t pfn) +{ + if (!kvm_is_mmio_pfn(pfn)) + mark_page_accessed(pfn_to_page(pfn)); +} +EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); + +void kvm_get_pfn(pfn_t pfn) +{ + if (!kvm_is_mmio_pfn(pfn)) + get_page(pfn_to_page(pfn)); +} +EXPORT_SYMBOL_GPL(kvm_get_pfn); + +static int next_segment(unsigned long len, int offset) +{ + if (len > PAGE_SIZE - offset) + return PAGE_SIZE - offset; + else + return len; +} + +int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, + int len) +{ + int r; + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return -EFAULT; + r = __copy_from_user(data, (void __user *)addr + offset, len); + if (r) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_read_guest_page); + +int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + int seg; + int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + data += seg; + ++gfn; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_read_guest); + +int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len) +{ + int r; + unsigned long addr; + gfn_t gfn = gpa >> PAGE_SHIFT; + int offset = offset_in_page(gpa); + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return -EFAULT; + pagefault_disable(); + r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); + pagefault_enable(); + if (r) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL(kvm_read_guest_atomic); + +int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, + int offset, int len) +{ + int r; + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return -EFAULT; + r = copy_to_user((void __user *)addr + offset, data, len); + if (r) + return -EFAULT; + mark_page_dirty(kvm, gfn); + return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_guest_page); + +int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, + unsigned long len) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + int seg; + int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + data += seg; + ++gfn; + } + return 0; +} + +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + gpa_t gpa) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + int offset = offset_in_page(gpa); + gfn_t gfn = gpa >> PAGE_SHIFT; + + ghc->gpa = gpa; + ghc->generation = slots->generation; + ghc->memslot = __gfn_to_memslot(slots, gfn); + ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); + if (!kvm_is_error_hva(ghc->hva)) + ghc->hva += offset; + else + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); + +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + int r; + + if (slots->generation != ghc->generation) + kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); + + if (kvm_is_error_hva(ghc->hva)) + return -EFAULT; + + r = copy_to_user((void __user *)ghc->hva, data, len); + if (r) + return -EFAULT; + mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_guest_cached); + +int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) +{ + return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page, + offset, len); +} +EXPORT_SYMBOL_GPL(kvm_clear_guest_page); + +int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + int seg; + int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_clear_guest_page(kvm, gfn, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + ++gfn; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_clear_guest); + +void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, + gfn_t gfn) +{ + if (memslot && memslot->dirty_bitmap) { + unsigned long rel_gfn = gfn - memslot->base_gfn; + + __set_bit_le(rel_gfn, memslot->dirty_bitmap); + } +} + +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *memslot; + + memslot = gfn_to_memslot(kvm, gfn); + mark_page_dirty_in_slot(kvm, memslot, gfn); +} + +/* + * The vCPU has executed a HLT instruction with in-kernel mode enabled. + */ +void kvm_vcpu_block(struct kvm_vcpu *vcpu) +{ + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); + + if (kvm_arch_vcpu_runnable(vcpu)) { + kvm_make_request(KVM_REQ_UNHALT, vcpu); + break; + } + if (kvm_cpu_has_pending_timer(vcpu)) + break; + if (signal_pending(current)) + break; + + schedule(); + } + + finish_wait(&vcpu->wq, &wait); +} + +void kvm_resched(struct kvm_vcpu *vcpu) +{ + if (!need_resched()) + return; + cond_resched(); +} +EXPORT_SYMBOL_GPL(kvm_resched); + +void kvm_vcpu_on_spin(struct kvm_vcpu *me) +{ + struct kvm *kvm = me->kvm; + struct kvm_vcpu *vcpu; + int last_boosted_vcpu = me->kvm->last_boosted_vcpu; + int yielded = 0; + int pass; + int i; + + /* + * We boost the priority of a VCPU that is runnable but not + * currently running, because it got preempted by something + * else and called schedule in __vcpu_run. Hopefully that + * VCPU is holding the lock that we need and will release it. + * We approximate round-robin by starting at the last boosted VCPU. + */ + for (pass = 0; pass < 2 && !yielded; pass++) { + kvm_for_each_vcpu(i, vcpu, kvm) { + struct task_struct *task = NULL; + struct pid *pid; + if (!pass && i < last_boosted_vcpu) { + i = last_boosted_vcpu; + continue; + } else if (pass && i > last_boosted_vcpu) + break; + if (vcpu == me) + continue; + if (waitqueue_active(&vcpu->wq)) + continue; + rcu_read_lock(); + pid = rcu_dereference(vcpu->pid); + if (pid) + task = get_pid_task(vcpu->pid, PIDTYPE_PID); + rcu_read_unlock(); + if (!task) + continue; + if (task->flags & PF_VCPU) { + put_task_struct(task); + continue; + } + if (yield_to(task, 1)) { + put_task_struct(task); + kvm->last_boosted_vcpu = i; + yielded = 1; + break; + } + put_task_struct(task); + } + } +} +EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); + +static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct kvm_vcpu *vcpu = vma->vm_file->private_data; + struct page *page; + + if (vmf->pgoff == 0) + page = virt_to_page(vcpu->run); +#ifdef CONFIG_X86 + else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) + page = virt_to_page(vcpu->arch.pio_data); +#endif +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) + page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); +#endif + else + return VM_FAULT_SIGBUS; + get_page(page); + vmf->page = page; + return 0; +} + +static const struct vm_operations_struct kvm_vcpu_vm_ops = { + .fault = kvm_vcpu_fault, +}; + +static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_vcpu_vm_ops; + return 0; +} + +static int kvm_vcpu_release(struct inode *inode, struct file *filp) +{ + struct kvm_vcpu *vcpu = filp->private_data; + + kvm_put_kvm(vcpu->kvm); + return 0; +} + +static struct file_operations kvm_vcpu_fops = { + .release = kvm_vcpu_release, + .unlocked_ioctl = kvm_vcpu_ioctl, + .compat_ioctl = kvm_vcpu_ioctl, + .mmap = kvm_vcpu_mmap, + .llseek = noop_llseek, +}; + +/* + * Allocates an inode for the vcpu. + */ +static int create_vcpu_fd(struct kvm_vcpu *vcpu) +{ + return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR); +} + +/* + * Creates some virtual cpus. Good luck creating more than one. + */ +static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) +{ + int r; + struct kvm_vcpu *vcpu, *v; + + vcpu = kvm_arch_vcpu_create(kvm, id); + if (IS_ERR(vcpu)) + return PTR_ERR(vcpu); + + preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); + + r = kvm_arch_vcpu_setup(vcpu); + if (r) + return r; + + mutex_lock(&kvm->lock); + if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { + r = -EINVAL; + goto vcpu_destroy; + } + + kvm_for_each_vcpu(r, v, kvm) + if (v->vcpu_id == id) { + r = -EEXIST; + goto vcpu_destroy; + } + + BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); + + /* Now it's all set up, let userspace reach it */ + kvm_get_kvm(kvm); + r = create_vcpu_fd(vcpu); + if (r < 0) { + kvm_put_kvm(kvm); + goto vcpu_destroy; + } + + kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; + smp_wmb(); + atomic_inc(&kvm->online_vcpus); + +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + if (kvm->bsp_vcpu_id == id) + kvm->bsp_vcpu = vcpu; +#endif + mutex_unlock(&kvm->lock); + return r; + +vcpu_destroy: + mutex_unlock(&kvm->lock); + kvm_arch_vcpu_destroy(vcpu); + return r; +} + +static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) +{ + if (sigset) { + sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); + vcpu->sigset_active = 1; + vcpu->sigset = *sigset; + } else + vcpu->sigset_active = 0; + return 0; +} + +static long kvm_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + int r; + struct kvm_fpu *fpu = NULL; + struct kvm_sregs *kvm_sregs = NULL; + + if (vcpu->kvm->mm != current->mm) + return -EIO; + +#if defined(CONFIG_S390) || defined(CONFIG_PPC) + /* + * Special cases: vcpu ioctls that are asynchronous to vcpu execution, + * so vcpu_load() would break it. + */ + if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT) + return kvm_arch_vcpu_ioctl(filp, ioctl, arg); +#endif + + + vcpu_load(vcpu); + switch (ioctl) { + case KVM_RUN: + r = -EINVAL; + if (arg) + goto out; + r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); + trace_kvm_userspace_exit(vcpu->run->exit_reason, r); + break; + case KVM_GET_REGS: { + struct kvm_regs *kvm_regs; + + r = -ENOMEM; + kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); + if (!kvm_regs) + goto out; + r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); + if (r) + goto out_free1; + r = -EFAULT; + if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) + goto out_free1; + r = 0; +out_free1: + kfree(kvm_regs); + break; + } + case KVM_SET_REGS: { + struct kvm_regs *kvm_regs; + + r = -ENOMEM; + kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); + if (!kvm_regs) + goto out; + r = -EFAULT; + if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs))) + goto out_free2; + r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); + if (r) + goto out_free2; + r = 0; +out_free2: + kfree(kvm_regs); + break; + } + case KVM_GET_SREGS: { + kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); + r = -ENOMEM; + if (!kvm_sregs) + goto out; + r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) + goto out; + r = 0; + break; + } + case KVM_SET_SREGS: { + kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); + r = -ENOMEM; + if (!kvm_sregs) + goto out; + r = -EFAULT; + if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) + goto out; + r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); + if (r) + goto out; + r = 0; + break; + } + case KVM_GET_MP_STATE: { + struct kvm_mp_state mp_state; + + r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &mp_state, sizeof mp_state)) + goto out; + r = 0; + break; + } + case KVM_SET_MP_STATE: { + struct kvm_mp_state mp_state; + + r = -EFAULT; + if (copy_from_user(&mp_state, argp, sizeof mp_state)) + goto out; + r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); + if (r) + goto out; + r = 0; + break; + } + case KVM_TRANSLATE: { + struct kvm_translation tr; + + r = -EFAULT; + if (copy_from_user(&tr, argp, sizeof tr)) + goto out; + r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &tr, sizeof tr)) + goto out; + r = 0; + break; + } + case KVM_SET_GUEST_DEBUG: { + struct kvm_guest_debug dbg; + + r = -EFAULT; + if (copy_from_user(&dbg, argp, sizeof dbg)) + goto out; + r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); + if (r) + goto out; + r = 0; + break; + } + case KVM_SET_SIGNAL_MASK: { + struct kvm_signal_mask __user *sigmask_arg = argp; + struct kvm_signal_mask kvm_sigmask; + sigset_t sigset, *p; + + p = NULL; + if (argp) { + r = -EFAULT; + if (copy_from_user(&kvm_sigmask, argp, + sizeof kvm_sigmask)) + goto out; + r = -EINVAL; + if (kvm_sigmask.len != sizeof sigset) + goto out; + r = -EFAULT; + if (copy_from_user(&sigset, sigmask_arg->sigset, + sizeof sigset)) + goto out; + p = &sigset; + } + r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); + break; + } + case KVM_GET_FPU: { + fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); + r = -ENOMEM; + if (!fpu) + goto out; + r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) + goto out; + r = 0; + break; + } + case KVM_SET_FPU: { + fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL); + r = -ENOMEM; + if (!fpu) + goto out; + r = -EFAULT; + if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu))) + goto out; + r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); + if (r) + goto out; + r = 0; + break; + } + default: + r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); + } +out: + vcpu_put(vcpu); + kfree(fpu); + kfree(kvm_sregs); + return r; +} + +static long kvm_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + void __user *argp = (void __user *)arg; + int r; + + if (kvm->mm != current->mm) + return -EIO; + switch (ioctl) { + case KVM_CREATE_VCPU: + r = kvm_vm_ioctl_create_vcpu(kvm, arg); + if (r < 0) + goto out; + break; + case KVM_SET_USER_MEMORY_REGION: { + struct kvm_userspace_memory_region kvm_userspace_mem; + + r = -EFAULT; + if (copy_from_user(&kvm_userspace_mem, argp, + sizeof kvm_userspace_mem)) + goto out; + + r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); + if (r) + goto out; + break; + } + case KVM_GET_DIRTY_LOG: { + struct kvm_dirty_log log; + + r = -EFAULT; + if (copy_from_user(&log, argp, sizeof log)) + goto out; + r = kvm_vm_ioctl_get_dirty_log(kvm, &log); + if (r) + goto out; + break; + } +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + case KVM_REGISTER_COALESCED_MMIO: { + struct kvm_coalesced_mmio_zone zone; + r = -EFAULT; + if (copy_from_user(&zone, argp, sizeof zone)) + goto out; + r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); + if (r) + goto out; + r = 0; + break; + } + case KVM_UNREGISTER_COALESCED_MMIO: { + struct kvm_coalesced_mmio_zone zone; + r = -EFAULT; + if (copy_from_user(&zone, argp, sizeof zone)) + goto out; + r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); + if (r) + goto out; + r = 0; + break; + } +#endif + case KVM_IRQFD: { + struct kvm_irqfd data; + + r = -EFAULT; + if (copy_from_user(&data, argp, sizeof data)) + goto out; + r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags); + break; + } + case KVM_IOEVENTFD: { + struct kvm_ioeventfd data; + + r = -EFAULT; + if (copy_from_user(&data, argp, sizeof data)) + goto out; + r = kvm_ioeventfd(kvm, &data); + break; + } +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + case KVM_SET_BOOT_CPU_ID: + r = 0; + mutex_lock(&kvm->lock); + if (atomic_read(&kvm->online_vcpus) != 0) + r = -EBUSY; + else + kvm->bsp_vcpu_id = arg; + mutex_unlock(&kvm->lock); + break; +#endif + default: + r = kvm_arch_vm_ioctl(filp, ioctl, arg); + if (r == -ENOTTY) + r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); + } +out: + return r; +} + +#ifdef CONFIG_COMPAT +struct compat_kvm_dirty_log { + __u32 slot; + __u32 padding1; + union { + compat_uptr_t dirty_bitmap; /* one bit per page */ + __u64 padding2; + }; +}; + +static long kvm_vm_compat_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + int r; + + if (kvm->mm != current->mm) + return -EIO; + switch (ioctl) { + case KVM_GET_DIRTY_LOG: { + struct compat_kvm_dirty_log compat_log; + struct kvm_dirty_log log; + + r = -EFAULT; + if (copy_from_user(&compat_log, (void __user *)arg, + sizeof(compat_log))) + goto out; + log.slot = compat_log.slot; + log.padding1 = compat_log.padding1; + log.padding2 = compat_log.padding2; + log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); + + r = kvm_vm_ioctl_get_dirty_log(kvm, &log); + if (r) + goto out; + break; + } + default: + r = kvm_vm_ioctl(filp, ioctl, arg); + } + +out: + return r; +} +#endif + +static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page[1]; + unsigned long addr; + int npages; + gfn_t gfn = vmf->pgoff; + struct kvm *kvm = vma->vm_file->private_data; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return VM_FAULT_SIGBUS; + + npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, + NULL); + if (unlikely(npages != 1)) + return VM_FAULT_SIGBUS; + + vmf->page = page[0]; + return 0; +} + +static const struct vm_operations_struct kvm_vm_vm_ops = { + .fault = kvm_vm_fault, +}; + +static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_vm_vm_ops; + return 0; +} + +static struct file_operations kvm_vm_fops = { + .release = kvm_vm_release, + .unlocked_ioctl = kvm_vm_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = kvm_vm_compat_ioctl, +#endif + .mmap = kvm_vm_mmap, + .llseek = noop_llseek, +}; + +static int kvm_dev_ioctl_create_vm(void) +{ + int r; + struct kvm *kvm; + + kvm = kvm_create_vm(); + if (IS_ERR(kvm)) + return PTR_ERR(kvm); +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + r = kvm_coalesced_mmio_init(kvm); + if (r < 0) { + kvm_put_kvm(kvm); + return r; + } +#endif + r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); + if (r < 0) + kvm_put_kvm(kvm); + + return r; +} + +static long kvm_dev_ioctl_check_extension_generic(long arg) +{ + switch (arg) { + case KVM_CAP_USER_MEMORY: + case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: + case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + case KVM_CAP_SET_BOOT_CPU_ID: +#endif + case KVM_CAP_INTERNAL_ERROR_DATA: + return 1; +#ifdef CONFIG_HAVE_KVM_IRQCHIP + case KVM_CAP_IRQ_ROUTING: + return KVM_MAX_IRQ_ROUTES; +#endif + default: + break; + } + return kvm_dev_ioctl_check_extension(arg); +} + +static long kvm_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + long r = -EINVAL; + + switch (ioctl) { + case KVM_GET_API_VERSION: + r = -EINVAL; + if (arg) + goto out; + r = KVM_API_VERSION; + break; + case KVM_CREATE_VM: + r = -EINVAL; + if (arg) + goto out; + r = kvm_dev_ioctl_create_vm(); + break; + case KVM_CHECK_EXTENSION: + r = kvm_dev_ioctl_check_extension_generic(arg); + break; + case KVM_GET_VCPU_MMAP_SIZE: + r = -EINVAL; + if (arg) + goto out; + r = PAGE_SIZE; /* struct kvm_run */ +#ifdef CONFIG_X86 + r += PAGE_SIZE; /* pio data page */ +#endif +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + r += PAGE_SIZE; /* coalesced mmio ring page */ +#endif + break; + case KVM_TRACE_ENABLE: + case KVM_TRACE_PAUSE: + case KVM_TRACE_DISABLE: + r = -EOPNOTSUPP; + break; + default: + return kvm_arch_dev_ioctl(filp, ioctl, arg); + } +out: + return r; +} + +static struct file_operations kvm_chardev_ops = { + .unlocked_ioctl = kvm_dev_ioctl, + .compat_ioctl = kvm_dev_ioctl, + .llseek = noop_llseek, +}; + +static struct miscdevice kvm_dev = { + KVM_MINOR, + "kvm", + &kvm_chardev_ops, +}; + +static void hardware_enable_nolock(void *junk) +{ + int cpu = raw_smp_processor_id(); + int r; + + if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) + return; + + cpumask_set_cpu(cpu, cpus_hardware_enabled); + + r = kvm_arch_hardware_enable(NULL); + + if (r) { + cpumask_clear_cpu(cpu, cpus_hardware_enabled); + atomic_inc(&hardware_enable_failed); + printk(KERN_INFO "kvm: enabling virtualization on " + "CPU%d failed\n", cpu); + } +} + +static void hardware_enable(void *junk) +{ + raw_spin_lock(&kvm_lock); + hardware_enable_nolock(junk); + raw_spin_unlock(&kvm_lock); +} + +static void hardware_disable_nolock(void *junk) +{ + int cpu = raw_smp_processor_id(); + + if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) + return; + cpumask_clear_cpu(cpu, cpus_hardware_enabled); + kvm_arch_hardware_disable(NULL); +} + +static void hardware_disable(void *junk) +{ + raw_spin_lock(&kvm_lock); + hardware_disable_nolock(junk); + raw_spin_unlock(&kvm_lock); +} + +static void hardware_disable_all_nolock(void) +{ + BUG_ON(!kvm_usage_count); + + kvm_usage_count--; + if (!kvm_usage_count) + on_each_cpu(hardware_disable_nolock, NULL, 1); +} + +static void hardware_disable_all(void) +{ + raw_spin_lock(&kvm_lock); + hardware_disable_all_nolock(); + raw_spin_unlock(&kvm_lock); +} + +static int hardware_enable_all(void) +{ + int r = 0; + + raw_spin_lock(&kvm_lock); + + kvm_usage_count++; + if (kvm_usage_count == 1) { + atomic_set(&hardware_enable_failed, 0); + on_each_cpu(hardware_enable_nolock, NULL, 1); + + if (atomic_read(&hardware_enable_failed)) { + hardware_disable_all_nolock(); + r = -EBUSY; + } + } + + raw_spin_unlock(&kvm_lock); + + return r; +} + +static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, + void *v) +{ + int cpu = (long)v; + + if (!kvm_usage_count) + return NOTIFY_OK; + + val &= ~CPU_TASKS_FROZEN; + switch (val) { + case CPU_DYING: + printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", + cpu); + hardware_disable(NULL); + break; + case CPU_STARTING: + printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", + cpu); + hardware_enable(NULL); + break; + } + return NOTIFY_OK; +} + + +asmlinkage void kvm_spurious_fault(void) +{ + /* Fault while not rebooting. We want the trace. */ + BUG(); +} +EXPORT_SYMBOL_GPL(kvm_spurious_fault); + +static int kvm_reboot(struct notifier_block *notifier, unsigned long val, + void *v) +{ + /* + * Some (well, at least mine) BIOSes hang on reboot if + * in vmx root mode. + * + * And Intel TXT required VMX off for all cpu when system shutdown. + */ + printk(KERN_INFO "kvm: exiting hardware virtualization\n"); + kvm_rebooting = true; + on_each_cpu(hardware_disable_nolock, NULL, 1); + return NOTIFY_OK; +} + +static struct notifier_block kvm_reboot_notifier = { + .notifier_call = kvm_reboot, + .priority = 0, +}; + +static void kvm_io_bus_destroy(struct kvm_io_bus *bus) +{ + int i; + + for (i = 0; i < bus->dev_count; i++) { + struct kvm_io_device *pos = bus->devs[i]; + + kvm_iodevice_destructor(pos); + } + kfree(bus); +} + +/* kvm_io_bus_write - called under kvm->slots_lock */ +int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, + int len, const void *val) +{ + int i; + struct kvm_io_bus *bus; + + bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + for (i = 0; i < bus->dev_count; i++) + if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) + return 0; + return -EOPNOTSUPP; +} + +/* kvm_io_bus_read - called under kvm->slots_lock */ +int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, + int len, void *val) +{ + int i; + struct kvm_io_bus *bus; + + bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + for (i = 0; i < bus->dev_count; i++) + if (!kvm_iodevice_read(bus->devs[i], addr, len, val)) + return 0; + return -EOPNOTSUPP; +} + +/* Caller must hold slots_lock. */ +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) +{ + struct kvm_io_bus *new_bus, *bus; + + bus = kvm->buses[bus_idx]; + if (bus->dev_count > NR_IOBUS_DEVS-1) + return -ENOSPC; + + new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); + if (!new_bus) + return -ENOMEM; + memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); + new_bus->devs[new_bus->dev_count++] = dev; + rcu_assign_pointer(kvm->buses[bus_idx], new_bus); + synchronize_srcu_expedited(&kvm->srcu); + kfree(bus); + + return 0; +} + +/* Caller must hold slots_lock. */ +int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) +{ + int i, r; + struct kvm_io_bus *new_bus, *bus; + + new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); + if (!new_bus) + return -ENOMEM; + + bus = kvm->buses[bus_idx]; + memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); + + r = -ENOENT; + for (i = 0; i < new_bus->dev_count; i++) + if (new_bus->devs[i] == dev) { + r = 0; + new_bus->devs[i] = new_bus->devs[--new_bus->dev_count]; + break; + } + + if (r) { + kfree(new_bus); + return r; + } + + rcu_assign_pointer(kvm->buses[bus_idx], new_bus); + synchronize_srcu_expedited(&kvm->srcu); + kfree(bus); + return r; +} + +static struct notifier_block kvm_cpu_notifier = { + .notifier_call = kvm_cpu_hotplug, +}; + +static int vm_stat_get(void *_offset, u64 *val) +{ + unsigned offset = (long)_offset; + struct kvm *kvm; + + *val = 0; + raw_spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + *val += *(u32 *)((void *)kvm + offset); + raw_spin_unlock(&kvm_lock); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n"); + +static int vcpu_stat_get(void *_offset, u64 *val) +{ + unsigned offset = (long)_offset; + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i; + + *val = 0; + raw_spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_for_each_vcpu(i, vcpu, kvm) + *val += *(u32 *)((void *)vcpu + offset); + + raw_spin_unlock(&kvm_lock); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n"); + +static const struct file_operations *stat_fops[] = { + [KVM_STAT_VCPU] = &vcpu_stat_fops, + [KVM_STAT_VM] = &vm_stat_fops, +}; + +static void kvm_init_debug(void) +{ + struct kvm_stats_debugfs_item *p; + + kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); + for (p = debugfs_entries; p->name; ++p) + p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir, + (void *)(long)p->offset, + stat_fops[p->kind]); +} + +static void kvm_exit_debug(void) +{ + struct kvm_stats_debugfs_item *p; + + for (p = debugfs_entries; p->name; ++p) + debugfs_remove(p->dentry); + debugfs_remove(kvm_debugfs_dir); +} + +static int kvm_suspend(void) +{ + if (kvm_usage_count) + hardware_disable_nolock(NULL); + return 0; +} + +static void kvm_resume(void) +{ + if (kvm_usage_count) { + WARN_ON(raw_spin_is_locked(&kvm_lock)); + hardware_enable_nolock(NULL); + } +} + +static struct syscore_ops kvm_syscore_ops = { + .suspend = kvm_suspend, + .resume = kvm_resume, +}; + +struct page *bad_page; +pfn_t bad_pfn; + +static inline +struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) +{ + return container_of(pn, struct kvm_vcpu, preempt_notifier); +} + +static void kvm_sched_in(struct preempt_notifier *pn, int cpu) +{ + struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + + kvm_arch_vcpu_load(vcpu, cpu); +} + +static void kvm_sched_out(struct preempt_notifier *pn, + struct task_struct *next) +{ + struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + + kvm_arch_vcpu_put(vcpu); +} + +int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, + struct module *module) +{ + int r; + int cpu; + + r = kvm_arch_init(opaque); + if (r) + goto out_fail; + + bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + + if (bad_page == NULL) { + r = -ENOMEM; + goto out; + } + + bad_pfn = page_to_pfn(bad_page); + + hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + + if (hwpoison_page == NULL) { + r = -ENOMEM; + goto out_free_0; + } + + hwpoison_pfn = page_to_pfn(hwpoison_page); + + fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + + if (fault_page == NULL) { + r = -ENOMEM; + goto out_free_0; + } + + fault_pfn = page_to_pfn(fault_page); + + if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { + r = -ENOMEM; + goto out_free_0; + } + + r = kvm_arch_hardware_setup(); + if (r < 0) + goto out_free_0a; + + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, + kvm_arch_check_processor_compat, + &r, 1); + if (r < 0) + goto out_free_1; + } + + r = register_cpu_notifier(&kvm_cpu_notifier); + if (r) + goto out_free_2; + register_reboot_notifier(&kvm_reboot_notifier); + + /* A kmem cache lets us meet the alignment requirements of fx_save. */ + if (!vcpu_align) + vcpu_align = __alignof__(struct kvm_vcpu); + kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align, + 0, NULL); + if (!kvm_vcpu_cache) { + r = -ENOMEM; + goto out_free_3; + } + + r = kvm_async_pf_init(); + if (r) + goto out_free; + + kvm_chardev_ops.owner = module; + kvm_vm_fops.owner = module; + kvm_vcpu_fops.owner = module; + + r = misc_register(&kvm_dev); + if (r) { + printk(KERN_ERR "kvm: misc device register failed\n"); + goto out_unreg; + } + + register_syscore_ops(&kvm_syscore_ops); + + kvm_preempt_ops.sched_in = kvm_sched_in; + kvm_preempt_ops.sched_out = kvm_sched_out; + + kvm_init_debug(); + + return 0; + +out_unreg: + kvm_async_pf_deinit(); +out_free: + kmem_cache_destroy(kvm_vcpu_cache); +out_free_3: + unregister_reboot_notifier(&kvm_reboot_notifier); + unregister_cpu_notifier(&kvm_cpu_notifier); +out_free_2: +out_free_1: + kvm_arch_hardware_unsetup(); +out_free_0a: + free_cpumask_var(cpus_hardware_enabled); +out_free_0: + if (fault_page) + __free_page(fault_page); + if (hwpoison_page) + __free_page(hwpoison_page); + __free_page(bad_page); +out: + kvm_arch_exit(); +out_fail: + return r; +} +EXPORT_SYMBOL_GPL(kvm_init); + +void kvm_exit(void) +{ + kvm_exit_debug(); + misc_deregister(&kvm_dev); + kmem_cache_destroy(kvm_vcpu_cache); + kvm_async_pf_deinit(); + unregister_syscore_ops(&kvm_syscore_ops); + unregister_reboot_notifier(&kvm_reboot_notifier); + unregister_cpu_notifier(&kvm_cpu_notifier); + on_each_cpu(hardware_disable_nolock, NULL, 1); + kvm_arch_hardware_unsetup(); + kvm_arch_exit(); + free_cpumask_var(cpus_hardware_enabled); + __free_page(hwpoison_page); + __free_page(bad_page); +} +EXPORT_SYMBOL_GPL(kvm_exit); -- cgit v1.2.3