/*
 * Kernel-based Virtual Machine - device assignment support
 *
 * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/fs.h>
#include "irq.h"

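/*
 * Look up an assigned device by its user-supplied id on the per-VM list.
 * Returns NULL if no entry matches.  Callers hold kvm->lock.
 */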
static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
						      int assigned_dev_id)
{
	struct list_head *ptr;
	struct kvm_assigned_dev_kernel *match;

	list_for_each(ptr, head) {
		match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
		if (match->assigned_dev_id == assigned_dev_id)
			return match;
	}
	return NULL;
}

static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
				    *assigned_dev, int irq)
{
	int i, index = -1;
	struct msix_entry *host_msix_entries;

	host_msix_entries = assigned_dev->host_msix_entries;
	for (i = 0; i < assigned_dev->entries_nr; i++)
		if (irq == host_msix_entries[i].vector) {
			index = i;
			break;
		}
	if (index < 0)
		printk(KERN_WARNING "Failed to find correlated MSI-X entry!\n");

	return index;
}

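/*
 * Threaded handler for an assigned device's host interrupt.  For INTx we
 * mask the (level-triggered) line until the guest acks it, then inject
 * the interrupt into the guest.
 */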
static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;

	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
		spin_lock(&assigned_dev->intx_lock);
		disable_irq_nosync(irq);
		assigned_dev->host_irq_disabled = true;
		spin_unlock(&assigned_dev->intx_lock);
	}

	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
		    assigned_dev->guest_irq, 1);

	return IRQ_HANDLED;
}

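/* MSI-X variant: map the host vector back to its table entry and inject
 * the corresponding guest vector. */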
#ifdef __KVM_HAVE_MSIX
static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
	int index = find_index_from_host_irq(assigned_dev, irq);
	u32 vector;

	if (index >= 0) {
		vector = assigned_dev->guest_msix_entries[index].vector;
		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
			    vector, 1);
	}
	return IRQ_HANDLED;
}
#endif

/* Ack the irq line for an assigned device */
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
{
	struct kvm_assigned_dev_kernel *dev =
		container_of(kian, struct kvm_assigned_dev_kernel,
			     ack_notifier);

	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);

	/* The guest irq may be shared so this ack may be
	 * from another device.
	 */
	spin_lock(&dev->intx_lock);
	if (dev->host_irq_disabled) {
		enable_irq(dev->host_irq);
		dev->host_irq_disabled = false;
	}
	spin_unlock(&dev->intx_lock);
}

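/*
 * Tear down the guest side of an IRQ assignment: unregister the ack
 * notifier, lower the line, and release the irq source id.
 */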
static void deassign_guest_irq(struct kvm *kvm,
			       struct kvm_assigned_dev_kernel *assigned_dev)
{
	if (assigned_dev->ack_notifier.gsi != -1)
		kvm_unregister_irq_ack_notifier(kvm,
						&assigned_dev->ack_notifier);

	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
		    assigned_dev->guest_irq, 0);

	if (assigned_dev->irq_source_id != -1)
		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
	assigned_dev->irq_source_id = -1;
	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
}

/* The function is called with kvm->lock held; free_irq() below waits for
 * any still-running handler to complete. */
static void deassign_host_irq(struct kvm *kvm,
			      struct kvm_assigned_dev_kernel *assigned_dev)
{
	/*
	 * We disable the irq here to prevent further events.
	 *
	 * Note this may result in a nested disable if the interrupt type is
	 * INTx, but that's OK since we are going to free it anyway.
	 *
	 * If this function is part of VM destruction, ensure that the kvm
	 * state is still valid at this point, since we may also have to wait
	 * for a currently running IRQ handler.
	 */
	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
		int i;
		for (i = 0; i < assigned_dev->entries_nr; i++)
			disable_irq(assigned_dev->host_msix_entries[i].vector);

		for (i = 0; i < assigned_dev->entries_nr; i++)
			free_irq(assigned_dev->host_msix_entries[i].vector,
				 assigned_dev);

		assigned_dev->entries_nr = 0;
		kfree(assigned_dev->host_msix_entries);
		kfree(assigned_dev->guest_msix_entries);
		pci_disable_msix(assigned_dev->dev);
	} else {
		/* Deal with MSI and INTx */
		disable_irq(assigned_dev->host_irq);

		free_irq(assigned_dev->host_irq, assigned_dev);

		if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
			pci_disable_msi(assigned_dev->dev);
	}

	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
}

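/*
 * Deassign the host and/or guest side of an IRQ assignment, as selected
 * by the type mask in @irq_requested_type.
 */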
static int kvm_deassign_irq(struct kvm *kvm,
			    struct kvm_assigned_dev_kernel *assigned_dev,
			    unsigned long irq_requested_type)
{
	unsigned long guest_irq_type, host_irq_type;

	if (!irqchip_in_kernel(kvm))
		return -EINVAL;
	/* no irq assignment to deassign */
	if (!assigned_dev->irq_requested_type)
		return -ENXIO;

	host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
	guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;

	if (host_irq_type)
		deassign_host_irq(kvm, assigned_dev);
	if (guest_irq_type)
		deassign_guest_irq(kvm, assigned_dev);

	return 0;
}

static void kvm_free_assigned_irq(struct kvm *kvm,
				  struct kvm_assigned_dev_kernel *assigned_dev)
{
	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
}

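/*
 * Fully release an assigned device: free its IRQs, restore the saved
 * PCI state, and hand the device back to the host.
 */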
static void kvm_free_assigned_device(struct kvm *kvm,
				     struct kvm_assigned_dev_kernel
				     *assigned_dev)
{
	kvm_free_assigned_irq(kvm, assigned_dev);

	pci_reset_function(assigned_dev->dev);
	if (pci_load_and_free_saved_state(assigned_dev->dev,
					  &assigned_dev->pci_saved_state))
		printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
		       __func__, dev_name(&assigned_dev->dev->dev));
	else
		pci_restore_state(assigned_dev->dev);

	assigned_dev->dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED;

	pci_release_regions(assigned_dev->dev);
	pci_disable_device(assigned_dev->dev);
	pci_dev_put(assigned_dev->dev);

	list_del(&assigned_dev->list);
	kfree(assigned_dev);
}

void kvm_free_all_assigned_devices(struct kvm *kvm)
{
	struct list_head *ptr, *ptr2;
	struct kvm_assigned_dev_kernel *assigned_dev;

	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
		assigned_dev = list_entry(ptr,
					  struct kvm_assigned_dev_kernel,
					  list);

		kvm_free_assigned_device(kvm, assigned_dev);
	}
}

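/* Request the device's legacy INTx line as an exclusive threaded irq. */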
static int assigned_device_enable_host_intx(struct kvm *kvm,
					    struct kvm_assigned_dev_kernel *dev)
{
	dev->host_irq = dev->dev->irq;
	/* Even though this is PCI, we don't want to use shared
	 * interrupts. Sharing host devices with guest-assigned devices
	 * on the same interrupt line is not a happy situation: there
	 * are going to be long delays in accepting, acking, etc.
	 */
	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
				 IRQF_ONESHOT, dev->irq_name, dev))
		return -EIO;
	return 0;
}

#ifdef __KVM_HAVE_MSI
static int assigned_device_enable_host_msi(struct kvm *kvm,
					   struct kvm_assigned_dev_kernel *dev)
{
	int r;

	if (!dev->dev->msi_enabled) {
		r = pci_enable_msi(dev->dev);
		if (r)
			return r;
	}

	dev->host_irq = dev->dev->irq;
	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
				 0, dev->irq_name, dev)) {
		pci_disable_msi(dev->dev);
		return -EIO;
	}

	return 0;
}
#endif

#ifdef __KVM_HAVE_MSIX
static int assigned_device_enable_host_msix(struct kvm *kvm,
					    struct kvm_assigned_dev_kernel *dev)
{
	int i, r = -EINVAL;

	/* host_msix_entries and guest_msix_entries should have been
	 * initialized */
	if (dev->entries_nr == 0)
		return r;

	r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
	if (r)
		return r;

	for (i = 0; i < dev->entries_nr; i++) {
		r = request_threaded_irq(dev->host_msix_entries[i].vector,
					 NULL, kvm_assigned_dev_thread_msix,
					 0, dev->irq_name, dev);
		if (r)
			goto err;
	}

	return 0;
err:
	for (i -= 1; i >= 0; i--)
		free_irq(dev->host_msix_entries[i].vector, dev);
	pci_disable_msix(dev->dev);
	return r;
}
#endif

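/*
 * The enable_guest_* helpers below only record how interrupts are to be
 * routed into the guest; the host-side setup is done separately above.
 */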
static int assigned_device_enable_guest_intx(struct kvm *kvm,
				struct kvm_assigned_dev_kernel *dev,
				struct kvm_assigned_irq *irq)
{
	dev->guest_irq = irq->guest_irq;
	dev->ack_notifier.gsi = irq->guest_irq;
	return 0;
}

#ifdef __KVM_HAVE_MSI
static int assigned_device_enable_guest_msi(struct kvm *kvm,
			struct kvm_assigned_dev_kernel *dev,
			struct kvm_assigned_irq *irq)
{
	dev->guest_irq = irq->guest_irq;
	dev->ack_notifier.gsi = -1;
	dev->host_irq_disabled = false;
	return 0;
}
#endif

#ifdef __KVM_HAVE_MSIX
static int assigned_device_enable_guest_msix(struct kvm *kvm,
			struct kvm_assigned_dev_kernel *dev,
			struct kvm_assigned_irq *irq)
{
	dev->guest_irq = irq->guest_irq;
	dev->ack_notifier.gsi = -1;
	dev->host_irq_disabled = false;
	return 0;
}
#endif

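/* Set up the host side of an IRQ assignment for the requested type. */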
static int assign_host_irq(struct kvm *kvm,
			   struct kvm_assigned_dev_kernel *dev,
			   __u32 host_irq_type)
{
	int r = -EEXIST;

	if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
		return r;

	snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
		 pci_name(dev->dev));

	switch (host_irq_type) {
	case KVM_DEV_IRQ_HOST_INTX:
		r = assigned_device_enable_host_intx(kvm, dev);
		break;
#ifdef __KVM_HAVE_MSI
	case KVM_DEV_IRQ_HOST_MSI:
		r = assigned_device_enable_host_msi(kvm, dev);
		break;
#endif
#ifdef __KVM_HAVE_MSIX
	case KVM_DEV_IRQ_HOST_MSIX:
		r = assigned_device_enable_host_msix(kvm, dev);
		break;
#endif
	default:
		r = -EINVAL;
	}

	if (!r)
		dev->irq_requested_type |= host_irq_type;

	return r;
}

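/* Set up the guest side of an IRQ assignment; an ack notifier is only
 * registered for level-triggered (INTx) interrupts. */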
static int assign_guest_irq(struct kvm *kvm,
			    struct kvm_assigned_dev_kernel *dev,
			    struct kvm_assigned_irq *irq,
			    unsigned long guest_irq_type)
{
	int id;
	int r = -EEXIST;

	if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
		return r;

	id = kvm_request_irq_source_id(kvm);
	if (id < 0)
		return id;

	dev->irq_source_id = id;

	switch (guest_irq_type) {
	case KVM_DEV_IRQ_GUEST_INTX:
		r = assigned_device_enable_guest_intx(kvm, dev, irq);
		break;
#ifdef __KVM_HAVE_MSI
	case KVM_DEV_IRQ_GUEST_MSI:
		r = assigned_device_enable_guest_msi(kvm, dev, irq);
		break;
#endif
#ifdef __KVM_HAVE_MSIX
	case KVM_DEV_IRQ_GUEST_MSIX:
		r = assigned_device_enable_guest_msix(kvm, dev, irq);
		break;
#endif
	default:
		r = -EINVAL;
	}

	if (!r) {
		dev->irq_requested_type |= guest_irq_type;
		if (dev->ack_notifier.gsi != -1)
			kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
	} else
		kvm_free_irq_source_id(kvm, dev->irq_source_id);

	return r;
}

/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
				   struct kvm_assigned_irq *assigned_irq)
{
	int r = -EINVAL;
	struct kvm_assigned_dev_kernel *match;
	unsigned long host_irq_type, guest_irq_type;

	if (!irqchip_in_kernel(kvm))
		return r;

	mutex_lock(&kvm->lock);
	r = -ENODEV;
	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_irq->assigned_dev_id);
	if (!match)
		goto out;

	host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
	guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);

	r = -EINVAL;
	/* can only assign one type at a time */
	if (hweight_long(host_irq_type) > 1)
		goto out;
	if (hweight_long(guest_irq_type) > 1)
		goto out;
	if (host_irq_type == 0 && guest_irq_type == 0)
		goto out;

	r = 0;
	if (host_irq_type)
		r = assign_host_irq(kvm, match, host_irq_type);
	if (r)
		goto out;

	if (guest_irq_type)
		r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
out:
	mutex_unlock(&kvm->lock);
	return r;
}

static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
					 struct kvm_assigned_irq
					 *assigned_irq)
{
	int r = -ENODEV;
	struct kvm_assigned_dev_kernel *match;

	mutex_lock(&kvm->lock);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_irq->assigned_dev_id);
	if (!match)
		goto out;

	r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
out:
	mutex_unlock(&kvm->lock);
	return r;
}

/*
 * We want to test whether the caller has been granted permissions to
 * use this device.  To be able to configure and control the device,
 * the user needs access to PCI configuration space and BAR resources.
 * These are accessed through PCI sysfs.  PCI config space is often
 * passed to the process calling this ioctl via file descriptor, so we
 * can't rely on access to that file.  We can check for permissions
 * on each of the BAR resource files, which is a pretty clear
 * indicator that the user has been granted access to the device.
 */
static int probe_sysfs_permissions(struct pci_dev *dev)
{
#ifdef CONFIG_SYSFS
	int i;
	bool bar_found = false;

	for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) {
		char *kpath, *syspath;
		struct path path;
		struct inode *inode;
		int r;

		if (!pci_resource_len(dev, i))
			continue;

		kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
		if (!kpath)
			return -ENOMEM;

		/* Per sysfs-rules, sysfs is always at /sys */
		syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i);
		kfree(kpath);
		if (!syspath)
			return -ENOMEM;

		r = kern_path(syspath, LOOKUP_FOLLOW, &path);
		kfree(syspath);
		if (r)
			return r;

		inode = path.dentry->d_inode;

		r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS);
		path_put(&path);
		if (r)
			return r;

		bar_found = true;
	}

	/* If no resources, probably something special */
	if (!bar_found)
		return -EPERM;

	return 0;
#else
	return -EINVAL; /* No way to control the device without sysfs */
#endif
}

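/*
 * KVM_ASSIGN_PCI_DEVICE: validate and claim a host PCI device, save its
 * config state, and attach it to the VM's IOMMU domain.
 */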
static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
				      struct kvm_assigned_pci_dev *assigned_dev)
{
	int r = 0, idx;
	struct kvm_assigned_dev_kernel *match;
	struct pci_dev *dev;
	u8 header_type;

	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
		return -EINVAL;

	mutex_lock(&kvm->lock);
	idx = srcu_read_lock(&kvm->srcu);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_dev->assigned_dev_id);
	if (match) {
		/* device already assigned */
		r = -EEXIST;
		goto out;
	}

	match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
	if (match == NULL) {
		printk(KERN_INFO "%s: Couldn't allocate memory\n",
		       __func__);
		r = -ENOMEM;
		goto out;
	}
	dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
					  assigned_dev->busnr,
					  assigned_dev->devfn);
	if (!dev) {
		printk(KERN_INFO "%s: host device not found\n", __func__);
		r = -EINVAL;
		goto out_free;
	}

	/* Don't allow bridges to be assigned */
	pci_read_config_byte(dev, PCI_HEADER_TYPE, &header_type);
	if ((header_type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL) {
		r = -EPERM;
		goto out_put;
	}

	r = probe_sysfs_permissions(dev);
	if (r)
		goto out_put;

	if (pci_enable_device(dev)) {
		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
		r = -EBUSY;
		goto out_put;
	}
	r = pci_request_regions(dev, "kvm_assigned_device");
	if (r) {
		printk(KERN_INFO "%s: Could not get access to device regions\n",
		       __func__);
		goto out_disable;
	}

	pci_reset_function(dev);
	pci_save_state(dev);
	match->pci_saved_state = pci_store_saved_state(dev);
	if (!match->pci_saved_state)
		printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
		       __func__, dev_name(&dev->dev));
	match->assigned_dev_id = assigned_dev->assigned_dev_id;
	match->host_segnr = assigned_dev->segnr;
	match->host_busnr = assigned_dev->busnr;
	match->host_devfn = assigned_dev->devfn;
	match->flags = assigned_dev->flags;
	match->dev = dev;
	spin_lock_init(&match->intx_lock);
	match->irq_source_id = -1;
	match->kvm = kvm;
	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;

	list_add(&match->list, &kvm->arch.assigned_dev_head);

	if (!kvm->arch.iommu_domain) {
		r = kvm_iommu_map_guest(kvm);
		if (r)
			goto out_list_del;
	}
	r = kvm_assign_device(kvm, match);
	if (r)
		goto out_list_del;

out:
	srcu_read_unlock(&kvm->srcu, idx);
	mutex_unlock(&kvm->lock);
	return r;
out_list_del:
	if (pci_load_and_free_saved_state(dev, &match->pci_saved_state))
		printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
		       __func__, dev_name(&dev->dev));
	list_del(&match->list);
	pci_release_regions(dev);
out_disable:
	pci_disable_device(dev);
out_put:
	pci_dev_put(dev);
out_free:
	kfree(match);
	srcu_read_unlock(&kvm->srcu, idx);
	mutex_unlock(&kvm->lock);
	return r;
}

static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
		struct kvm_assigned_pci_dev *assigned_dev)
{
	int r = 0;
	struct kvm_assigned_dev_kernel *match;

	mutex_lock(&kvm->lock);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_dev->assigned_dev_id);
	if (!match) {
		printk(KERN_INFO "%s: device hasn't been assigned before, "
		       "so cannot be deassigned\n", __func__);
		r = -EINVAL;
		goto out;
	}

	kvm_deassign_device(kvm, match);

	kvm_free_assigned_device(kvm, match);
out:
	mutex_unlock(&kvm->lock);
	return r;
}

#ifdef __KVM_HAVE_MSIX
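/*
 * KVM_ASSIGN_SET_MSIX_NR: size the MSI-X entry arrays for a device.  The
 * number of entries can only be set once, and must not exceed
 * KVM_MAX_MSIX_PER_DEV.
 */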
static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
				    struct kvm_assigned_msix_nr *entry_nr)
{
	int r = 0;
	struct kvm_assigned_dev_kernel *adev;

	mutex_lock(&kvm->lock);

	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				     entry_nr->assigned_dev_id);
	if (!adev) {
		r = -EINVAL;
		goto msix_nr_out;
	}

	if (adev->entries_nr == 0) {
		adev->entries_nr = entry_nr->entry_nr;
		if (adev->entries_nr == 0 ||
		    adev->entries_nr > KVM_MAX_MSIX_PER_DEV) {
			r = -EINVAL;
			goto msix_nr_out;
		}

		adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
						  entry_nr->entry_nr,
						  GFP_KERNEL);
		if (!adev->host_msix_entries) {
			r = -ENOMEM;
			goto msix_nr_out;
		}
		adev->guest_msix_entries =
			kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
				GFP_KERNEL);
		if (!adev->guest_msix_entries) {
			kfree(adev->host_msix_entries);
			r = -ENOMEM;
			goto msix_nr_out;
		}
	} else /* Not allowed to set the MSI-X number twice */
		r = -EINVAL;
msix_nr_out:
	mutex_unlock(&kvm->lock);
	return r;
}

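/*
 * KVM_ASSIGN_SET_MSIX_ENTRY: bind one MSI-X table entry to a guest GSI,
 * reusing a free slot or updating the slot that already holds the entry.
 */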
static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
				       struct kvm_assigned_msix_entry *entry)
{
	int r = 0, i;
	struct kvm_assigned_dev_kernel *adev;

	mutex_lock(&kvm->lock);

	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				     entry->assigned_dev_id);
	if (!adev) {
		r = -EINVAL;
		goto msix_entry_out;
	}

	for (i = 0; i < adev->entries_nr; i++)
		if (adev->guest_msix_entries[i].vector == 0 ||
		    adev->guest_msix_entries[i].entry == entry->entry) {
			adev->guest_msix_entries[i].entry = entry->entry;
			adev->guest_msix_entries[i].vector = entry->gsi;
			adev->host_msix_entries[i].entry = entry->entry;
			break;
		}
	if (i == adev->entries_nr) {
		r = -ENOSPC;
		goto msix_entry_out;
	}

msix_entry_out:
	mutex_unlock(&kvm->lock);
	return r;
}
#endif

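/* Dispatch the device assignment ioctls issued on a VM fd. */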
long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
				  unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	int r;

	switch (ioctl) {
	case KVM_ASSIGN_PCI_DEVICE: {
		struct kvm_assigned_pci_dev assigned_dev;

		r = -EFAULT;
		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
			goto out;
		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_IRQ: {
		r = -EOPNOTSUPP;
		break;
	}
	case KVM_ASSIGN_DEV_IRQ: {
		struct kvm_assigned_irq assigned_irq;

		r = -EFAULT;
		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
			goto out;
		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
		if (r)
			goto out;
		break;
	}
	case KVM_DEASSIGN_DEV_IRQ: {
		struct kvm_assigned_irq assigned_irq;

		r = -EFAULT;
		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
			goto out;
		r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
		if (r)
			goto out;
		break;
	}
	case KVM_DEASSIGN_PCI_DEVICE: {
		struct kvm_assigned_pci_dev assigned_dev;

		r = -EFAULT;
		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
			goto out;
		r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
		if (r)
			goto out;
		break;
	}
#ifdef KVM_CAP_IRQ_ROUTING
	case KVM_SET_GSI_ROUTING: {
		struct kvm_irq_routing routing;
		struct kvm_irq_routing __user *urouting;
		struct kvm_irq_routing_entry *entries;

		r = -EFAULT;
		if (copy_from_user(&routing, argp, sizeof(routing)))
			goto out;
		r = -EINVAL;
		if (routing.nr >= KVM_MAX_IRQ_ROUTES)
			goto out;
		if (routing.flags)
			goto out;
		r = -ENOMEM;
		entries = vmalloc(routing.nr * sizeof(*entries));
		if (!entries)
			goto out;
		r = -EFAULT;
		urouting = argp;
		if (copy_from_user(entries, urouting->entries,
				   routing.nr * sizeof(*entries)))
			goto out_free_irq_routing;
		r = kvm_set_irq_routing(kvm, entries, routing.nr,
					routing.flags);
out_free_irq_routing:
		vfree(entries);
		break;
	}
#endif /* KVM_CAP_IRQ_ROUTING */
#ifdef __KVM_HAVE_MSIX
	case KVM_ASSIGN_SET_MSIX_NR: {
		struct kvm_assigned_msix_nr entry_nr;
		r = -EFAULT;
		if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
			goto out;
		r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_SET_MSIX_ENTRY: {
		struct kvm_assigned_msix_entry entry;
		r = -EFAULT;
		if (copy_from_user(&entry, argp, sizeof entry))
			goto out;
		r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
		if (r)
			goto out;
		break;
	}
#endif
	default:
		r = -ENOTTY;
		break;
	}
out:
	return r;
}