2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
20 #include <linux/init.h>
21 #include <linux/bitmap.h>
22 #include <linux/debugfs.h>
23 #include <linux/export.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/spinlock.h>
28 #include <linux/pci.h>
29 #include <linux/dmar.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/mempool.h>
32 #include <linux/memory.h>
33 #include <linux/timer.h>
34 #include <linux/iova.h>
35 #include <linux/iommu.h>
36 #include <linux/intel-iommu.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <linux/dma-contiguous.h>
43 #include <asm/irq_remapping.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
47 #include "irq_remapping.h"
49 #define ROOT_SIZE VTD_PAGE_SIZE
50 #define CONTEXT_SIZE VTD_PAGE_SIZE
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
56 #define IOAPIC_RANGE_START (0xfee00000)
57 #define IOAPIC_RANGE_END (0xfeefffff)
58 #define IOVA_START_ADDR (0x1000)
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
62 #define MAX_AGAW_WIDTH 64
63 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
65 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
66 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
68 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
69 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
70 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
71 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
72 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
74 /* IO virtual address start page frame number */
75 #define IOVA_START_PFN (1)
77 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
78 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
79 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
81 /* page table handling */
82 #define LEVEL_STRIDE (9)
83 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
86 * This bitmap is used to advertise the page sizes our hardware supports
87 * to the IOMMU core, which will then use this information to split
88 * physically contiguous memory regions it is mapping into page sizes
91 * Traditionally the IOMMU core just handed us the mappings directly,
92 * after making sure the size is an order of a 4KiB page and that the
93 * mapping has natural alignment.
95 * To retain this behavior, we currently advertise that we support
96 * all page sizes that are an order of 4KiB.
98 * If at some point we'd like to utilize the IOMMU core's new behavior,
99 * we could change this to advertise the real page sizes we support.
101 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
/*
 * AGAW (adjusted guest address width) <-> page-table geometry helpers.
 * AGAW 0 corresponds to a 30-bit, 2-level table; each extra level adds
 * LEVEL_STRIDE (9) bits.  NOTE(review): this extract elides some lines
 * (braces/bodies), per the gaps in the original line numbering.
 */
103 static inline int agaw_to_level(int agaw)
108 static inline int agaw_to_width(int agaw)
/* Width is capped at MAX_AGAW_WIDTH (64 bits). */
110 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
113 static inline int width_to_agaw(int width)
115 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
/* Number of low address bits resolved below the given table level. */
118 static inline unsigned int level_to_offset_bits(int level)
120 return (level - 1) * LEVEL_STRIDE;
/* Index of @pfn within the page table at @level (0..511). */
123 static inline int pfn_level_offset(unsigned long pfn, int level)
125 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
/* Mask selecting the bits resolved at @level and above. */
128 static inline unsigned long level_mask(int level)
130 return -1UL << level_to_offset_bits(level);
/* Number of base pages covered by one entry at @level. */
133 static inline unsigned long level_size(int level)
135 return 1UL << level_to_offset_bits(level);
/* Round @pfn up to the next boundary of @level. */
138 static inline unsigned long align_to_level(unsigned long pfn, int level)
140 return (pfn + level_size(level) - 1) & level_mask(level);
143 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
145 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
148 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
149 are never going to work. */
150 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
152 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
155 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
157 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
159 static inline unsigned long page_to_dma_pfn(struct page *pg)
161 return mm_to_dma_pfn(page_to_pfn(pg));
163 static inline unsigned long virt_to_dma_pfn(void *p)
165 return page_to_dma_pfn(virt_to_page(p));
168 /* global iommu list, set NULL for ignored DMAR units */
169 static struct intel_iommu **g_iommus;
171 static void __init check_tylersburg_isoch(void);
172 static int rwbf_quirk;
175 * set to 1 to panic kernel if can't successfully enable VT-d
176 * (used when kernel is launched w/ TXT)
178 static int force_on = 0;
183 * 12-63: Context Ptr (12 - (haw-1))
/*
 * Root-entry accessors.  A root entry's bit 0 is the present bit; the
 * page-aligned upper bits hold the physical address of the bus's
 * context-entry table.
 */
190 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
191 static inline bool root_present(struct root_entry *root)
193 return (root->val & 1);
195 static inline void set_root_present(struct root_entry *root)
/* Install the page-aligned physical address of the context table. */
199 static inline void set_root_value(struct root_entry *root, unsigned long value)
201 root->val &= ~VTD_PAGE_MASK;
202 root->val |= value & VTD_PAGE_MASK;
/* Returns the context table's virtual address, or NULL-ish when the
 * entry is not present (the else arm is elided in this extract). */
205 static inline struct context_entry *
206 get_context_addr_from_root(struct root_entry *root)
208 return (struct context_entry *)
209 (root_present(root)?phys_to_virt(
210 root->val & VTD_PAGE_MASK) :
/* Context-entry low qword bit layout (per the VT-d spec):
217 * 1: fault processing disable
218 * 2-3: translation type
219 * 12-63: address space root
 */
225 struct context_entry {
230 static inline bool context_present(struct context_entry *context)
232 return (context->lo & 1);
234 static inline void context_set_present(struct context_entry *context)
/* Clear bit 1 (fault-processing-disable), i.e. enable fault reporting. */
239 static inline void context_set_fault_enable(struct context_entry *context)
241 context->lo &= (((u64)-1) << 2) | 1;
/* Translation type lives in bits 2-3 of the low qword. */
244 static inline void context_set_translation_type(struct context_entry *context,
247 context->lo &= (((u64)-1) << 4) | 3;
248 context->lo |= (value & 3) << 2;
/* Page-aligned physical address of the second-level page-table root. */
251 static inline void context_set_address_root(struct context_entry *context,
254 context->lo &= ~VTD_PAGE_MASK;
255 context->lo |= value & VTD_PAGE_MASK;
258 static inline void context_set_address_width(struct context_entry *context,
261 context->hi |= value & 7;
/* 16-bit domain id in bits 8-23 of the high qword. */
264 static inline void context_set_domain_id(struct context_entry *context,
267 context->hi |= (value & ((1 << 16) - 1)) << 8;
270 static inline void context_clear_entry(struct context_entry *context)
/* DMA PTE bit layout:
283 * 12-63: Host physical address
 */
289 static inline void dma_clear_pte(struct dma_pte *pte)
294 static inline u64 dma_pte_addr(struct dma_pte *pte)
297 return pte->val & VTD_PAGE_MASK;
299 /* Must have a full atomic 64-bit read */
300 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
/* Present iff either the read (bit 0) or write (bit 1) permission is set. */
304 static inline bool dma_pte_present(struct dma_pte *pte)
306 return (pte->val & 3) != 0;
309 static inline bool dma_pte_superpage(struct dma_pte *pte)
311 return (pte->val & DMA_PTE_LARGE_PAGE);
/* True when @pte is the first entry of its (VTD_PAGE_SIZE) table page. */
314 static inline int first_pte_in_page(struct dma_pte *pte)
316 return !((unsigned long)pte & ~VTD_PAGE_MASK);
320 * This domain is a static identity mapping domain.
321 * 1. This domain creates a static 1:1 mapping to all usable memory.
322 * 2. It maps to each iommu if successful.
323 * 3. Each iommu maps to this domain if successful.
325 static struct dmar_domain *si_domain;
326 static int hw_pass_through = 1;
328 /* domain represents a virtual machine, more than one devices
329 * across iommus may be owned in one domain, e.g. kvm guest.
331 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
333 /* si_domain contains multiple devices */
334 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
337 int id; /* domain id */
338 int nid; /* node id */
339 DECLARE_BITMAP(iommu_bmp, DMAR_UNITS_SUPPORTED);
340 /* bitmap of iommus this domain uses*/
342 struct list_head devices; /* all devices' list */
343 struct iova_domain iovad; /* iova's that belong to this domain */
345 struct dma_pte *pgd; /* virtual address */
346 int gaw; /* max guest address width */
348 /* adjusted guest address width, 0 is level 2 30-bit */
351 int flags; /* flags to find out type of domain */
353 int iommu_coherency;/* indicate coherency of iommu access */
354 int iommu_snooping; /* indicate snooping control feature*/
355 int iommu_count; /* reference count of iommu */
356 int iommu_superpage;/* Level of superpages supported:
357 0 == 4KiB (no superpages), 1 == 2MiB,
358 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
359 spinlock_t iommu_lock; /* protect iommu set in domain */
360 u64 max_addr; /* maximum mapped address */
362 struct iommu_domain domain; /* generic domain data structure for
366 /* PCI domain-device relationship */
367 struct device_domain_info {
368 struct list_head link; /* link to domain siblings */
369 struct list_head global; /* link to global list */
370 u8 bus; /* PCI bus number */
371 u8 devfn; /* PCI devfn number */
372 struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
373 struct intel_iommu *iommu; /* IOMMU used by this device */
374 struct dmar_domain *domain; /* pointer to domain */
377 struct dmar_rmrr_unit {
378 struct list_head list; /* list of rmrr units */
379 struct acpi_dmar_header *hdr; /* ACPI header */
380 u64 base_address; /* reserved base address*/
381 u64 end_address; /* reserved end address */
382 struct dmar_dev_scope *devices; /* target devices */
383 int devices_cnt; /* target device count */
386 struct dmar_atsr_unit {
387 struct list_head list; /* list of ATSR units */
388 struct acpi_dmar_header *hdr; /* ACPI header */
389 struct dmar_dev_scope *devices; /* target devices */
390 int devices_cnt; /* target device count */
391 u8 include_all:1; /* include all ports */
394 static LIST_HEAD(dmar_atsr_units);
395 static LIST_HEAD(dmar_rmrr_units);
397 #define for_each_rmrr_units(rmrr) \
398 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
400 static void flush_unmaps_timeout(unsigned long data);
402 static DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
404 #define HIGH_WATER_MARK 250
405 struct deferred_flush_tables {
407 struct iova *iova[HIGH_WATER_MARK];
408 struct dmar_domain *domain[HIGH_WATER_MARK];
409 struct page *freelist[HIGH_WATER_MARK];
412 static struct deferred_flush_tables *deferred_flush;
414 /* bitmap for indexing intel_iommus */
415 static int g_num_of_iommus;
417 static DEFINE_SPINLOCK(async_umap_flush_lock);
418 static LIST_HEAD(unmaps_to_do);
421 static long list_size;
423 static void domain_exit(struct dmar_domain *domain);
424 static void domain_remove_dev_info(struct dmar_domain *domain);
425 static void domain_remove_one_dev_info(struct dmar_domain *domain,
427 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
429 static int domain_detach_iommu(struct dmar_domain *domain,
430 struct intel_iommu *iommu);
432 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
433 int dmar_disabled = 0;
435 int dmar_disabled = 1;
436 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
438 int intel_iommu_enabled = 0;
439 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
441 static int dmar_map_gfx = 1;
442 static int dmar_forcedac;
443 static int intel_iommu_strict;
444 static int intel_iommu_superpage = 1;
446 int intel_iommu_gfx_mapped;
447 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
449 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
450 static DEFINE_SPINLOCK(device_domain_lock);
451 static LIST_HEAD(device_domain_list);
453 static const struct iommu_ops intel_iommu_ops;
455 /* Convert generic 'struct iommu_domain to private struct dmar_domain */
456 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
458 return container_of(dom, struct dmar_domain, domain);
/*
 * Parse the "intel_iommu=" kernel boot parameter.  Options are
 * comma-separated; each token toggles one of the module-level flags
 * (dmar_disabled, dmar_map_gfx, dmar_forcedac, intel_iommu_strict,
 * intel_iommu_superpage).  Some assignment lines are elided in this
 * extract.
 */
461 static int __init intel_iommu_setup(char *str)
466 if (!strncmp(str, "on", 2)) {
468 printk(KERN_INFO "Intel-IOMMU: enabled\n");
469 } else if (!strncmp(str, "off", 3)) {
471 printk(KERN_INFO "Intel-IOMMU: disabled\n");
472 } else if (!strncmp(str, "igfx_off", 8)) {
475 "Intel-IOMMU: disable GFX device mapping\n");
476 } else if (!strncmp(str, "forcedac", 8)) {
478 "Intel-IOMMU: Forcing DAC for PCI devices\n");
480 } else if (!strncmp(str, "strict", 6)) {
482 "Intel-IOMMU: disable batched IOTLB flush\n");
483 intel_iommu_strict = 1;
484 } else if (!strncmp(str, "sp_off", 6)) {
486 "Intel-IOMMU: disable supported super page\n");
487 intel_iommu_superpage = 0;
/* Skip to the next comma-separated token. */
490 str += strcspn(str, ",");
496 __setup("intel_iommu=", intel_iommu_setup);
/* Slab caches for dmar_domain and device_domain_info objects. */
498 static struct kmem_cache *iommu_domain_cache;
499 static struct kmem_cache *iommu_devinfo_cache;
/* Allocate one zeroed page on @node for use as an IOMMU page table.
 * GFP_ATOMIC: callers may hold spinlocks. */
501 static inline void *alloc_pgtable_page(int node)
506 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
508 vaddr = page_address(page);
512 static inline void free_pgtable_page(void *vaddr)
514 free_page((unsigned long)vaddr);
517 static inline void *alloc_domain_mem(void)
519 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
522 static void free_domain_mem(void *vaddr)
524 kmem_cache_free(iommu_domain_cache, vaddr);
527 static inline void * alloc_devinfo_mem(void)
529 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
532 static inline void free_devinfo_mem(void *vaddr)
534 kmem_cache_free(iommu_devinfo_cache, vaddr);
/* Domain-type predicates based on the DOMAIN_FLAG_* bits. */
537 static inline int domain_type_is_vm(struct dmar_domain *domain)
539 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
542 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
544 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
545 DOMAIN_FLAG_STATIC_IDENTITY);
/* True if @pfn is addressable within the domain's guest address width. */
548 static inline int domain_pfn_supported(struct dmar_domain *domain,
551 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
553 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
/* Find the largest AGAW <= width_to_agaw(max_gaw) that the hardware's
 * SAGAW capability field advertises as supported. */
556 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
561 sagaw = cap_sagaw(iommu->cap);
562 for (agaw = width_to_agaw(max_gaw);
564 if (test_bit(agaw, &sagaw))
572 * Calculate max SAGAW for each iommu.
574 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
576 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
580 * calculate agaw for each iommu.
581 * "SAGAW" may be different across iommus, use a default agaw, and
582 * get a supported less agaw for iommus that don't support the default agaw.
584 int iommu_calculate_agaw(struct intel_iommu *iommu)
586 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
589 /* This function only returns a single iommu in a domain */
590 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
594 /* si_domain and vm domain should not get here. */
595 BUG_ON(domain_type_is_vm_or_si(domain));
596 iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
597 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
600 return g_iommus[iommu_id];
/* Recompute domain->iommu_coherency as the AND of the coherency
 * capability of every IOMMU this domain is attached to. */
603 static void domain_update_iommu_coherency(struct dmar_domain *domain)
605 struct dmar_drhd_unit *drhd;
606 struct intel_iommu *iommu;
609 domain->iommu_coherency = 1;
611 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
613 if (!ecap_coherent(g_iommus[i]->ecap)) {
614 domain->iommu_coherency = 0;
621 /* No hardware attached; use lowest common denominator */
623 for_each_active_iommu(iommu, drhd) {
624 if (!ecap_coherent(iommu->ecap)) {
625 domain->iommu_coherency = 0;
/* @skip, when non-NULL, presumably names an IOMMU to leave out of the
 * scan -- the skip check itself is elided in this extract. */
632 static int domain_update_iommu_snooping(struct intel_iommu *skip)
634 struct dmar_drhd_unit *drhd;
635 struct intel_iommu *iommu;
639 for_each_active_iommu(iommu, drhd) {
641 if (!ecap_sc_support(iommu->ecap)) {
652 static int domain_update_iommu_superpage(struct intel_iommu *skip)
654 struct dmar_drhd_unit *drhd;
655 struct intel_iommu *iommu;
/* Superpages globally disabled via intel_iommu=sp_off. */
658 if (!intel_iommu_superpage) {
662 /* set iommu_superpage to the smallest common denominator */
664 for_each_active_iommu(iommu, drhd) {
666 mask &= cap_super_page_val(iommu->cap);
676 /* Some capabilities may be different across iommus */
677 static void domain_update_iommu_cap(struct dmar_domain *domain)
679 domain_update_iommu_coherency(domain);
680 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
681 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
/* Map @dev to the DRHD unit (IOMMU) that covers it, returning its
 * bus/devfn via the out-parameters.  Handles explicit device scopes,
 * devices behind PCIe-to-PCI bridges, and include_all DRHDs. */
684 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
686 struct dmar_drhd_unit *drhd = NULL;
687 struct intel_iommu *iommu;
689 struct pci_dev *ptmp, *pdev = NULL;
693 if (dev_is_pci(dev)) {
694 pdev = to_pci_dev(dev);
695 segment = pci_domain_nr(pdev->bus);
696 } else if (ACPI_COMPANION(dev))
697 dev = &ACPI_COMPANION(dev)->dev;
700 for_each_active_iommu(iommu, drhd) {
701 if (pdev && segment != drhd->segment)
704 for_each_active_dev_scope(drhd->devices,
705 drhd->devices_cnt, i, tmp) {
707 *bus = drhd->devices[i].bus;
708 *devfn = drhd->devices[i].devfn;
712 if (!pdev || !dev_is_pci(tmp))
/* A scope entry that is a bridge matches any device on its
 * subordinate bus range. */
715 ptmp = to_pci_dev(tmp);
716 if (ptmp->subordinate &&
717 ptmp->subordinate->number <= pdev->bus->number &&
718 ptmp->subordinate->busn_res.end >= pdev->bus->number)
722 if (pdev && drhd->include_all) {
724 *bus = pdev->bus->number;
725 *devfn = pdev->devfn;
/* Flush CPU cache lines for @addr only when the IOMMU cannot snoop
 * coherently; otherwise the write is already visible to hardware. */
736 static void domain_flush_cache(struct dmar_domain *domain,
737 void *addr, int size)
739 if (!domain->iommu_coherency)
740 clflush_cache_range(addr, size);
743 /* Gets context entry for a given bus and devfn */
/* Allocates the per-bus context table on first use, under iommu->lock.
 * Newly allocated tables are flushed and installed into the root entry
 * before the lock is dropped. */
744 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
747 struct root_entry *root;
748 struct context_entry *context;
749 unsigned long phy_addr;
752 spin_lock_irqsave(&iommu->lock, flags);
753 root = &iommu->root_entry[bus];
754 context = get_context_addr_from_root(root);
756 context = (struct context_entry *)
757 alloc_pgtable_page(iommu->node);
759 spin_unlock_irqrestore(&iommu->lock, flags);
762 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
763 phy_addr = virt_to_phys((void *)context);
764 set_root_value(root, phy_addr);
765 set_root_present(root);
766 __iommu_flush_cache(iommu, root, sizeof(*root));
768 spin_unlock_irqrestore(&iommu->lock, flags);
769 return &context[devfn];
/* Non-destructive query: is a context entry present for bus/devfn? */
772 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
774 struct root_entry *root;
775 struct context_entry *context;
779 spin_lock_irqsave(&iommu->lock, flags);
780 root = &iommu->root_entry[bus];
781 context = get_context_addr_from_root(root);
786 ret = context_present(&context[devfn]);
788 spin_unlock_irqrestore(&iommu->lock, flags);
/* Clear one device's context entry and flush it to hardware. */
792 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
794 struct root_entry *root;
795 struct context_entry *context;
798 spin_lock_irqsave(&iommu->lock, flags);
799 root = &iommu->root_entry[bus];
800 context = get_context_addr_from_root(root);
802 context_clear_entry(&context[devfn]);
803 __iommu_flush_cache(iommu, &context[devfn], \
806 spin_unlock_irqrestore(&iommu->lock, flags);
/* Free every per-bus context table, then the root table itself. */
809 static void free_context_table(struct intel_iommu *iommu)
811 struct root_entry *root;
814 struct context_entry *context;
816 spin_lock_irqsave(&iommu->lock, flags);
817 if (!iommu->root_entry) {
820 for (i = 0; i < ROOT_ENTRY_NR; i++) {
821 root = &iommu->root_entry[i];
822 context = get_context_addr_from_root(root);
824 free_pgtable_page(context);
826 free_pgtable_page(iommu->root_entry);
827 iommu->root_entry = NULL;
829 spin_unlock_irqrestore(&iommu->lock, flags);
/*
 * Walk (and lazily populate) the page table down to *target_level for
 * @pfn.  Intermediate tables are created with cmpxchg64 so concurrent
 * walkers race safely: the loser frees its page and uses the winner's.
 * On return *target_level is updated to the level actually reached.
 */
832 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
833 unsigned long pfn, int *target_level)
835 struct dma_pte *parent, *pte = NULL;
836 int level = agaw_to_level(domain->agaw);
839 BUG_ON(!domain->pgd);
841 if (!domain_pfn_supported(domain, pfn))
842 /* Address beyond IOMMU's addressing capabilities. */
845 parent = domain->pgd;
850 offset = pfn_level_offset(pfn, level);
851 pte = &parent[offset];
/* *target_level == 0 means "lowest existing level": stop at a
 * superpage or a non-present entry. */
852 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
854 if (level == *target_level)
857 if (!dma_pte_present(pte)) {
860 tmp_page = alloc_pgtable_page(domain->nid);
865 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
866 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
867 if (cmpxchg64(&pte->val, 0ULL, pteval))
868 /* Someone else set it while we were thinking; use theirs. */
869 free_pgtable_page(tmp_page);
871 domain_flush_cache(domain, pte, sizeof(*pte));
876 parent = phys_to_virt(dma_pte_addr(pte));
881 *target_level = level;
887 /* return address's pte at specific level */
/* Pure lookup, no allocation: descend from the root until @level or a
 * non-present/superpage entry is hit; *large_page reports the stop level. */
888 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
890 int level, int *large_page)
892 struct dma_pte *parent, *pte = NULL;
893 int total = agaw_to_level(domain->agaw);
896 parent = domain->pgd;
897 while (level <= total) {
898 offset = pfn_level_offset(pfn, total);
899 pte = &parent[offset];
903 if (!dma_pte_present(pte)) {
908 if (dma_pte_superpage(pte)) {
913 parent = phys_to_virt(dma_pte_addr(pte));
919 /* clear last level pte, a tlb flush should be followed */
920 static void dma_pte_clear_range(struct dmar_domain *domain,
921 unsigned long start_pfn,
922 unsigned long last_pfn)
924 unsigned int large_page = 1;
925 struct dma_pte *first_pte, *pte;
927 BUG_ON(!domain_pfn_supported(domain, start_pfn));
928 BUG_ON(!domain_pfn_supported(domain, last_pfn));
929 BUG_ON(start_pfn > last_pfn);
931 /* we don't need lock here; nobody else touches the iova range */
934 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page)
/* No leaf PTE here: skip to the next boundary of the level where
 * the walk stopped. */
936 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
941 start_pfn += lvl_to_nr_pages(large_page);
943 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
/* Flush the whole run of cleared PTEs in one go. */
945 domain_flush_cache(domain, first_pte,
946 (void *)pte - (void *)first_pte);
948 } while (start_pfn && start_pfn <= last_pfn);
/*
 * Recursively free page-table pages in [start_pfn, last_pfn].  A child
 * table is freed only when the range covers it entirely; partially
 * covered subtrees are recursed into instead.
 */
951 static void dma_pte_free_level(struct dmar_domain *domain, int level,
952 struct dma_pte *pte, unsigned long pfn,
953 unsigned long start_pfn, unsigned long last_pfn)
955 pfn = max(start_pfn, pfn);
956 pte = &pte[pfn_level_offset(pfn, level)];
959 unsigned long level_pfn;
960 struct dma_pte *level_pte;
962 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
965 level_pfn = pfn & level_mask(level - 1);
966 level_pte = phys_to_virt(dma_pte_addr(pte));
969 dma_pte_free_level(domain, level - 1, level_pte,
970 level_pfn, start_pfn, last_pfn);
972 /* If range covers entire pagetable, free it */
973 if (!(start_pfn > level_pfn ||
974 last_pfn < level_pfn + level_size(level) - 1)) {
976 domain_flush_cache(domain, pte, sizeof(*pte));
977 free_pgtable_page(level_pte);
980 pfn += level_size(level);
981 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
984 /* free page table pages. last level pte should already be cleared */
985 static void dma_pte_free_pagetable(struct dmar_domain *domain,
986 unsigned long start_pfn,
987 unsigned long last_pfn)
989 BUG_ON(!domain_pfn_supported(domain, start_pfn));
990 BUG_ON(!domain_pfn_supported(domain, last_pfn));
991 BUG_ON(start_pfn > last_pfn);
993 dma_pte_clear_range(domain, start_pfn, last_pfn);
995 /* We don't need lock here; nobody else touches the iova range */
996 dma_pte_free_level(domain, agaw_to_level(domain->agaw),
997 domain->pgd, 0, start_pfn, last_pfn);
/* Entire address space unmapped: the root table itself goes too. */
1000 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1001 free_pgtable_page(domain->pgd);
1006 /* When a page at a given level is being unlinked from its parent, we don't
1007 need to *modify* it at all. All we need to do is make a list of all the
1008 pages which can be freed just as soon as we've flushed the IOTLB and we
1009 know the hardware page-walk will no longer touch them.
1010 The 'pte' argument is the *parent* PTE, pointing to the page that is to
 be chained onto the freelist (continuation elided in this extract). */
/* Chains the page (via page->freelist) and recurses into present,
 * non-superpage children. */
1012 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1013 int level, struct dma_pte *pte,
1014 struct page *freelist)
1018 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1019 pg->freelist = freelist;
1025 pte = page_address(pg);
1027 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1028 freelist = dma_pte_list_pagetables(domain, level - 1,
1031 } while (!first_pte_in_page(pte));
/* Clear PTEs in the range; fully covered subtrees are moved onto the
 * freelist (not freed yet -- see domain_unmap below). */
1036 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1037 struct dma_pte *pte, unsigned long pfn,
1038 unsigned long start_pfn,
1039 unsigned long last_pfn,
1040 struct page *freelist)
1042 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1044 pfn = max(start_pfn, pfn);
1045 pte = &pte[pfn_level_offset(pfn, level)];
1048 unsigned long level_pfn;
1050 if (!dma_pte_present(pte))
1053 level_pfn = pfn & level_mask(level);
1055 /* If range covers entire pagetable, free it */
1056 if (start_pfn <= level_pfn &&
1057 last_pfn >= level_pfn + level_size(level) - 1) {
1058 /* These subordinate page tables are going away entirely. Don't
1059 bother to clear them; we're just going to *free* them. */
1060 if (level > 1 && !dma_pte_superpage(pte))
1061 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1067 } else if (level > 1) {
1068 /* Recurse down into a level that isn't *entirely* obsolete */
1069 freelist = dma_pte_clear_level(domain, level - 1,
1070 phys_to_virt(dma_pte_addr(pte)),
1071 level_pfn, start_pfn, last_pfn,
1075 pfn += level_size(level);
1076 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1079 domain_flush_cache(domain, first_pte,
1080 (void *)++last_pte - (void *)first_pte);
1085 /* We can't just free the pages because the IOMMU may still be walking
1086 the page tables, and may have cached the intermediate levels. The
1087 pages can only be freed after the IOTLB flush has been done. */
1088 struct page *domain_unmap(struct dmar_domain *domain,
1089 unsigned long start_pfn,
1090 unsigned long last_pfn)
1092 struct page *freelist = NULL;
1094 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1095 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1096 BUG_ON(start_pfn > last_pfn);
1098 /* we don't need lock here; nobody else touches the iova range */
1099 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1100 domain->pgd, 0, start_pfn, last_pfn, NULL);
/* Whole address space gone: the root table page joins the freelist. */
1103 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1104 struct page *pgd_page = virt_to_page(domain->pgd);
1105 pgd_page->freelist = freelist;
1106 freelist = pgd_page;
/* Walk the page->freelist chain built by domain_unmap and free each
 * page-table page.  Safe to call only after the IOTLB flush. */
1114 void dma_free_pagelist(struct page *freelist)
1118 while ((pg = freelist)) {
1119 freelist = pg->freelist;
1120 free_pgtable_page(page_address(pg));
1124 /* iommu handling */
/* Allocate and install (in software only) the root-entry table page. */
1125 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1127 struct root_entry *root;
1128 unsigned long flags;
1130 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1132 pr_err("IOMMU: allocating root entry for %s failed\n",
1137 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1139 spin_lock_irqsave(&iommu->lock, flags);
1140 iommu->root_entry = root;
1141 spin_unlock_irqrestore(&iommu->lock, flags);
/* Program the root table address register and issue Set Root Table
 * Pointer (SRTP); spin until hardware acknowledges via the status reg. */
1146 static void iommu_set_root_entry(struct intel_iommu *iommu)
1152 addr = iommu->root_entry;
1154 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1155 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
1157 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1159 /* Make sure hardware complete it */
1160 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1161 readl, (sts & DMA_GSTS_RTPS), sts);
1163 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* Flush the IOMMU's internal write buffer; a no-op unless the hardware
 * (or the rwbf quirk) requires it. */
1166 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1171 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1174 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1175 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1177 /* Make sure hardware complete it */
1178 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1179 readl, (!(val & DMA_GSTS_WBFS)), val);
1181 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1184 /* return value determine if we need a write buffer flush */
/* Invalidate context-cache entries: global, per-domain, or per-device
 * granularity selected by @type.  Busy-waits on the ICC bit. */
1185 static void __iommu_flush_context(struct intel_iommu *iommu,
1186 u16 did, u16 source_id, u8 function_mask,
1193 case DMA_CCMD_GLOBAL_INVL:
1194 val = DMA_CCMD_GLOBAL_INVL;
1196 case DMA_CCMD_DOMAIN_INVL:
1197 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1199 case DMA_CCMD_DEVICE_INVL:
1200 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1201 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1206 val |= DMA_CCMD_ICC;
1208 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1209 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1211 /* Make sure hardware complete it */
1212 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1213 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1215 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1218 /* return value determine if we need a write buffer flush */
/* Invalidate IOTLB entries: global, domain-selective, or page-selective
 * (PSI, with address+order in the IVA register). */
1219 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1220 u64 addr, unsigned int size_order, u64 type)
1222 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1223 u64 val = 0, val_iva = 0;
1227 case DMA_TLB_GLOBAL_FLUSH:
1228 /* global flush doesn't need set IVA_REG */
1229 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1231 case DMA_TLB_DSI_FLUSH:
1232 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1234 case DMA_TLB_PSI_FLUSH:
1235 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1236 /* IH bit is passed in as part of address */
1237 val_iva = size_order | addr;
1242 /* Note: set drain read/write */
1245 * This is probably to be super secure.. Looks like we can
1246 * ignore it without any impact.
1248 if (cap_read_drain(iommu->cap))
1249 val |= DMA_TLB_READ_DRAIN;
1251 if (cap_write_drain(iommu->cap))
1252 val |= DMA_TLB_WRITE_DRAIN;
1254 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1255 /* Note: Only uses first TLB reg currently */
1257 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1258 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1260 /* Make sure hardware complete it */
1261 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1262 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1264 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1266 /* check IOTLB invalidation granularity */
/* Hardware may fall back to a coarser granularity than requested;
 * zero granularity means the flush failed outright. */
1267 if (DMA_TLB_IAIG(val) == 0)
1268 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1269 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1270 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1271 (unsigned long long)DMA_TLB_IIRG(type),
1272 (unsigned long long)DMA_TLB_IAIG(val));
/*
 * Device-IOTLB (PCIe ATS) support.  Returns the device_domain_info for
 * (bus, devfn) if the IOMMU, the device, and an ATSR unit all support
 * ATS; NULL-ish otherwise (early-return bodies elided in this extract).
 */
1275 static struct device_domain_info *
1276 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1280 unsigned long flags;
1281 struct device_domain_info *info;
1282 struct pci_dev *pdev;
1284 if (!ecap_dev_iotlb_support(iommu->ecap))
1290 spin_lock_irqsave(&device_domain_lock, flags);
1291 list_for_each_entry(info, &domain->devices, link)
1292 if (info->iommu == iommu && info->bus == bus &&
1293 info->devfn == devfn) {
1297 spin_unlock_irqrestore(&device_domain_lock, flags);
1299 if (!found || !info->dev || !dev_is_pci(info->dev))
1302 pdev = to_pci_dev(info->dev);
1304 if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1307 if (!dmar_find_matched_atsr_unit(pdev))
/* Enable PCIe ATS on the device with the VT-d page shift. */
1313 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1315 if (!info || !dev_is_pci(info->dev))
1318 pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1321 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1323 if (!info->dev || !dev_is_pci(info->dev) ||
1324 !pci_ats_enabled(to_pci_dev(info->dev)))
1327 pci_disable_ats(to_pci_dev(info->dev));
/* Issue a device-IOTLB invalidation to every ATS-enabled device that
 * is attached to @domain. */
1330 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1331 u64 addr, unsigned mask)
1334 unsigned long flags;
1335 struct device_domain_info *info;
1337 spin_lock_irqsave(&device_domain_lock, flags);
1338 list_for_each_entry(info, &domain->devices, link) {
1339 struct pci_dev *pdev;
1340 if (!info->dev || !dev_is_pci(info->dev))
1343 pdev = to_pci_dev(info->dev);
1344 if (!pci_ats_enabled(pdev))
/* source-id = bus:devfn, queue depth from the device's ATS cap. */
1347 sid = info->bus << 8 | info->devfn;
1348 qdep = pci_ats_queue_depth(pdev);
1349 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1351 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * Page-selective IOTLB invalidation for domain @did covering @pages pages
 * starting at @pfn. @ih hints that leaf entries were not modified; @map is
 * non-zero when the flush is for a non-present -> present transition.
 */
1354 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1355 unsigned long pfn, unsigned int pages, int ih, int map)
1357 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1358 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1365 * Fallback to domain selective flush if no PSI support or the size is
1367 * PSI requires page size to be 2 ^ x, and the base address is naturally
1368 * aligned to the size
1370 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1371 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1374 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1378 * In caching mode, changes of pages from non-present to present require
1379 * flush. However, device IOTLB doesn't need to be flushed in this case.
1381 if (!cap_caching_mode(iommu->cap) || !map)
1382 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
/*
 * Clear the Enable Protected Memory bit in the PMEN register and wait for
 * the Protected Region Status bit to clear, under register_lock.
 */
1385 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1388 unsigned long flags;
1390 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1391 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1392 pmen &= ~DMA_PMEN_EPM;
1393 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1395 /* wait for the protected region status bit to clear */
1396 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1397 readl, !(pmen & DMA_PMEN_PRS), pmen);
1399 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
/*
 * Set the Translation Enable bit in the global command register and wait
 * for the hardware to report Translation Enable Status.
 */
1402 static void iommu_enable_translation(struct intel_iommu *iommu)
1405 unsigned long flags;
1407 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1408 iommu->gcmd |= DMA_GCMD_TE;
1409 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1411 /* Make sure hardware complete it */
1412 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1413 readl, (sts & DMA_GSTS_TES), sts);
1415 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
/*
 * Clear the Translation Enable bit and wait for TES to drop, i.e. DMA
 * remapping is fully disabled on return.
 */
1418 static void iommu_disable_translation(struct intel_iommu *iommu)
1423 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1424 iommu->gcmd &= ~DMA_GCMD_TE;
1425 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1427 /* Make sure hardware complete it */
1428 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1429 readl, (!(sts & DMA_GSTS_TES)), sts);
1431 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * Allocate per-IOMMU domain bookkeeping: a bitmap of in-use domain ids
 * (sized from the hardware capability register) and a parallel array of
 * dmar_domain pointers. In caching mode, domain id 0 is reserved because
 * hardware tags invalid translations with it.
 */
1435 static int iommu_init_domains(struct intel_iommu *iommu)
1437 unsigned long ndomains;
1438 unsigned long nlongs;
1440 ndomains = cap_ndoms(iommu->cap);
1441 pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1442 iommu->seq_id, ndomains);
1443 nlongs = BITS_TO_LONGS(ndomains);
1445 spin_lock_init(&iommu->lock);
1447 /* TBD: there might be 64K domains,
1448 * consider other allocation for future chip
1450 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1451 if (!iommu->domain_ids) {
1452 pr_err("IOMMU%d: allocating domain id array failed\n",
1456 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1458 if (!iommu->domains) {
1459 pr_err("IOMMU%d: allocating domain array failed\n",
/* Unwind the first allocation on failure of the second. */
1461 kfree(iommu->domain_ids);
1462 iommu->domain_ids = NULL;
1467 * if Caching mode is set, then invalid translations are tagged
1468 * with domainid 0. Hence we need to pre-allocate it.
1470 if (cap_caching_mode(iommu->cap))
1471 set_bit(0, iommu->domain_ids);
/*
 * Detach every domain attached to @iommu, freeing non-VM domains whose
 * last IOMMU reference this was, then disable translation if enabled.
 */
1475 static void disable_dmar_iommu(struct intel_iommu *iommu)
1477 struct dmar_domain *domain;
1480 if ((iommu->domains) && (iommu->domain_ids)) {
1481 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1483 * Domain id 0 is reserved for invalid translation
1484 * if hardware supports caching mode.
1486 if (cap_caching_mode(iommu->cap) && i == 0)
1489 domain = iommu->domains[i];
1490 clear_bit(i, iommu->domain_ids);
/* Free the domain only when no IOMMU references remain. */
1491 if (domain_detach_iommu(domain, iommu) == 0 &&
1492 !domain_type_is_vm(domain))
1493 domain_exit(domain);
1497 if (iommu->gcmd & DMA_GCMD_TE)
1498 iommu_disable_translation(iommu);
/*
 * Release the per-IOMMU bookkeeping allocated by iommu_init_domains(),
 * clear the global g_iommus slot and free the context table.
 */
1501 static void free_dmar_iommu(struct intel_iommu *iommu)
1503 if ((iommu->domains) && (iommu->domain_ids)) {
1504 kfree(iommu->domains);
1505 kfree(iommu->domain_ids);
1506 iommu->domains = NULL;
1507 iommu->domain_ids = NULL;
1510 g_iommus[iommu->seq_id] = NULL;
1512 /* free context mapping */
1513 free_context_table(iommu);
/*
 * Allocate and zero-initialize a dmar_domain with @flags. VM domains get
 * an id from a private atomic counter; that id is never programmed into
 * context entries (per-IOMMU ids are assigned at attach time instead).
 */
1516 static struct dmar_domain *alloc_domain(int flags)
1518 /* domain id for virtual machine, it won't be set in context */
1519 static atomic_t vm_domid = ATOMIC_INIT(0);
1520 struct dmar_domain *domain;
1522 domain = alloc_domain_mem();
1526 memset(domain, 0, sizeof(*domain));
1528 domain->flags = flags;
1529 spin_lock_init(&domain->iommu_lock);
1530 INIT_LIST_HEAD(&domain->devices);
1531 if (flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1532 domain->id = atomic_inc_return(&vm_domid);
/*
 * Claim a free per-IOMMU domain id for @domain and record the domain in
 * iommu->domains[]. Caller must hold iommu->lock (see iommu_attach_domain).
 */
1537 static int __iommu_attach_domain(struct dmar_domain *domain,
1538 struct intel_iommu *iommu)
1541 unsigned long ndomains;
1543 ndomains = cap_ndoms(iommu->cap);
1544 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1545 if (num < ndomains) {
1546 set_bit(num, iommu->domain_ids);
1547 iommu->domains[num] = domain;
/*
 * Locked wrapper around __iommu_attach_domain(); logs an error when no
 * free domain id is available.
 */
1555 static int iommu_attach_domain(struct dmar_domain *domain,
1556 struct intel_iommu *iommu)
1559 unsigned long flags;
1561 spin_lock_irqsave(&iommu->lock, flags);
1562 num = __iommu_attach_domain(domain, iommu);
1563 spin_unlock_irqrestore(&iommu->lock, flags);
1565 pr_err("IOMMU: no free domain ids\n");
/*
 * Attach a VM domain to @iommu: if the domain already owns an id on this
 * IOMMU, reuse it; otherwise claim a fresh one via __iommu_attach_domain().
 */
1570 static int iommu_attach_vm_domain(struct dmar_domain *domain,
1571 struct intel_iommu *iommu)
1574 unsigned long ndomains;
1576 ndomains = cap_ndoms(iommu->cap);
1577 for_each_set_bit(num, iommu->domain_ids, ndomains)
1578 if (iommu->domains[num] == domain)
1581 return __iommu_attach_domain(domain, iommu);
/*
 * Release @domain's id(s) on @iommu. VM/static-identity domains may hold
 * a different id per IOMMU, so scan the table; otherwise domain->id is
 * the per-IOMMU id directly.
 */
1584 static void iommu_detach_domain(struct dmar_domain *domain,
1585 struct intel_iommu *iommu)
1587 unsigned long flags;
1590 spin_lock_irqsave(&iommu->lock, flags);
1591 if (domain_type_is_vm_or_si(domain)) {
1592 ndomains = cap_ndoms(iommu->cap);
1593 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1594 if (iommu->domains[num] == domain) {
1595 clear_bit(num, iommu->domain_ids);
1596 iommu->domains[num] = NULL;
1601 clear_bit(domain->id, iommu->domain_ids);
1602 iommu->domains[domain->id] = NULL;
1604 spin_unlock_irqrestore(&iommu->lock, flags);
/*
 * Record that @domain is now served by @iommu: set its bit in the
 * domain's iommu bitmap, bump the refcount, adopt the IOMMU's NUMA node
 * on first attach and recompute aggregate capability flags.
 */
1607 static void domain_attach_iommu(struct dmar_domain *domain,
1608 struct intel_iommu *iommu)
1610 unsigned long flags;
1612 spin_lock_irqsave(&domain->iommu_lock, flags);
1613 if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1614 domain->iommu_count++;
1615 if (domain->iommu_count == 1)
1616 domain->nid = iommu->node;
1617 domain_update_iommu_cap(domain);
1619 spin_unlock_irqrestore(&domain->iommu_lock, flags);
/*
 * Drop @domain's reference on @iommu. Returns the remaining IOMMU
 * refcount, or INT_MAX if the bit was not set (domain was not attached
 * to this IOMMU), so callers only free the domain on an exact 0.
 */
1622 static int domain_detach_iommu(struct dmar_domain *domain,
1623 struct intel_iommu *iommu)
1625 unsigned long flags;
1626 int count = INT_MAX;
1628 spin_lock_irqsave(&domain->iommu_lock, flags);
1629 if (test_and_clear_bit(iommu->seq_id, domain->iommu_bmp)) {
1630 count = --domain->iommu_count;
1631 domain_update_iommu_cap(domain);
1633 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1638 static struct iova_domain reserved_iova_list;
1639 static struct lock_class_key reserved_rbtree_key;
/*
 * Build the global reserved IOVA list: the IOAPIC MMIO window and every
 * PCI device's MMIO resources are reserved so DMA never maps over them
 * (avoiding peer-to-peer access). Copied into each new domain later via
 * domain_reserve_special_ranges().
 */
1641 static int dmar_init_reserved_ranges(void)
1643 struct pci_dev *pdev = NULL;
1647 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
/* Separate lockdep class: this tree is copied into per-domain trees. */
1650 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1651 &reserved_rbtree_key);
1653 /* IOAPIC ranges shouldn't be accessed by DMA */
1654 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1655 IOVA_PFN(IOAPIC_RANGE_END));
1657 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1661 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1662 for_each_pci_dev(pdev) {
1665 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1666 r = &pdev->resource[i];
1667 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1669 iova = reserve_iova(&reserved_iova_list,
1673 printk(KERN_ERR "Reserve iova failed\n");
/* Seed @domain's IOVA allocator with the global reserved ranges. */
1681 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1683 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
/*
 * Round a guest address width to the nearest AGAW-compatible width
 * (VT-d page tables step in 9-bit levels above the 12-bit page offset).
 * NOTE(review): the body is elided in this extract -- only the remainder
 * computation is visible; verify the rounding logic in the full file.
 */
1686 static inline int guestwidth_to_adjustwidth(int gaw)
1689 int r = (gaw - 12) % 9;
/*
 * Initialize a newly attached domain: set up its IOVA allocator with the
 * reserved ranges, clamp @guest_width to what the hardware supports,
 * pick a supported AGAW, cache coherency/snooping/superpage capabilities
 * from the IOMMU, and allocate the top-level page directory.
 */
1700 static int domain_init(struct dmar_domain *domain, int guest_width)
1702 struct intel_iommu *iommu;
1703 int adjust_width, agaw;
1704 unsigned long sagaw;
1706 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1708 domain_reserve_special_ranges(domain);
1710 /* calculate AGAW */
1711 iommu = domain_get_iommu(domain);
1712 if (guest_width > cap_mgaw(iommu->cap))
1713 guest_width = cap_mgaw(iommu->cap);
1714 domain->gaw = guest_width;
1715 adjust_width = guestwidth_to_adjustwidth(guest_width);
1716 agaw = width_to_agaw(adjust_width);
1717 sagaw = cap_sagaw(iommu->cap);
1718 if (!test_bit(agaw, &sagaw)) {
1719 /* hardware doesn't support it, choose a bigger one */
1720 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1721 agaw = find_next_bit(&sagaw, 5, agaw);
1725 domain->agaw = agaw;
1727 if (ecap_coherent(iommu->ecap))
1728 domain->iommu_coherency = 1;
1730 domain->iommu_coherency = 0;
1732 if (ecap_sc_support(iommu->ecap))
1733 domain->iommu_snooping = 1;
1735 domain->iommu_snooping = 0;
1737 if (intel_iommu_superpage)
1738 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1740 domain->iommu_superpage = 0;
1742 domain->nid = iommu->node;
1744 /* always allocate the top pgd */
1745 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1748 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
/*
 * Tear down @domain: flush pending lazy unmaps, detach all devices,
 * release its IOVA space and page tables, detach it from every active
 * IOMMU, free the collected page list and finally the domain itself.
 */
1752 static void domain_exit(struct dmar_domain *domain)
1754 struct dmar_drhd_unit *drhd;
1755 struct intel_iommu *iommu;
1756 struct page *freelist = NULL;
1758 /* Domain 0 is reserved, so dont process it */
1762 /* Flush any lazy unmaps that may reference this domain */
1763 if (!intel_iommu_strict)
1764 flush_unmaps_timeout(0);
1766 /* remove associated devices */
1767 domain_remove_dev_info(domain);
1770 put_iova_domain(&domain->iovad);
1772 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1774 /* clear attached or cached domains */
1776 for_each_active_iommu(iommu, drhd)
1777 iommu_detach_domain(domain, iommu);
1780 dma_free_pagelist(freelist);
1782 free_domain_mem(domain);
/*
 * Program the context entry for (bus, devfn) on @iommu to point at
 * @domain's page tables (or pass-through), then flush caches as required
 * by the hardware's caching mode. Returns non-zero on failure.
 * NOTE(review): this extract elides several lines (e.g. pgd init, the
 * non-VM id assignment, some unlock paths) -- verify against the full
 * file before reasoning about error handling here.
 */
1785 static int domain_context_mapping_one(struct dmar_domain *domain,
1786 struct intel_iommu *iommu,
1787 u8 bus, u8 devfn, int translation)
1789 struct context_entry *context;
1790 unsigned long flags;
1791 struct dma_pte *pgd;
1794 struct device_domain_info *info = NULL;
1796 pr_debug("Set context mapping for %02x:%02x.%d\n",
1797 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1799 BUG_ON(!domain->pgd);
1800 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1801 translation != CONTEXT_TT_MULTI_LEVEL);
1803 context = device_to_context_entry(iommu, bus, devfn);
1806 spin_lock_irqsave(&iommu->lock, flags);
/* Already mapped by someone else -- nothing to do. */
1807 if (context_present(context)) {
1808 spin_unlock_irqrestore(&iommu->lock, flags);
1815 if (domain_type_is_vm_or_si(domain)) {
1816 if (domain_type_is_vm(domain)) {
1817 id = iommu_attach_vm_domain(domain, iommu);
1819 spin_unlock_irqrestore(&iommu->lock, flags);
1820 pr_err("IOMMU: no free domain ids\n");
1825 /* Skip top levels of page tables for
1826 * iommu which has less agaw than default.
1827 * Unnecessary for PT mode.
1829 if (translation != CONTEXT_TT_PASS_THROUGH) {
1830 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1831 pgd = phys_to_virt(dma_pte_addr(pgd));
1832 if (!dma_pte_present(pgd)) {
1833 spin_unlock_irqrestore(&iommu->lock, flags);
1840 context_set_domain_id(context, id);
1842 if (translation != CONTEXT_TT_PASS_THROUGH) {
1843 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1844 translation = info ? CONTEXT_TT_DEV_IOTLB :
1845 CONTEXT_TT_MULTI_LEVEL;
1848 * In pass through mode, AW must be programmed to indicate the largest
1849 * AGAW value supported by hardware. And ASR is ignored by hardware.
1851 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1852 context_set_address_width(context, iommu->msagaw);
1854 context_set_address_root(context, virt_to_phys(pgd));
1855 context_set_address_width(context, iommu->agaw);
1858 context_set_translation_type(context, translation);
1859 context_set_fault_enable(context);
1860 context_set_present(context);
1861 domain_flush_cache(domain, context, sizeof(*context));
1864 * It's a non-present to present mapping. If hardware doesn't cache
1865 * non-present entry we only need to flush the write-buffer. If the
1866 * _does_ cache non-present entries, then it does so in the special
1867 * domain #0, which we have to flush:
1869 if (cap_caching_mode(iommu->cap)) {
1870 iommu->flush.flush_context(iommu, 0,
1871 (((u16)bus) << 8) | devfn,
1872 DMA_CCMD_MASK_NOBIT,
1873 DMA_CCMD_DEVICE_INVL);
1874 iommu->flush.flush_iotlb(iommu, id, 0, 0, DMA_TLB_DSI_FLUSH);
1876 iommu_flush_write_buffer(iommu);
1878 iommu_enable_dev_iotlb(info);
1879 spin_unlock_irqrestore(&iommu->lock, flags);
1881 domain_attach_iommu(domain, iommu);
/*
 * Closure passed through pci_for_each_dma_alias() to
 * domain_context_mapping_cb(). NOTE(review): the "translation" member
 * referenced by the callback is elided in this extract.
 */
1886 struct domain_context_mapping_data {
1887 struct dmar_domain *domain;
1888 struct intel_iommu *iommu;
/* pci_for_each_dma_alias() callback: map one (bus, devfn) alias. */
1892 static int domain_context_mapping_cb(struct pci_dev *pdev,
1893 u16 alias, void *opaque)
1895 struct domain_context_mapping_data *data = opaque;
1897 return domain_context_mapping_one(data->domain, data->iommu,
1898 PCI_BUS_NUM(alias), alias & 0xff,
/*
 * Set up context entries for @dev in @domain. Non-PCI devices get a
 * single mapping; PCI devices are mapped for every DMA alias.
 */
1903 domain_context_mapping(struct dmar_domain *domain, struct device *dev,
1906 struct intel_iommu *iommu;
1908 struct domain_context_mapping_data data;
1910 iommu = device_to_iommu(dev, &bus, &devfn);
1914 if (!dev_is_pci(dev))
1915 return domain_context_mapping_one(domain, iommu, bus, devfn,
1918 data.domain = domain;
1920 data.translation = translation;
1922 return pci_for_each_dma_alias(to_pci_dev(dev),
1923 &domain_context_mapping_cb, &data);
/*
 * Alias-walk callback: returns non-zero (stopping the walk) when an
 * alias is NOT context-mapped; inverted again by the caller below.
 */
1926 static int domain_context_mapped_cb(struct pci_dev *pdev,
1927 u16 alias, void *opaque)
1929 struct intel_iommu *iommu = opaque;
1931 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
/*
 * Return whether @dev (and, for PCI, all its DMA aliases) already has a
 * present context entry on its IOMMU.
 */
1934 static int domain_context_mapped(struct device *dev)
1936 struct intel_iommu *iommu;
1939 iommu = device_to_iommu(dev, &bus, &devfn);
1943 if (!dev_is_pci(dev))
1944 return device_context_mapped(iommu, bus, devfn);
1946 return !pci_for_each_dma_alias(to_pci_dev(dev),
1947 domain_context_mapped_cb, iommu);
1950 /* Returns a number of VTD pages, but aligned to MM page size */
1951 static inline unsigned long aligned_nrpages(unsigned long host_addr,
/* Keep only the sub-page offset, then round the span up to MM pages. */
1954 host_addr &= ~PAGE_MASK;
1955 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1958 /* Return largest possible superpage level for a given mapping */
1959 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1960 unsigned long iov_pfn,
1961 unsigned long phy_pfn,
1962 unsigned long pages)
1964 int support, level = 1;
1965 unsigned long pfnmerge;
1967 support = domain->iommu_superpage;
1969 /* To use a large page, the virtual *and* physical addresses
1970 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1971 of them will mean we have to use smaller pages. So just
1972 merge them and check both at once. */
1973 pfnmerge = iov_pfn | phy_pfn;
/* Climb one superpage level per 9-bit stride while alignment holds. */
1975 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1976 pages >>= VTD_STRIDE_SHIFT;
1979 pfnmerge >>= VTD_STRIDE_SHIFT;
/*
 * Core mapping routine: install PTEs for @nr_pages starting at @iov_pfn,
 * taking physical pages either from @sg (scatterlist mode) or from the
 * contiguous @phys_pfn. Uses superpages when alignment and hardware
 * support permit, and flushes the CPU cache on completed PTE pages.
 * NOTE(review): this extract elides a few lines (loop re-entry points,
 * some error returns) -- verify loop structure against the full file.
 */
1986 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1987 struct scatterlist *sg, unsigned long phys_pfn,
1988 unsigned long nr_pages, int prot)
1990 struct dma_pte *first_pte = NULL, *pte = NULL;
1991 phys_addr_t uninitialized_var(pteval);
1992 unsigned long sg_res = 0;
1993 unsigned int largepage_lvl = 0;
1994 unsigned long lvl_pages = 0;
1996 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
1998 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2001 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2005 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2008 while (nr_pages > 0) {
/* Advance to the next scatterlist element. */
2012 sg_res = aligned_nrpages(sg->offset, sg->length);
2013 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2014 sg->dma_length = sg->length;
2015 pteval = page_to_phys(sg_page(sg)) | prot;
2016 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2020 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2022 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2025 /* It is large page*/
2026 if (largepage_lvl > 1) {
2027 pteval |= DMA_PTE_LARGE_PAGE;
2028 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2030 * Ensure that old small page tables are
2031 * removed to make room for superpage,
2034 dma_pte_free_pagetable(domain, iov_pfn,
2035 iov_pfn + lvl_pages - 1);
2037 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2041 /* We don't need lock here, nobody else
2042 * touches the iova range
2044 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
/* Rate-limited diagnostic: a PTE was unexpectedly already set. */
2046 static int dumps = 5;
2047 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2048 iov_pfn, tmp, (unsigned long long)pteval);
2051 debug_dma_dump_mappings(NULL);
2056 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2058 BUG_ON(nr_pages < lvl_pages);
2059 BUG_ON(sg_res < lvl_pages);
2061 nr_pages -= lvl_pages;
2062 iov_pfn += lvl_pages;
2063 phys_pfn += lvl_pages;
2064 pteval += lvl_pages * VTD_PAGE_SIZE;
2065 sg_res -= lvl_pages;
2067 /* If the next PTE would be the first in a new page, then we
2068 need to flush the cache on the entries we've just written.
2069 And then we'll need to recalculate 'pte', so clear it and
2070 let it get set again in the if (!pte) block above.
2072 If we're done (!nr_pages) we need to flush the cache too.
2074 Also if we've been setting superpages, we may need to
2075 recalculate 'pte' and switch back to smaller pages for the
2076 end of the mapping, if the trailing size is not enough to
2077 use another superpage (i.e. sg_res < lvl_pages). */
2079 if (!nr_pages || first_pte_in_page(pte) ||
2080 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2081 domain_flush_cache(domain, first_pte,
2082 (void *)pte - (void *)first_pte);
2086 if (!sg_res && nr_pages)
/* Scatterlist front-end for __domain_mapping() (phys_pfn unused). */
2092 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2093 struct scatterlist *sg, unsigned long nr_pages,
2096 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
/* Contiguous-range front-end for __domain_mapping() (no scatterlist). */
2099 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2100 unsigned long phys_pfn, unsigned long nr_pages,
2103 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
/*
 * Clear the context entry for (bus, devfn) and issue global context and
 * IOTLB invalidations so the hardware drops any cached translations.
 */
2106 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2111 clear_context_table(iommu, bus, devfn);
2112 iommu->flush.flush_context(iommu, 0, 0, 0,
2113 DMA_CCMD_GLOBAL_INVL);
2114 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
/*
 * Remove @info from the domain and global lists and clear the device's
 * archdata back-pointer. Caller must hold device_domain_lock.
 */
2117 static inline void unlink_domain_info(struct device_domain_info *info)
2119 assert_spin_locked(&device_domain_lock);
2120 list_del(&info->link);
2121 list_del(&info->global);
2123 info->dev->archdata.iommu = NULL;
/*
 * Detach every device from @domain: unlink its info, disable ATS, clear
 * its context entry, and for VM domains also drop the per-IOMMU
 * reference. The lock is dropped around the hardware operations and
 * re-taken for each list iteration.
 */
2126 static void domain_remove_dev_info(struct dmar_domain *domain)
2128 struct device_domain_info *info, *tmp;
2129 unsigned long flags;
2131 spin_lock_irqsave(&device_domain_lock, flags);
2132 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
2133 unlink_domain_info(info);
2134 spin_unlock_irqrestore(&device_domain_lock, flags);
2136 iommu_disable_dev_iotlb(info);
2137 iommu_detach_dev(info->iommu, info->bus, info->devfn);
2139 if (domain_type_is_vm(domain)) {
2140 iommu_detach_dependent_devices(info->iommu, info->dev);
2141 domain_detach_iommu(domain, info->iommu);
2144 free_devinfo_mem(info);
2145 spin_lock_irqsave(&device_domain_lock, flags);
2147 spin_unlock_irqrestore(&device_domain_lock, flags);
2152 * Note: struct device->archdata.iommu stores the device_domain_info
2154 static struct dmar_domain *find_domain(struct device *dev)
2156 struct device_domain_info *info;
2158 /* No lock here, assumes no domain exit in normal case */
2159 info = dev->archdata.iommu;
2161 return info->domain;
/*
 * Linear search of the global device list for the entry matching
 * (segment, bus, devfn). Caller must hold device_domain_lock.
 */
2165 static inline struct device_domain_info *
2166 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2168 struct device_domain_info *info;
2170 list_for_each_entry(info, &device_domain_list, global)
2171 if (info->iommu->segment == segment && info->bus == bus &&
2172 info->devfn == devfn)
/*
 * Allocate a device_domain_info binding (bus, devfn) to @domain and link
 * it into the domain/global lists. If another thread raced us and a
 * domain already exists for the device (or its alias), free our info and
 * return the existing domain so the caller can free the duplicate.
 */
2178 static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
2181 struct dmar_domain *domain)
2183 struct dmar_domain *found = NULL;
2184 struct device_domain_info *info;
2185 unsigned long flags;
2187 info = alloc_devinfo_mem();
2192 info->devfn = devfn;
2194 info->domain = domain;
2195 info->iommu = iommu;
2197 spin_lock_irqsave(&device_domain_lock, flags);
2199 found = find_domain(dev);
2201 struct device_domain_info *info2;
2202 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2204 found = info2->domain;
2207 spin_unlock_irqrestore(&device_domain_lock, flags);
2208 free_devinfo_mem(info);
2209 /* Caller must free the original domain */
2213 list_add(&info->link, &domain->devices);
2214 list_add(&info->global, &device_domain_list);
2216 dev->archdata.iommu = info;
2217 spin_unlock_irqrestore(&device_domain_lock, flags);
/* Alias-walk callback: remember the last alias seen in *opaque. */
2222 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2224 *(u16 *)opaque = alias;
2228 /* domain is initialized */
/*
 * Find or create the DMA domain for @dev with address width @gaw.
 * PCI devices first check whether their upstream DMA alias already owns
 * a domain and reuse it; otherwise a new domain is allocated, attached,
 * initialized, and registered for both the alias and the device itself.
 * NOTE(review): some error/exit paths are elided in this extract.
 */
2229 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2231 struct dmar_domain *domain, *tmp;
2232 struct intel_iommu *iommu;
2233 struct device_domain_info *info;
2235 unsigned long flags;
2238 domain = find_domain(dev);
2242 iommu = device_to_iommu(dev, &bus, &devfn);
2246 if (dev_is_pci(dev)) {
2247 struct pci_dev *pdev = to_pci_dev(dev);
2249 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2251 spin_lock_irqsave(&device_domain_lock, flags);
2252 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2253 PCI_BUS_NUM(dma_alias),
2256 iommu = info->iommu;
2257 domain = info->domain;
2259 spin_unlock_irqrestore(&device_domain_lock, flags);
2261 /* DMA alias already has a domain, uses it */
2266 /* Allocate and initialize new domain for the device */
2267 domain = alloc_domain(0);
2270 domain->id = iommu_attach_domain(domain, iommu);
2271 if (domain->id < 0) {
2272 free_domain_mem(domain);
2275 domain_attach_iommu(domain, iommu);
2276 if (domain_init(domain, gaw)) {
2277 domain_exit(domain);
2281 /* register PCI DMA alias device */
2282 if (dev_is_pci(dev)) {
2283 tmp = dmar_insert_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2284 dma_alias & 0xff, NULL, domain);
2286 if (!tmp || tmp != domain) {
2287 domain_exit(domain);
2296 tmp = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2298 if (!tmp || tmp != domain) {
2299 domain_exit(domain);
2306 static int iommu_identity_mapping;
2307 #define IDENTMAP_ALL 1
2308 #define IDENTMAP_GFX 2
2309 #define IDENTMAP_AZALIA 4
/*
 * Create a 1:1 (virtual == physical) mapping for [start, end] in
 * @domain: reserve the IOVA range, clear any stale PTEs (RMRRs can
 * overlap RAM), then install read/write identity PTEs.
 */
2311 static int iommu_domain_identity_map(struct dmar_domain *domain,
2312 unsigned long long start,
2313 unsigned long long end)
2315 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2316 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2318 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2319 dma_to_mm_pfn(last_vpfn))) {
2320 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2324 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2325 start, end, domain->id);
2327 * RMRR range might have overlap with physical memory range,
2330 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2332 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2333 last_vpfn - first_vpfn + 1,
2334 DMA_PTE_READ|DMA_PTE_WRITE);
/*
 * Set up an identity mapping of [start, end] for @dev (used for RMRRs
 * and the ISA floppy workaround). Skipped for hardware pass-through;
 * warns about BIOSes reporting inverted or over-wide RMRR ranges.
 * NOTE(review): a few lines (range checks, returns) are elided here.
 */
2337 static int iommu_prepare_identity_map(struct device *dev,
2338 unsigned long long start,
2339 unsigned long long end)
2341 struct dmar_domain *domain;
2344 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2348 /* For _hardware_ passthrough, don't bother. But for software
2349 passthrough, we do it anyway -- it may indicate a memory
2350 range which is reserved in E820, so which didn't get set
2351 up to start with in si_domain */
2352 if (domain == si_domain && hw_pass_through) {
2353 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2354 dev_name(dev), start, end);
2359 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2360 dev_name(dev), start, end);
2363 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2364 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2365 dmi_get_system_info(DMI_BIOS_VENDOR),
2366 dmi_get_system_info(DMI_BIOS_VERSION),
2367 dmi_get_system_info(DMI_PRODUCT_VERSION));
2372 if (end >> agaw_to_width(domain->agaw)) {
2373 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2374 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2375 agaw_to_width(domain->agaw),
2376 dmi_get_system_info(DMI_BIOS_VENDOR),
2377 dmi_get_system_info(DMI_BIOS_VERSION),
2378 dmi_get_system_info(DMI_PRODUCT_VERSION));
2383 ret = iommu_domain_identity_map(domain, start, end);
2387 /* context entry init */
2388 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
/* Error path: tear the domain back down. */
2395 domain_exit(domain);
/* Identity-map one RMRR range for @dev unless the device is ignored. */
2399 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2402 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2404 return iommu_prepare_identity_map(dev, rmrr->base_address,
2408 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
/*
 * Floppy workaround: the LPC/ISA bridge needs a 0-16MiB identity
 * mapping so legacy floppy DMA keeps working under the IOMMU.
 */
2409 static inline void iommu_prepare_isa(void)
2411 struct pci_dev *pdev;
2414 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2418 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2419 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2422 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2423 "floppy might not work\n");
/* Stub when the workaround is compiled out. */
2428 static inline void iommu_prepare_isa(void)
2432 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2434 static int md_domain_init(struct dmar_domain *domain, int guest_width);
/*
 * Create the static-identity (si) domain shared by all IOMMUs, attach it
 * to each active IOMMU (all must agree on the same domain id), and --
 * for software pass-through -- identity-map every online memory range.
 * NOTE(review): a few lines (returns, the !hw branch) are elided here.
 */
2436 static int __init si_domain_init(int hw)
2438 struct dmar_drhd_unit *drhd;
2439 struct intel_iommu *iommu;
2443 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2447 for_each_active_iommu(iommu, drhd) {
2448 ret = iommu_attach_domain(si_domain, iommu);
2450 domain_exit(si_domain);
2453 si_domain->id = ret;
/* All IOMMUs must assign the same id to the si domain. */
2455 } else if (si_domain->id != ret) {
2456 domain_exit(si_domain);
2459 domain_attach_iommu(si_domain, iommu);
2462 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2463 domain_exit(si_domain);
2467 pr_debug("IOMMU: identity mapping domain is domain %d\n",
2473 for_each_online_node(nid) {
2474 unsigned long start_pfn, end_pfn;
2477 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2478 ret = iommu_domain_identity_map(si_domain,
2479 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
/*
 * Return whether @dev currently lives in the static-identity domain.
 * Cheap negative answer when identity mapping is globally disabled.
 */
2488 static int identity_mapping(struct device *dev)
2490 struct device_domain_info *info;
2492 if (likely(!iommu_identity_mapping))
2495 info = dev->archdata.iommu;
2496 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2497 return (info->domain == si_domain);
/*
 * Bind @dev to @domain: register the device info, then program its
 * context entry with @translation. On context-mapping failure the info
 * registration is rolled back via domain_remove_one_dev_info().
 */
2502 static int domain_add_dev_info(struct dmar_domain *domain,
2503 struct device *dev, int translation)
2505 struct dmar_domain *ndomain;
2506 struct intel_iommu *iommu;
2510 iommu = device_to_iommu(dev, &bus, &devfn);
2514 ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2515 if (ndomain != domain)
2518 ret = domain_context_mapping(domain, dev, translation);
2520 domain_remove_one_dev_info(domain, dev);
/* Return true if any RMRR unit's device scope includes @dev. */
2527 static bool device_has_rmrr(struct device *dev)
2529 struct dmar_rmrr_unit *rmrr;
2534 for_each_rmrr_units(rmrr) {
2536 * Return TRUE if this RMRR contains the device that
2539 for_each_active_dev_scope(rmrr->devices,
2540 rmrr->devices_cnt, i, tmp)
2551 * There are a couple cases where we need to restrict the functionality of
2552 * devices associated with RMRRs. The first is when evaluating a device for
2553 * identity mapping because problems exist when devices are moved in and out
2554 * of domains and their respective RMRR information is lost. This means that
2555 * a device with associated RMRRs will never be in a "passthrough" domain.
2556 * The second is use of the device through the IOMMU API. This interface
2557 * expects to have full control of the IOVA space for the device. We cannot
2558 * satisfy both the requirement that RMRR access is maintained and have an
2559 * unencumbered IOVA space. We also have no ability to quiesce the device's
2560 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2561 * We therefore prevent devices associated with an RMRR from participating in
2562 * the IOMMU API, which eliminates them from device assignment.
2564 * In both cases we assume that PCI USB devices with RMRRs have them largely
2565 * for historical reasons and that the RMRR space is not actively used post
2566 * boot. This exclusion may change if vendors begin to abuse it.
/*
 * Per the policy comment above: a device with an RMRR is locked out of
 * identity mapping and the IOMMU API, except PCI USB devices whose
 * RMRRs are assumed to be boot-time-only.
 */
2568 static bool device_is_rmrr_locked(struct device *dev)
2570 if (!device_has_rmrr(dev))
2573 if (dev_is_pci(dev)) {
2574 struct pci_dev *pdev = to_pci_dev(dev);
2576 if ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
/*
 * Decide whether @dev should be placed in the 1:1 (identity) domain,
 * based on the identity-mapping policy flags, RMRR locking, bridge
 * topology and -- at runtime -- the device's DMA mask.
 * NOTE(review): some return statements are elided in this extract.
 */
2583 static int iommu_should_identity_map(struct device *dev, int startup)
2586 if (dev_is_pci(dev)) {
2587 struct pci_dev *pdev = to_pci_dev(dev);
2589 if (device_is_rmrr_locked(dev))
2592 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2595 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2598 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2602 * We want to start off with all devices in the 1:1 domain, and
2603 * take them out later if we find they can't access all of memory.
2605 * However, we can't do this for PCI devices behind bridges,
2606 * because all PCI devices behind the same bridge will end up
2607 * with the same source-id on their transactions.
2609 * Practically speaking, we can't change things around for these
2610 * devices at run-time, because we can't be sure there'll be no
2611 * DMA transactions in flight for any of their siblings.
2613 * So PCI devices (unless they're on the root bus) as well as
2614 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2615 * the 1:1 domain, just in _case_ one of their siblings turns out
2616 * not to be able to map all of memory.
2618 if (!pci_is_pcie(pdev)) {
2619 if (!pci_is_root_bus(pdev->bus))
2621 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2623 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2626 if (device_has_rmrr(dev))
2631 * At boot time, we don't yet know if devices will be 64-bit capable.
2632 * Assume that they will — if they turn out not to be, then we can
2633 * take them out of the 1:1 domain later.
2637 * If the device's dma_mask is less than the system's memory
2638 * size then this is not a candidate for identity mapping.
2640 u64 dma_mask = *dev->dma_mask;
2642 if (dev->coherent_dma_mask &&
2643 dev->coherent_dma_mask < dma_mask)
2644 dma_mask = dev->coherent_dma_mask;
2646 return dma_mask >= dma_get_required_mask(dev);
/*
 * Put @dev in the si domain if policy says so: hardware pass-through
 * uses CONTEXT_TT_PASS_THROUGH, software uses a real page-table walk.
 * -ENODEV (device has no IOMMU) is tolerated silently.
 */
2652 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2656 if (!iommu_should_identity_map(dev, 1))
2659 ret = domain_add_dev_info(si_domain, dev,
2660 hw ? CONTEXT_TT_PASS_THROUGH :
2661 CONTEXT_TT_MULTI_LEVEL);
2663 pr_info("IOMMU: %s identity mapping for device %s\n",
2664 hw ? "hardware" : "software", dev_name(dev));
2665 else if (ret == -ENODEV)
2666 /* device not associated with an iommu */
/*
 * Boot-time pass: create the si domain, then offer identity mapping to
 * every PCI device and to every physical node of ACPI-enumerated
 * devices in the DRHD scopes.
 */
2673 static int __init iommu_prepare_static_identity_mapping(int hw)
2675 struct pci_dev *pdev = NULL;
2676 struct dmar_drhd_unit *drhd;
2677 struct intel_iommu *iommu;
2682 ret = si_domain_init(hw);
2686 for_each_pci_dev(pdev) {
2687 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2692 for_each_active_iommu(iommu, drhd)
2693 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2694 struct acpi_device_physical_node *pn;
2695 struct acpi_device *adev;
2697 if (dev->bus != &acpi_bus_type)
2700 adev= to_acpi_device(dev);
2701 mutex_lock(&adev->physical_node_lock);
2702 list_for_each_entry(pn, &adev->physical_node_list, node) {
2703 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2707 mutex_unlock(&adev->physical_node_lock);
/*
 * Select the invalidation mechanism for one IOMMU: prefer Queued
 * Invalidation (QI); fall back to register-based invalidation when QI
 * cannot be enabled.  Also clears stale faults and disables any QI
 * state left over from firmware handover.
 */
2715 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2718 * Start from the sane iommu hardware state.
2719 * If the queued invalidation is already initialized by us
2720 * (for example, while enabling interrupt-remapping) then
2721 * we got the things already rolling from a sane state.
2725 * Clear any previous faults.
2727 dmar_fault(-1, iommu);
2729 * Disable queued invalidation if supported and already enabled
2730 * before OS handover.
2732 dmar_disable_qi(iommu);
2735 if (dmar_enable_qi(iommu)) {
2737 * Queued Invalidate not enabled, use Register Based Invalidate
/* Register-based fallback flush callbacks. */
2739 iommu->flush.flush_context = __iommu_flush_context;
2740 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2741 pr_info("IOMMU: %s using Register based invalidation\n",
/* QI path: descriptor-queue based flush callbacks. */
2744 iommu->flush.flush_context = qi_flush_context;
2745 iommu->flush.flush_iotlb = qi_flush_iotlb;
2746 pr_info("IOMMU: %s using Queued invalidation\n", iommu->name);
/*
 * One-time boot initialization of all DMA remapping units:
 *   - count IOMMUs and allocate the global g_iommus / deferred_flush
 *     arrays (pre-sized to DMAR_UNITS_SUPPORTED for hot-add),
 *   - per-IOMMU domain/root-entry setup, QI selection,
 *   - optional static identity mapping (pass-through / gfx / isa),
 *   - RMRR reserved-region mappings,
 *   - finally program root entries, flush, and enable translation.
 * NOTE(review): elided chunk — error labels, counters and several
 * statements between the visible lines are missing from this view.
 */
2750 static int __init init_dmars(void)
2752 struct dmar_drhd_unit *drhd;
2753 struct dmar_rmrr_unit *rmrr;
2755 struct intel_iommu *iommu;
2761 * initialize and program root entry to not present
/* Count DRHD units; single-threaded __init, so no locking needed. */
2764 for_each_drhd_unit(drhd) {
2766 * lock not needed as this is only incremented in the single
2767 * threaded kernel __init code path all other access are read
2770 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
2774 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2775 DMAR_UNITS_SUPPORTED);
2778 /* Preallocate enough resources for IOMMU hot-addition */
2779 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
2780 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
2782 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2785 printk(KERN_ERR "Allocating global iommu array failed\n");
/* Per-IOMMU deferred-unmap batching tables (see flush_unmaps()). */
2790 deferred_flush = kzalloc(g_num_of_iommus *
2791 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2792 if (!deferred_flush) {
/* Per-IOMMU software state: domain ID bitmap and root table. */
2797 for_each_active_iommu(iommu, drhd) {
2798 g_iommus[iommu->seq_id] = iommu;
2800 ret = iommu_init_domains(iommu);
2806 * we could share the same root & context tables
2807 * among all IOMMU's. Need to Split it later.
2809 ret = iommu_alloc_root_entry(iommu);
/* hw pass-through only usable if EVERY unit supports it. */
2812 if (!ecap_pass_through(iommu->ecap))
2813 hw_pass_through = 0;
2816 for_each_active_iommu(iommu, drhd)
2817 intel_iommu_init_qi(iommu);
2819 if (iommu_pass_through)
2820 iommu_identity_mapping |= IDENTMAP_ALL;
2822 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2823 iommu_identity_mapping |= IDENTMAP_GFX;
/* Tylersburg chipset isochronous-traffic errata workaround. */
2826 check_tylersburg_isoch();
2829 * If pass through is not set or not enabled, setup context entries for
2830 * identity mappings for rmrr, gfx, and isa and may fall back to static
2831 * identity mapping if iommu_identity_mapping is set.
2833 if (iommu_identity_mapping) {
2834 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2836 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2842 * for each dev attached to rmrr
2844 * locate drhd for dev, alloc domain for dev
2845 * allocate free domain
2846 * allocate page table entries for rmrr
2847 * if context not allocated for bus
2848 * allocate and init context
2849 * set present in root table for this bus
2850 * init context with domain, translation etc
2854 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2855 for_each_rmrr_units(rmrr) {
2856 /* some BIOS lists non-exist devices in DMAR table. */
2857 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2859 ret = iommu_prepare_rmrr_dev(rmrr, dev);
2862 "IOMMU: mapping reserved region failed\n");
2866 iommu_prepare_isa();
2871 * global invalidate context cache
2872 * global invalidate iotlb
2873 * enable translation
2875 for_each_iommu(iommu, drhd) {
2876 if (drhd->ignored) {
2878 * we always have to disable PMRs or DMA may fail on
2882 iommu_disable_protect_mem_regions(iommu);
2886 iommu_flush_write_buffer(iommu);
2888 ret = dmar_set_interrupt(iommu);
2892 iommu_set_root_entry(iommu);
/* Global flushes before turning translation on. */
2894 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2895 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2896 iommu_enable_translation(iommu);
2897 iommu_disable_protect_mem_regions(iommu);
/* Error path: tear down all per-IOMMU state (label elided). */
2903 for_each_active_iommu(iommu, drhd) {
2904 disable_dmar_iommu(iommu);
2905 free_dmar_iommu(iommu);
2907 kfree(deferred_flush);
2914 /* This takes a number of _MM_ pages, not VTD pages */
/*
 * Allocate an IOVA range of @nrpages MM pages for @dev inside @domain,
 * honoring @dma_mask.  Prefers a sub-4GiB allocation (unless
 * dmar_forcedac) so legacy 32-bit consumers keep working; falls back
 * to the full mask.  Returns NULL on exhaustion.
 */
2915 static struct iova *intel_alloc_iova(struct device *dev,
2916 struct dmar_domain *domain,
2917 unsigned long nrpages, uint64_t dma_mask)
2919 struct iova *iova = NULL;
2921 /* Restrict dma_mask to the width that the iommu can handle */
2922 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2924 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2926 * First try to allocate an io virtual address in
2927 * DMA_BIT_MASK(32) and if that fails then try allocating
2930 iova = alloc_iova(&domain->iovad, nrpages,
2931 IOVA_PFN(DMA_BIT_MASK(32)), 1);
/* Fall back to the full device mask. */
2935 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2936 if (unlikely(!iova)) {
2937 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2938 nrpages, dev_name(dev));
/*
 * Slow path: find or create the dmar_domain for @dev and make sure its
 * context entry is programmed (multi-level translation).
 */
2945 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
2947 struct dmar_domain *domain;
2950 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2952 printk(KERN_ERR "Allocating domain for %s failed",
2957 /* make sure context mapping is ok */
2958 if (unlikely(!domain_context_mapped(dev))) {
2959 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2961 printk(KERN_ERR "Domain context map for %s failed",
/*
 * Fast path: return the cached domain from dev->archdata.iommu when
 * present, otherwise fall through to the slow path above.
 */
2970 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
2972 struct device_domain_info *info;
2974 /* No lock here, assumes no domain exit in normal case */
2975 info = dev->archdata.iommu;
2977 return info->domain;
2979 return __get_valid_domain_for_dev(dev);
/* True when the device was explicitly excluded from translation. */
2982 static int iommu_dummy(struct device *dev)
2984 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2987 /* Check if the dev needs to go through non-identity map and unmap process.*/
/*
 * Decide at map time whether @dev bypasses translation (identity map /
 * dummy) or must use a real DMA domain.  May migrate the device into
 * or out of si_domain as its 32/64-bit DMA capability dictates.
 * NOTE(review): elided chunk — return statements between the visible
 * lines are missing from this view.
 */
2988 static int iommu_no_mapping(struct device *dev)
2992 if (iommu_dummy(dev))
2995 if (!iommu_identity_mapping)
2998 found = identity_mapping(dev);
3000 if (iommu_should_identity_map(dev, 0))
3004 * 32 bit DMA is removed from si_domain and fall back
3005 * to non-identity mapping.
3007 domain_remove_one_dev_info(si_domain, dev);
3008 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
3014 * In case of a detached 64 bit DMA device from vm, the device
3015 * is put into si_domain for identity mapping.
3017 if (iommu_should_identity_map(dev, 0)) {
3019 ret = domain_add_dev_info(si_domain, dev,
3021 CONTEXT_TT_PASS_THROUGH :
3022 CONTEXT_TT_MULTI_LEVEL);
3024 printk(KERN_INFO "64bit %s uses identity mapping\n",
/*
 * Core single-buffer map: allocate an IOVA, create PTEs covering
 * [paddr, paddr+size) (whole pages), flush IOTLB (PSI) in caching
 * mode or the write buffer otherwise, and return the bus address
 * including the sub-page offset.  Returns 0/error on failure paths
 * (elided here) after freeing the IOVA.
 */
3034 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3035 size_t size, int dir, u64 dma_mask)
3037 struct dmar_domain *domain;
3038 phys_addr_t start_paddr;
3042 struct intel_iommu *iommu;
3043 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3045 BUG_ON(dir == DMA_NONE);
/* Identity-mapped/dummy devices: bus address == physical address. */
3047 if (iommu_no_mapping(dev))
3050 domain = get_valid_domain_for_dev(dev);
3054 iommu = domain_get_iommu(domain);
3055 size = aligned_nrpages(paddr, size);
3057 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3062 * Check if DMAR supports zero-length reads on write only
3065 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3066 !cap_zlr(iommu->cap))
3067 prot |= DMA_PTE_READ;
3068 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3069 prot |= DMA_PTE_WRITE;
3071 * paddr - (paddr + size) might be partial page, we should map the whole
3072 * page. Note: if two part of one page are separately mapped, we
3073 * might have two guest_addr mapping to the same host paddr, but this
3074 * is not a big problem
3076 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3077 mm_to_dma_pfn(paddr_pfn), size, prot);
3081 /* it's a non-present to present mapping. Only flush if caching mode */
3082 if (cap_caching_mode(iommu->cap))
3083 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
3085 iommu_flush_write_buffer(iommu);
3087 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3088 start_paddr += paddr & ~PAGE_MASK;
/* Error path (label elided): release the IOVA and report. */
3093 __free_iova(&domain->iovad, iova);
3094 printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
3095 dev_name(dev), size, (unsigned long long)paddr, dir);
/*
 * dma_map_ops.map_page: thin wrapper translating (page, offset) into a
 * physical address for __intel_map_single().
 */
3099 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3100 unsigned long offset, size_t size,
3101 enum dma_data_direction dir,
3102 struct dma_attrs *attrs)
3104 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3105 dir, *dev->dma_mask);
/*
 * Drain all deferred-unmap entries: per IOMMU, issue one global IOTLB
 * flush (or per-entry PSI flushes in caching mode, where global
 * flushes are expensive for the emulator), flush device IOTLBs, then
 * free the queued IOVAs and page lists.  Caller holds
 * async_umap_flush_lock (presumably — the lock lines are elided).
 */
3108 static void flush_unmaps(void)
3114 /* just flush them all */
3115 for (i = 0; i < g_num_of_iommus; i++) {
3116 struct intel_iommu *iommu = g_iommus[i];
3120 if (!deferred_flush[i].next)
3123 /* In caching mode, global flushes turn emulation expensive */
3124 if (!cap_caching_mode(iommu->cap))
3125 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3126 DMA_TLB_GLOBAL_FLUSH);
3127 for (j = 0; j < deferred_flush[i].next; j++) {
3129 struct iova *iova = deferred_flush[i].iova[j];
3130 struct dmar_domain *domain = deferred_flush[i].domain[j];
3132 /* On real hardware multiple invalidations are expensive */
3133 if (cap_caching_mode(iommu->cap))
3134 iommu_flush_iotlb_psi(iommu, domain->id,
3135 iova->pfn_lo, iova_size(iova),
3136 !deferred_flush[i].freelist[j], 0);
3138 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3139 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3140 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3142 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3143 if (deferred_flush[i].freelist[j])
3144 dma_free_pagelist(deferred_flush[i].freelist[j]);
3146 deferred_flush[i].next = 0;
/* Timer callback: drain the deferred-unmap queue under the lock. */
3152 static void flush_unmaps_timeout(unsigned long data)
3154 unsigned long flags;
3156 spin_lock_irqsave(&async_umap_flush_lock, flags);
3158 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
/*
 * Queue one (domain, iova, freelist) triple for deferred flushing and
 * arm the 10ms flush timer.  Forces an immediate flush when the queue
 * hits HIGH_WATER_MARK (branch body elided).
 */
3161 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3163 unsigned long flags;
3165 struct intel_iommu *iommu;
3167 spin_lock_irqsave(&async_umap_flush_lock, flags);
3168 if (list_size == HIGH_WATER_MARK)
3171 iommu = domain_get_iommu(dom);
3172 iommu_id = iommu->seq_id;
3174 next = deferred_flush[iommu_id].next;
3175 deferred_flush[iommu_id].domain[next] = dom;
3176 deferred_flush[iommu_id].iova[next] = iova;
3177 deferred_flush[iommu_id].freelist[next] = freelist;
3178 deferred_flush[iommu_id].next++;
3181 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3185 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
/*
 * Common unmap path: look up the IOVA for @dev_addr, tear down its
 * PTEs, then either flush + free synchronously (intel_iommu_strict)
 * or batch the release via add_unmap() to amortize IOTLB flush cost.
 */
3188 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3190 struct dmar_domain *domain;
3191 unsigned long start_pfn, last_pfn;
3193 struct intel_iommu *iommu;
3194 struct page *freelist;
/* Identity-mapped/dummy devices never had a translation to undo. */
3196 if (iommu_no_mapping(dev))
3199 domain = find_domain(dev);
3202 iommu = domain_get_iommu(domain);
3204 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3205 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3206 (unsigned long long)dev_addr))
3209 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3210 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3212 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3213 dev_name(dev), start_pfn, last_pfn);
3215 freelist = domain_unmap(domain, start_pfn, last_pfn);
3217 if (intel_iommu_strict) {
3218 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3219 last_pfn - start_pfn + 1, !freelist, 0);
3221 __free_iova(&domain->iovad, iova);
3222 dma_free_pagelist(freelist);
3224 add_unmap(domain, iova, freelist);
3226 * queue up the release of the unmap to save the 1/6th of the
3227 * cpu used up by the iotlb flush operation...
/* dma_map_ops.unmap_page: size/dir/attrs unused by intel_unmap(). */
3232 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3233 size_t size, enum dma_data_direction dir,
3234 struct dma_attrs *attrs)
3236 intel_unmap(dev, dev_addr);
/*
 * dma_map_ops.alloc: allocate page-aligned coherent memory (CMA first
 * when __GFP_WAIT allows blocking, else buddy pages), zero it, and map
 * it through __intel_map_single().  GFP_DMA/DMA32 are only forced for
 * devices that bypass the IOMMU and have a narrow coherent mask.
 */
3239 static void *intel_alloc_coherent(struct device *dev, size_t size,
3240 dma_addr_t *dma_handle, gfp_t flags,
3241 struct dma_attrs *attrs)
3243 struct page *page = NULL;
3246 size = PAGE_ALIGN(size);
3247 order = get_order(size);
/* Translated devices can use any physical page. */
3249 if (!iommu_no_mapping(dev))
3250 flags &= ~(GFP_DMA | GFP_DMA32);
3251 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3252 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
/* Blocking allocation: try the contiguous (CMA) allocator first. */
3258 if (flags & __GFP_WAIT) {
3259 unsigned int count = size >> PAGE_SHIFT;
3261 page = dma_alloc_from_contiguous(dev, count, order);
/* CMA page unusable if it exceeds an untranslated device's mask. */
3262 if (page && iommu_no_mapping(dev) &&
3263 page_to_phys(page) + size > dev->coherent_dma_mask) {
3264 dma_release_from_contiguous(dev, page, count);
3270 page = alloc_pages(flags, order);
3273 memset(page_address(page), 0, size);
3275 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3277 dev->coherent_dma_mask);
3279 return page_address(page);
/* Mapping failed: give the pages back (CMA or buddy). */
3280 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3281 __free_pages(page, order);
/*
 * dma_map_ops.free: undo intel_alloc_coherent — unmap, then release
 * to CMA or the page allocator.
 */
3286 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3287 dma_addr_t dma_handle, struct dma_attrs *attrs)
3290 struct page *page = virt_to_page(vaddr);
3292 size = PAGE_ALIGN(size);
3293 order = get_order(size);
3295 intel_unmap(dev, dma_handle);
3296 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3297 __free_pages(page, order);
/*
 * dma_map_ops.unmap_sg: the whole list occupies one contiguous IOVA
 * range (see intel_map_sg), so unmapping the first entry's address
 * releases everything.
 */
3300 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3301 int nelems, enum dma_data_direction dir,
3302 struct dma_attrs *attrs)
3304 intel_unmap(dev, sglist[0].dma_address);
/* Identity-map fallback: bus address = physical address per entry. */
3307 static int intel_nontranslate_map_sg(struct device *hddev,
3308 struct scatterlist *sglist, int nelems, int dir)
3311 struct scatterlist *sg;
3313 for_each_sg(sglist, sg, nelems, i) {
3314 BUG_ON(!sg_page(sg));
3315 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3316 sg->dma_length = sg->length;
/*
 * dma_map_ops.map_sg: size the whole list, grab one IOVA range, map
 * all entries into it with domain_sg_mapping(), then flush (PSI in
 * caching mode, write buffer otherwise).  On mapping failure the page
 * tables and IOVA are torn down.
 */
3321 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3322 enum dma_data_direction dir, struct dma_attrs *attrs)
3325 struct dmar_domain *domain;
3328 struct iova *iova = NULL;
3330 struct scatterlist *sg;
3331 unsigned long start_vpfn;
3332 struct intel_iommu *iommu;
3334 BUG_ON(dir == DMA_NONE);
3335 if (iommu_no_mapping(dev))
3336 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3338 domain = get_valid_domain_for_dev(dev);
3342 iommu = domain_get_iommu(domain);
/* Total pages needed across all entries. */
3344 for_each_sg(sglist, sg, nelems, i)
3345 size += aligned_nrpages(sg->offset, sg->length);
3347 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3350 sglist->dma_length = 0;
3355 * Check if DMAR supports zero-length reads on write only
3358 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3359 !cap_zlr(iommu->cap))
3360 prot |= DMA_PTE_READ;
3361 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3362 prot |= DMA_PTE_WRITE;
3364 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3366 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3367 if (unlikely(ret)) {
/* Roll back partially-built page tables and the IOVA. */
3368 dma_pte_free_pagetable(domain, start_vpfn,
3369 start_vpfn + size - 1);
3370 __free_iova(&domain->iovad, iova);
3374 /* it's a non-present to present mapping. Only flush if caching mode */
3375 if (cap_caching_mode(iommu->cap))
3376 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3378 iommu_flush_write_buffer(iommu);
/* dma_map_ops.mapping_error (body elided in this view). */
3383 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
/* DMA-API operations installed as the global dma_ops when VT-d is on. */
3388 struct dma_map_ops intel_dma_ops = {
3389 .alloc = intel_alloc_coherent,
3390 .free = intel_free_coherent,
3391 .map_sg = intel_map_sg,
3392 .unmap_sg = intel_unmap_sg,
3393 .map_page = intel_map_page,
3394 .unmap_page = intel_unmap_page,
3395 .mapping_error = intel_mapping_error,
/* Create the slab cache for struct dmar_domain objects. */
3398 static inline int iommu_domain_cache_init(void)
3402 iommu_domain_cache = kmem_cache_create("iommu_domain",
3403 sizeof(struct dmar_domain),
3408 if (!iommu_domain_cache) {
3409 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
/* Create the slab cache for struct device_domain_info objects. */
3416 static inline int iommu_devinfo_cache_init(void)
3420 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3421 sizeof(struct device_domain_info),
3425 if (!iommu_devinfo_cache) {
3426 printk(KERN_ERR "Couldn't create devinfo cache\n");
/*
 * Create all driver slab caches (iova, domain, devinfo); on partial
 * failure, unwind the caches created so far (labels elided here).
 */
3433 static int __init iommu_init_mempool(void)
3436 ret = iommu_iova_cache_init();
3440 ret = iommu_domain_cache_init();
3444 ret = iommu_devinfo_cache_init();
3448 kmem_cache_destroy(iommu_domain_cache);
3450 iommu_iova_cache_destroy();
/* Destroy all driver slab caches (reverse of iommu_init_mempool). */
3455 static void __init iommu_exit_mempool(void)
3457 kmem_cache_destroy(iommu_devinfo_cache);
3458 kmem_cache_destroy(iommu_domain_cache);
3459 iommu_iova_cache_destroy();
/*
 * SNB QuickData (IOAT) quirk: this device has its own IOMMU; if the
 * BIOS DMAR table assigns it to a different unit (DRHD base not at
 * VTBAR - 0xa000), mark the device as untranslated and taint.
 */
3462 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3464 struct dmar_drhd_unit *drhd;
3468 /* We know that this device on this chipset has its own IOMMU.
3469 * If we find it under a different IOMMU, then the BIOS is lying
3470 * to us. Hope that the IOMMU for this device is actually
3471 * disabled, and it needs no translation...
3473 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3475 /* "can't" happen */
3476 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3479 vtbar &= 0xffff0000;
3481 /* we know that the this iommu should be at offset 0xa000 from vtbar */
3482 drhd = dmar_find_matched_drhd_unit(pdev);
3483 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3484 TAINT_FIRMWARE_WORKAROUND,
3485 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3486 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3488 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
/*
 * Post-parse cleanup: ignore DRHD units whose device scope is empty,
 * and handle gfx-only units — either keep them (setting
 * intel_iommu_gfx_mapped) or, when gfx identity-mapping is requested,
 * mark their devices as untranslated dummies (condition elided).
 */
3490 static void __init init_no_remapping_devices(void)
3492 struct dmar_drhd_unit *drhd;
3496 for_each_drhd_unit(drhd) {
3497 if (!drhd->include_all) {
3498 for_each_active_dev_scope(drhd->devices,
3499 drhd->devices_cnt, i, dev)
3501 /* ignore DMAR unit if no devices exist */
3502 if (i == drhd->devices_cnt)
3507 for_each_active_drhd_unit(drhd) {
3508 if (drhd->include_all)
/* Scan for any non-graphics device under this unit. */
3511 for_each_active_dev_scope(drhd->devices,
3512 drhd->devices_cnt, i, dev)
3513 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3515 if (i < drhd->devices_cnt)
3518 /* This IOMMU has *only* gfx devices. Either bypass it or
3519 set the gfx_mapped flag, as appropriate */
3521 intel_iommu_gfx_mapped = 1;
3524 for_each_active_dev_scope(drhd->devices,
3525 drhd->devices_cnt, i, dev)
3526 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3531 #ifdef CONFIG_SUSPEND
/*
 * Resume-time re-initialization: re-enable QI, reprogram root entries,
 * issue global context/IOTLB flushes, and re-enable translation on
 * every non-ignored IOMMU (mirrors the enable loop in init_dmars).
 */
3532 static int init_iommu_hw(void)
3534 struct dmar_drhd_unit *drhd;
3535 struct intel_iommu *iommu = NULL;
3537 for_each_active_iommu(iommu, drhd)
3539 dmar_reenable_qi(iommu);
3541 for_each_iommu(iommu, drhd) {
3542 if (drhd->ignored) {
3544 * we always have to disable PMRs or DMA may fail on
3548 iommu_disable_protect_mem_regions(iommu);
3552 iommu_flush_write_buffer(iommu);
3554 iommu_set_root_entry(iommu);
3556 iommu->flush.flush_context(iommu, 0, 0, 0,
3557 DMA_CCMD_GLOBAL_INVL);
3558 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3559 iommu_enable_translation(iommu);
3560 iommu_disable_protect_mem_regions(iommu);
/* Global context-cache + IOTLB flush on every active IOMMU
 * (used before suspend). */
3566 static void iommu_flush_all(void)
3568 struct dmar_drhd_unit *drhd;
3569 struct intel_iommu *iommu;
3571 for_each_active_iommu(iommu, drhd) {
3572 iommu->flush.flush_context(iommu, 0, 0, 0,
3573 DMA_CCMD_GLOBAL_INVL);
3574 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3575 DMA_TLB_GLOBAL_FLUSH);
/*
 * syscore suspend hook: snapshot the fault-event registers of every
 * IOMMU into a freshly allocated iommu_state array, after disabling
 * translation.  The trailing loop is the allocation-failure cleanup
 * path (label elided in this view).
 */
3579 static int iommu_suspend(void)
3581 struct dmar_drhd_unit *drhd;
3582 struct intel_iommu *iommu = NULL;
3585 for_each_active_iommu(iommu, drhd) {
3586 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3588 if (!iommu->iommu_state)
3594 for_each_active_iommu(iommu, drhd) {
3595 iommu_disable_translation(iommu);
/* Register access must be serialized against the fault handler. */
3597 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3599 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3600 readl(iommu->reg + DMAR_FECTL_REG);
3601 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3602 readl(iommu->reg + DMAR_FEDATA_REG);
3603 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3604 readl(iommu->reg + DMAR_FEADDR_REG);
3605 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3606 readl(iommu->reg + DMAR_FEUADDR_REG);
3608 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* Error path: free any state arrays already allocated. */
3613 for_each_active_iommu(iommu, drhd)
3614 kfree(iommu->iommu_state);
/*
 * syscore resume hook: re-initialize the hardware via init_iommu_hw()
 * (panicking under tboot if that fails), restore the saved
 * fault-event registers, and free the snapshot buffers.
 */
3619 static void iommu_resume(void)
3621 struct dmar_drhd_unit *drhd;
3622 struct intel_iommu *iommu = NULL;
3625 if (init_iommu_hw()) {
3627 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3629 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3633 for_each_active_iommu(iommu, drhd) {
3635 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3637 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3638 iommu->reg + DMAR_FECTL_REG);
3639 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3640 iommu->reg + DMAR_FEDATA_REG);
3641 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3642 iommu->reg + DMAR_FEADDR_REG);
3643 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3644 iommu->reg + DMAR_FEUADDR_REG);
3646 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3649 for_each_active_iommu(iommu, drhd)
3650 kfree(iommu->iommu_state);
/* Suspend/resume hooks registered with the syscore framework. */
3653 static struct syscore_ops iommu_syscore_ops = {
3654 .resume = iommu_resume,
3655 .suspend = iommu_suspend,
3658 static void __init init_iommu_pm_ops(void)
3660 register_syscore_ops(&iommu_syscore_ops);
/* !CONFIG_SUSPEND stub (the #else line is elided in this view). */
3664 static inline void init_iommu_pm_ops(void) {}
3665 #endif /* CONFIG_PM */
/*
 * Parse one ACPI RMRR (Reserved Memory Region Reporting) structure
 * into a dmar_rmrr_unit — base/end address plus its device scope —
 * and add it to dmar_rmrr_units.
 */
3668 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3670 struct acpi_dmar_reserved_memory *rmrr;
3671 struct dmar_rmrr_unit *rmrru;
3673 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3677 rmrru->hdr = header;
3678 rmrr = (struct acpi_dmar_reserved_memory *)header;
3679 rmrru->base_address = rmrr->base_address;
3680 rmrru->end_address = rmrr->end_address;
/* Device scope entries follow the fixed-size RMRR header. */
3681 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3682 ((void *)rmrr) + rmrr->header.length,
3683 &rmrru->devices_cnt);
3684 if (rmrru->devices_cnt && rmrru->devices == NULL) {
3689 list_add(&rmrru->list, &dmar_rmrr_units);
/*
 * Find an already-registered ATSR unit that byte-for-byte matches
 * @atsr (same segment, length, and content).  RCU list walk; the
 * found/not-found returns are elided in this view.
 */
3694 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3696 struct dmar_atsr_unit *atsru;
3697 struct acpi_dmar_atsr *tmp;
3699 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3700 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3701 if (atsr->segment != tmp->segment)
3703 if (atsr->header.length != tmp->header.length)
3705 if (memcmp(atsr, tmp, atsr->header.length) == 0)
/*
 * Parse one ACPI ATSR (ATS Reporting) structure.  Deduplicates against
 * existing units, copies the ACPI buffer (it may be slab memory freed
 * by an _DSM caller), parses the device scope unless include_all, and
 * adds the unit to dmar_atsr_units under RCU.
 */
3712 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3714 struct acpi_dmar_atsr *atsr;
3715 struct dmar_atsr_unit *atsru;
/* Hot-add path only makes sense once the driver is enabled. */
3717 if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
3720 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3721 atsru = dmar_find_atsr(atsr);
/* Unit struct and a private copy of the ACPI data, back to back. */
3725 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3730 * If memory is allocated from slab by ACPI _DSM method, we need to
3731 * copy the memory content because the memory buffer will be freed
3734 atsru->hdr = (void *)(atsru + 1);
3735 memcpy(atsru->hdr, hdr, hdr->length);
3736 atsru->include_all = atsr->flags & 0x1;
3737 if (!atsru->include_all) {
3738 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3739 (void *)atsr + atsr->header.length,
3740 &atsru->devices_cnt);
3741 if (atsru->devices_cnt && atsru->devices == NULL) {
3747 list_add_rcu(&atsru->list, &dmar_atsr_units);
/* Free one ATSR unit: its device scope, then the unit itself. */
3752 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3754 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
/*
 * ACPI hot-remove callback: unlink the matching ATSR unit from the
 * RCU list and free it (grace-period handling elided in this view).
 */
3758 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3760 struct acpi_dmar_atsr *atsr;
3761 struct dmar_atsr_unit *atsru;
3763 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3764 atsru = dmar_find_atsr(atsr);
3766 list_del_rcu(&atsru->list);
3768 intel_iommu_free_atsr(atsru);
/*
 * Check whether removing the ATSR described by @hdr is safe, i.e.
 * whether its device scope still has active devices attached.
 */
3774 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3778 struct acpi_dmar_atsr *atsr;
3779 struct dmar_atsr_unit *atsru;
3781 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3782 atsru = dmar_find_atsr(atsr);
3786 if (!atsru->include_all && atsru->devices && atsru->devices_cnt)
3787 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
/*
 * Bring one hot-added DMAR unit online: verify its capabilities are
 * compatible with the running configuration (pass-through, snooping,
 * superpages), initialize domains/root entry/QI, wire up the fault
 * interrupt, attach si_domain if identity mapping is active, and
 * enable translation.  Error labels at the bottom unwind.
 */
3794 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3797 struct intel_iommu *iommu = dmaru->iommu;
/* Already initialized — nothing to do. */
3799 if (g_iommus[iommu->seq_id])
/* Compatibility checks against the global configuration. */
3802 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3803 pr_warn("IOMMU: %s doesn't support hardware pass through.\n",
3807 if (!ecap_sc_support(iommu->ecap) &&
3808 domain_update_iommu_snooping(iommu)) {
3809 pr_warn("IOMMU: %s doesn't support snooping.\n",
3813 sp = domain_update_iommu_superpage(iommu) - 1;
3814 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3815 pr_warn("IOMMU: %s doesn't support large page.\n",
3821 * Disable translation if already enabled prior to OS handover.
3823 if (iommu->gcmd & DMA_GCMD_TE)
3824 iommu_disable_translation(iommu);
3826 g_iommus[iommu->seq_id] = iommu;
3827 ret = iommu_init_domains(iommu);
3829 ret = iommu_alloc_root_entry(iommu);
3833 if (dmaru->ignored) {
3835 * we always have to disable PMRs or DMA may fail on this device
3838 iommu_disable_protect_mem_regions(iommu);
3842 intel_iommu_init_qi(iommu);
3843 iommu_flush_write_buffer(iommu);
3844 ret = dmar_set_interrupt(iommu);
3848 iommu_set_root_entry(iommu);
3849 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3850 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3851 iommu_enable_translation(iommu);
/* Attach the static identity domain when identity mapping is used. */
3854 ret = iommu_attach_domain(si_domain, iommu);
3855 if (ret < 0 || si_domain->id != ret)
3857 domain_attach_iommu(si_domain, iommu);
3860 iommu_disable_protect_mem_regions(iommu);
/* Error unwind labels (elided): disable then free the unit. */
3864 disable_dmar_iommu(iommu);
3866 free_dmar_iommu(iommu);
/*
 * DMAR unit hotplug entry point: add (intel_iommu_add) or remove
 * (disable + free) the unit, no-op until the driver is enabled.
 */
3870 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3873 struct intel_iommu *iommu = dmaru->iommu;
3875 if (!intel_iommu_enabled)
3881 ret = intel_iommu_add(dmaru);
/* Removal path. */
3883 disable_dmar_iommu(iommu);
3884 free_dmar_iommu(iommu);
/* Tear down all parsed RMRR and ATSR units (driver shutdown path). */
3890 static void intel_iommu_free_dmars(void)
3892 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3893 struct dmar_atsr_unit *atsru, *atsr_n;
3895 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3896 list_del(&rmrru->list);
3897 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3901 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3902 list_del(&atsru->list);
3903 intel_iommu_free_atsr(atsru);
/*
 * Determine whether ATS is allowed for @dev: walk up to the PCIe root
 * port above the device, then check whether that port (or an
 * include_all entry) appears in any ATSR unit for the same segment.
 */
3907 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3910 struct pci_bus *bus;
3911 struct pci_dev *bridge = NULL;
3913 struct acpi_dmar_atsr *atsr;
3914 struct dmar_atsr_unit *atsru;
/* VFs inherit the physical function's topology. */
3916 dev = pci_physfn(dev);
3917 for (bus = dev->bus; bus; bus = bus->parent) {
3919 if (!bridge || !pci_is_pcie(bridge) ||
3920 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3922 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3929 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3930 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3931 if (atsr->segment != pci_domain_nr(dev->bus))
3934 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3935 if (tmp == &bridge->dev)
3938 if (atsru->include_all)
/*
 * PCI bus-notifier callback used by the DMAR core: keep the cached
 * RMRR and ATSR device-scope arrays in sync as devices are added to
 * or removed from the system.
 */
3948 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3951 struct dmar_rmrr_unit *rmrru;
3952 struct dmar_atsr_unit *atsru;
3953 struct acpi_dmar_atsr *atsr;
3954 struct acpi_dmar_reserved_memory *rmrr;
3956 if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3959 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3960 rmrr = container_of(rmrru->hdr,
3961 struct acpi_dmar_reserved_memory, header);
3962 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3963 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3964 ((void *)rmrr) + rmrr->header.length,
3965 rmrr->segment, rmrru->devices,
3966 rmrru->devices_cnt);
3969 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3970 dmar_remove_dev_scope(info, rmrr->segment,
3971 rmrru->devices, rmrru->devices_cnt);
3975 list_for_each_entry(atsru, &dmar_atsr_units, list) {
/* include_all units have no explicit scope to update. */
3976 if (atsru->include_all)
3979 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3980 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3981 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3982 (void *)atsr + atsr->header.length,
3983 atsr->segment, atsru->devices,
3984 atsru->devices_cnt);
3989 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3990 if (dmar_remove_dev_scope(info, atsr->segment,
3991 atsru->devices, atsru->devices_cnt))
4000 * Here we only respond to action of unbound device from driver.
4002 * Added device is not attached to its DMAR domain here yet. That will happen
4003 * when mapping the device to iova.
/*
 * Bus notifier: on BUS_NOTIFY_REMOVED_DEVICE, detach the device from
 * its domain and destroy the domain if it was the last device (vm/si
 * domains are never destroyed here).
 */
4005 static int device_notifier(struct notifier_block *nb,
4006 unsigned long action, void *data)
4008 struct device *dev = data;
4009 struct dmar_domain *domain;
4011 if (iommu_dummy(dev))
4014 if (action != BUS_NOTIFY_REMOVED_DEVICE)
4017 domain = find_domain(dev);
4021 down_read(&dmar_global_lock);
4022 domain_remove_one_dev_info(domain, dev);
4023 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4024 domain_exit(domain);
4025 up_read(&dmar_global_lock);
4030 static struct notifier_block device_nb = {
4031 .notifier_call = device_notifier,
/*
 * Memory hotplug notifier for the identity (si_domain) map: extend the
 * 1:1 mapping when memory goes online; on offline/cancel, unmap the
 * range, flush each IOMMU's IOTLB, and free the page tables and IOVAs.
 */
4034 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4035 unsigned long val, void *v)
4037 struct memory_notify *mhp = v;
4038 unsigned long long start, end;
4039 unsigned long start_vpfn, last_vpfn;
4042 case MEM_GOING_ONLINE:
4043 start = mhp->start_pfn << PAGE_SHIFT;
4044 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4045 if (iommu_domain_identity_map(si_domain, start, end)) {
4046 pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
/* MEM_OFFLINE falls through here too (case label elided). */
4053 case MEM_CANCEL_ONLINE:
4054 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4055 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4056 while (start_vpfn <= last_vpfn) {
4058 struct dmar_drhd_unit *drhd;
4059 struct intel_iommu *iommu;
4060 struct page *freelist;
4062 iova = find_iova(&si_domain->iovad, start_vpfn);
4064 pr_debug("dmar: failed get IOVA for PFN %lx\n",
/* Trim the IOVA node to the portion inside the offlined range. */
4069 iova = split_and_remove_iova(&si_domain->iovad, iova,
4070 start_vpfn, last_vpfn);
4072 pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
4073 start_vpfn, last_vpfn);
4077 freelist = domain_unmap(si_domain, iova->pfn_lo,
/* Invalidate the range on every active IOMMU. */
4081 for_each_active_iommu(iommu, drhd)
4082 iommu_flush_iotlb_psi(iommu, si_domain->id,
4083 iova->pfn_lo, iova_size(iova),
4086 dma_free_pagelist(freelist);
4088 start_vpfn = iova->pfn_hi + 1;
4089 free_iova_mem(iova);
4097 static struct notifier_block intel_iommu_memory_nb = {
4098 .notifier_call = intel_iommu_memory_notifier,
/*
 * sysfs attributes exposed per IOMMU (read-only): hardware version,
 * register base address, and the raw capability / extended-capability
 * registers.
 */
4103 static ssize_t intel_iommu_show_version(struct device *dev,
4104 struct device_attribute *attr,
4107 struct intel_iommu *iommu = dev_get_drvdata(dev);
4108 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4109 return sprintf(buf, "%d:%d\n",
4110 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4112 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4114 static ssize_t intel_iommu_show_address(struct device *dev,
4115 struct device_attribute *attr,
4118 struct intel_iommu *iommu = dev_get_drvdata(dev);
4119 return sprintf(buf, "%llx\n", iommu->reg_phys);
4121 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4123 static ssize_t intel_iommu_show_cap(struct device *dev,
4124 struct device_attribute *attr,
4127 struct intel_iommu *iommu = dev_get_drvdata(dev);
4128 return sprintf(buf, "%llx\n", iommu->cap);
4130 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4132 static ssize_t intel_iommu_show_ecap(struct device *dev,
4133 struct device_attribute *attr,
4136 struct intel_iommu *iommu = dev_get_drvdata(dev);
4137 return sprintf(buf, "%llx\n", iommu->ecap);
4139 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
/* Attribute list (the cap entry line is elided in this view). */
4141 static struct attribute *intel_iommu_attrs[] = {
4142 &dev_attr_version.attr,
4143 &dev_attr_address.attr,
4145 &dev_attr_ecap.attr,
4149 static struct attribute_group intel_iommu_group = {
4150 .name = "intel-iommu",
4151 .attrs = intel_iommu_attrs,
/* Group array passed to iommu_device_create() at init time. */
4154 const struct attribute_group *intel_iommu_groups[] = {
4159 int __init intel_iommu_init(void)
4162 struct dmar_drhd_unit *drhd;
4163 struct intel_iommu *iommu;
4165 /* VT-d is required for a TXT/tboot launch, so enforce that */
4166 force_on = tboot_force_iommu();
4168 if (iommu_init_mempool()) {
4170 panic("tboot: Failed to initialize iommu memory\n");
4174 down_write(&dmar_global_lock);
4175 if (dmar_table_init()) {
4177 panic("tboot: Failed to initialize DMAR table\n");
4182 * Disable translation if already enabled prior to OS handover.
4184 for_each_active_iommu(iommu, drhd)
4185 if (iommu->gcmd & DMA_GCMD_TE)
4186 iommu_disable_translation(iommu);
4188 if (dmar_dev_scope_init() < 0) {
4190 panic("tboot: Failed to initialize DMAR device scope\n");
4194 if (no_iommu || dmar_disabled)
4197 if (list_empty(&dmar_rmrr_units))
4198 printk(KERN_INFO "DMAR: No RMRR found\n");
4200 if (list_empty(&dmar_atsr_units))
4201 printk(KERN_INFO "DMAR: No ATSR found\n");
4203 if (dmar_init_reserved_ranges()) {
4205 panic("tboot: Failed to reserve iommu ranges\n");
4206 goto out_free_reserved_range;
4209 init_no_remapping_devices();
4214 panic("tboot: Failed to initialize DMARs\n");
4215 printk(KERN_ERR "IOMMU: dmar init failed\n");
4216 goto out_free_reserved_range;
4218 up_write(&dmar_global_lock);
4220 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
4222 init_timer(&unmap_timer);
4223 #ifdef CONFIG_SWIOTLB
4226 dma_ops = &intel_dma_ops;
4228 init_iommu_pm_ops();
4230 for_each_active_iommu(iommu, drhd)
4231 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4235 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4236 bus_register_notifier(&pci_bus_type, &device_nb);
4237 if (si_domain && !hw_pass_through)
4238 register_memory_notifier(&intel_iommu_memory_nb);
4240 intel_iommu_enabled = 1;
4244 out_free_reserved_range:
4245 put_iova_domain(&reserved_iova_list);
4247 intel_iommu_free_dmars();
4248 up_write(&dmar_global_lock);
4249 iommu_exit_mempool();
4253 static int iommu_detach_dev_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4255 struct intel_iommu *iommu = opaque;
4257 iommu_detach_dev(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4262 * NB - intel-iommu lacks any sort of reference counting for the users of
4263 * dependent devices. If multiple endpoints have intersecting dependent
4264 * devices, unbinding the driver from any one of them will possibly leave
4265 * the others unable to operate.
4267 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
4270 if (!iommu || !dev || !dev_is_pci(dev))
4273 pci_for_each_dma_alias(to_pci_dev(dev), &iommu_detach_dev_cb, iommu);
4276 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4279 struct device_domain_info *info, *tmp;
4280 struct intel_iommu *iommu;
4281 unsigned long flags;
4285 iommu = device_to_iommu(dev, &bus, &devfn);
4289 spin_lock_irqsave(&device_domain_lock, flags);
4290 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4291 if (info->iommu == iommu && info->bus == bus &&
4292 info->devfn == devfn) {
4293 unlink_domain_info(info);
4294 spin_unlock_irqrestore(&device_domain_lock, flags);
4296 iommu_disable_dev_iotlb(info);
4297 iommu_detach_dev(iommu, info->bus, info->devfn);
4298 iommu_detach_dependent_devices(iommu, dev);
4299 free_devinfo_mem(info);
4301 spin_lock_irqsave(&device_domain_lock, flags);
4309 /* if there is no other devices under the same iommu
4310 * owned by this domain, clear this iommu in iommu_bmp
4311 * update iommu count and coherency
4313 if (info->iommu == iommu)
4317 spin_unlock_irqrestore(&device_domain_lock, flags);
4320 domain_detach_iommu(domain, iommu);
4321 if (!domain_type_is_vm_or_si(domain))
4322 iommu_detach_domain(domain, iommu);
4326 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4330 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4332 domain_reserve_special_ranges(domain);
4334 /* calculate AGAW */
4335 domain->gaw = guest_width;
4336 adjust_width = guestwidth_to_adjustwidth(guest_width);
4337 domain->agaw = width_to_agaw(adjust_width);
4339 domain->iommu_coherency = 0;
4340 domain->iommu_snooping = 0;
4341 domain->iommu_superpage = 0;
4342 domain->max_addr = 0;
4344 /* always allocate the top pgd */
4345 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4348 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4352 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4354 struct dmar_domain *dmar_domain;
4355 struct iommu_domain *domain;
4357 if (type != IOMMU_DOMAIN_UNMANAGED)
4360 dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4363 "intel_iommu_domain_init: dmar_domain == NULL\n");
4366 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4368 "intel_iommu_domain_init() failed\n");
4369 domain_exit(dmar_domain);
4372 domain_update_iommu_cap(dmar_domain);
4374 domain = &dmar_domain->domain;
4375 domain->geometry.aperture_start = 0;
4376 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4377 domain->geometry.force_aperture = true;
/* IOMMU-API domain_free hook: tear down the domain's page tables,
 * IOVA allocator and IOMMU attachments. */
static void intel_iommu_domain_free(struct iommu_domain *domain)
{
	domain_exit(to_dmar_domain(domain));
}
4387 static int intel_iommu_attach_device(struct iommu_domain *domain,
4390 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4391 struct intel_iommu *iommu;
4395 if (device_is_rmrr_locked(dev)) {
4396 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4400 /* normally dev is not mapped */
4401 if (unlikely(domain_context_mapped(dev))) {
4402 struct dmar_domain *old_domain;
4404 old_domain = find_domain(dev);
4406 if (domain_type_is_vm_or_si(dmar_domain))
4407 domain_remove_one_dev_info(old_domain, dev);
4409 domain_remove_dev_info(old_domain);
4411 if (!domain_type_is_vm_or_si(old_domain) &&
4412 list_empty(&old_domain->devices))
4413 domain_exit(old_domain);
4417 iommu = device_to_iommu(dev, &bus, &devfn);
4421 /* check if this iommu agaw is sufficient for max mapped address */
4422 addr_width = agaw_to_width(iommu->agaw);
4423 if (addr_width > cap_mgaw(iommu->cap))
4424 addr_width = cap_mgaw(iommu->cap);
4426 if (dmar_domain->max_addr > (1LL << addr_width)) {
4427 printk(KERN_ERR "%s: iommu width (%d) is not "
4428 "sufficient for the mapped address (%llx)\n",
4429 __func__, addr_width, dmar_domain->max_addr);
4432 dmar_domain->gaw = addr_width;
4435 * Knock out extra levels of page tables if necessary
4437 while (iommu->agaw < dmar_domain->agaw) {
4438 struct dma_pte *pte;
4440 pte = dmar_domain->pgd;
4441 if (dma_pte_present(pte)) {
4442 dmar_domain->pgd = (struct dma_pte *)
4443 phys_to_virt(dma_pte_addr(pte));
4444 free_pgtable_page(pte);
4446 dmar_domain->agaw--;
4449 return domain_add_dev_info(dmar_domain, dev, CONTEXT_TT_MULTI_LEVEL);
/* IOMMU-API detach_dev hook: unbind @dev from @domain. */
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	domain_remove_one_dev_info(to_dmar_domain(domain), dev);
}
4458 static int intel_iommu_map(struct iommu_domain *domain,
4459 unsigned long iova, phys_addr_t hpa,
4460 size_t size, int iommu_prot)
4462 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4467 if (iommu_prot & IOMMU_READ)
4468 prot |= DMA_PTE_READ;
4469 if (iommu_prot & IOMMU_WRITE)
4470 prot |= DMA_PTE_WRITE;
4471 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4472 prot |= DMA_PTE_SNP;
4474 max_addr = iova + size;
4475 if (dmar_domain->max_addr < max_addr) {
4478 /* check if minimum agaw is sufficient for mapped address */
4479 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4480 if (end < max_addr) {
4481 printk(KERN_ERR "%s: iommu width (%d) is not "
4482 "sufficient for the mapped address (%llx)\n",
4483 __func__, dmar_domain->gaw, max_addr);
4486 dmar_domain->max_addr = max_addr;
4488 /* Round up size to next multiple of PAGE_SIZE, if it and
4489 the low bits of hpa would take us onto the next page */
4490 size = aligned_nrpages(hpa, size);
4491 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4492 hpa >> VTD_PAGE_SHIFT, size, prot);
4496 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4497 unsigned long iova, size_t size)
4499 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4500 struct page *freelist = NULL;
4501 struct intel_iommu *iommu;
4502 unsigned long start_pfn, last_pfn;
4503 unsigned int npages;
4504 int iommu_id, num, ndomains, level = 0;
4506 /* Cope with horrid API which requires us to unmap more than the
4507 size argument if it happens to be a large-page mapping. */
4508 if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4511 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4512 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4514 start_pfn = iova >> VTD_PAGE_SHIFT;
4515 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4517 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4519 npages = last_pfn - start_pfn + 1;
4521 for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4522 iommu = g_iommus[iommu_id];
4525 * find bit position of dmar_domain
4527 ndomains = cap_ndoms(iommu->cap);
4528 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4529 if (iommu->domains[num] == dmar_domain)
4530 iommu_flush_iotlb_psi(iommu, num, start_pfn,
4531 npages, !freelist, 0);
4536 dma_free_pagelist(freelist);
4538 if (dmar_domain->max_addr == iova + size)
4539 dmar_domain->max_addr = iova;
4544 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4547 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4548 struct dma_pte *pte;
4552 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4554 phys = dma_pte_addr(pte);
4559 static bool intel_iommu_capable(enum iommu_cap cap)
4561 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4562 return domain_update_iommu_snooping(NULL) == 1;
4563 if (cap == IOMMU_CAP_INTR_REMAP)
4564 return irq_remapping_enabled == 1;
4569 static int intel_iommu_add_device(struct device *dev)
4571 struct intel_iommu *iommu;
4572 struct iommu_group *group;
4575 iommu = device_to_iommu(dev, &bus, &devfn);
4579 iommu_device_link(iommu->iommu_dev, dev);
4581 group = iommu_group_get_for_dev(dev);
4584 return PTR_ERR(group);
4586 iommu_group_put(group);
4590 static void intel_iommu_remove_device(struct device *dev)
4592 struct intel_iommu *iommu;
4595 iommu = device_to_iommu(dev, &bus, &devfn);
4599 iommu_group_remove_device(dev);
4601 iommu_device_unlink(iommu->iommu_dev, dev);
4604 static const struct iommu_ops intel_iommu_ops = {
4605 .capable = intel_iommu_capable,
4606 .domain_alloc = intel_iommu_domain_alloc,
4607 .domain_free = intel_iommu_domain_free,
4608 .attach_dev = intel_iommu_attach_device,
4609 .detach_dev = intel_iommu_detach_device,
4610 .map = intel_iommu_map,
4611 .unmap = intel_iommu_unmap,
4612 .map_sg = default_iommu_map_sg,
4613 .iova_to_phys = intel_iommu_iova_to_phys,
4614 .add_device = intel_iommu_add_device,
4615 .remove_device = intel_iommu_remove_device,
4616 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
4619 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4621 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4622 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4626 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4627 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4628 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4629 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4630 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4631 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4632 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4634 static void quirk_iommu_rwbf(struct pci_dev *dev)
4637 * Mobile 4 Series Chipset neglects to set RWBF capability,
4638 * but needs it. Same seems to hold for the desktop versions.
4640 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4644 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4645 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4646 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4647 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4648 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4649 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4650 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4653 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4654 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4655 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4656 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4657 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4658 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4659 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4660 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4662 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4666 if (pci_read_config_word(dev, GGC, &ggc))
4669 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4670 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4672 } else if (dmar_map_gfx) {
4673 /* we have to ensure the gfx device is idle before we flush */
4674 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4675 intel_iommu_strict = 1;
4678 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4679 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4680 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4681 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4683 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4684 ISOCH DMAR unit for the Azalia sound device, but not give it any
4685 TLB entries, which causes it to deadlock. Check for that. We do
4686 this in a function called from init_dmars(), instead of in a PCI
4687 quirk, because we don't want to print the obnoxious "BIOS broken"
4688 message if VT-d is actually disabled.
4690 static void __init check_tylersburg_isoch(void)
4692 struct pci_dev *pdev;
4693 uint32_t vtisochctrl;
4695 /* If there's no Azalia in the system anyway, forget it. */
4696 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4701 /* System Management Registers. Might be hidden, in which case
4702 we can't do the sanity check. But that's OK, because the
4703 known-broken BIOSes _don't_ actually hide it, so far. */
4704 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4708 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4715 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4716 if (vtisochctrl & 1)
4719 /* Drop all bits other than the number of TLB entries */
4720 vtisochctrl &= 0x1c;
4722 /* If we have the recommended number of TLB entries (16), fine. */
4723 if (vtisochctrl == 0x10)
4726 /* Zero TLB entries? You get to ride the short bus to school. */
4728 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4729 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4730 dmi_get_system_info(DMI_BIOS_VENDOR),
4731 dmi_get_system_info(DMI_BIOS_VERSION),
4732 dmi_get_system_info(DMI_PRODUCT_VERSION));
4733 iommu_identity_mapping |= IDENTMAP_AZALIA;
4737 printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",