2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
20 #include <linux/init.h>
21 #include <linux/bitmap.h>
22 #include <linux/debugfs.h>
23 #include <linux/export.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/spinlock.h>
28 #include <linux/pci.h>
29 #include <linux/dmar.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/mempool.h>
32 #include <linux/memory.h>
33 #include <linux/timer.h>
34 #include <linux/iova.h>
35 #include <linux/iommu.h>
36 #include <linux/intel-iommu.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <linux/dma-contiguous.h>
43 #include <asm/irq_remapping.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
47 #include "irq_remapping.h"
49 #define ROOT_SIZE VTD_PAGE_SIZE
50 #define CONTEXT_SIZE VTD_PAGE_SIZE
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
56 #define IOAPIC_RANGE_START (0xfee00000)
57 #define IOAPIC_RANGE_END (0xfeefffff)
58 #define IOVA_START_ADDR (0x1000)
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
62 #define MAX_AGAW_WIDTH 64
63 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
65 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
66 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
68 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
69 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
70 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
71 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
72 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
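/*
 * Illustrative sketch (not part of the driver): for the default 48-bit guest
 * address width the macros above give a maximum PFN of 2^36 - 1; the min_t()
 * clamp only matters on a 32-bit kernel, where the result must still fit in
 * an unsigned long.  Names prefixed sketch_/SKETCH_ are hypothetical and the
 * block is excluded from the build.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define SKETCH_VTD_PAGE_SHIFT	12	/* assumed; matches VTD_PAGE_SHIFT */

static uint64_t sketch_domain_max_pfn(int gaw)
{
	uint64_t max_pfn = (((uint64_t)1) << (gaw - SKETCH_VTD_PAGE_SHIFT)) - 1;

	/* clamp to what an unsigned long can hold, as DOMAIN_MAX_PFN() does */
	if (max_pfn > (unsigned long)-1)
		max_pfn = (unsigned long)-1;
	return max_pfn;
}

int main(void)
{
	/* 48-bit gaw => PFNs 0 .. 0xfffffffff (2^36 - 1) */
	printf("max pfn for gaw 48: %#llx\n",
	       (unsigned long long)sketch_domain_max_pfn(48));
	return 0;
}
#endif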
74 /* IO virtual address start page frame number */
75 #define IOVA_START_PFN (1)
77 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
78 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
79 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
81 /* page table handling */
82 #define LEVEL_STRIDE (9)
83 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
86 * This bitmap is used to advertise the page sizes our hardware supports
87 * to the IOMMU core, which will then use this information to split
88 * physically contiguous memory regions it is mapping into page sizes
91 * Traditionally the IOMMU core just handed us the mappings directly,
92 * after making sure the size is an order of a 4KiB page and that the
93 * mapping has natural alignment.
95 * To retain this behavior, we currently advertise that we support
96 * all page sizes that are an order of 4KiB.
98 * If at some point we'd like to utilize the IOMMU core's new behavior,
99 * we could change this to advertise the real page sizes we support.
101 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
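/*
 * Illustrative sketch (not part of the driver): in the bitmap handed to the
 * IOMMU core, bit k being set advertises support for a page size of 2^k
 * bytes, so ~0xFFFUL advertises every power-of-two size from 4KiB upwards.
 * The block below is excluded from the build.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long pgsizes = ~0xFFFUL;	/* same value as INTEL_IOMMU_PGSIZES */
	int bit;

	/* bit k set => a page size of 2^k bytes is advertised */
	for (bit = 0; bit < 31; bit++)
		if (pgsizes & (1UL << bit))
			printf("advertised page size: %lu bytes\n", 1UL << bit);
	return 0;
}
#endif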
103 static inline int agaw_to_level(int agaw)
108 static inline int agaw_to_width(int agaw)
110 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
113 static inline int width_to_agaw(int width)
115 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
118 static inline unsigned int level_to_offset_bits(int level)
120 return (level - 1) * LEVEL_STRIDE;
123 static inline int pfn_level_offset(unsigned long pfn, int level)
125 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
128 static inline unsigned long level_mask(int level)
130 return -1UL << level_to_offset_bits(level);
133 static inline unsigned long level_size(int level)
135 return 1UL << level_to_offset_bits(level);
138 static inline unsigned long align_to_level(unsigned long pfn, int level)
140 return (pfn + level_size(level) - 1) & level_mask(level);
143 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
145 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
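/*
 * Illustrative sketch (not part of the driver): with a 9-bit stride, level 1
 * indexes pfn bits 0-8, level 2 bits 9-17, and so on; the loop below
 * recomputes the per-level table index the same way pfn_level_offset() does.
 * The pfn is an arbitrary example and the block is excluded from the build.
 */
#if 0
#include <stdio.h>

#define SKETCH_LEVEL_STRIDE	9	/* assumed; matches LEVEL_STRIDE */
#define SKETCH_LEVEL_MASK	((1UL << SKETCH_LEVEL_STRIDE) - 1)

int main(void)
{
	unsigned long pfn = 0x12345678UL;	/* arbitrary VT-d page frame number */
	int level;

	for (level = 4; level >= 1; level--) {
		unsigned int offset_bits = (level - 1) * SKETCH_LEVEL_STRIDE;
		unsigned long index = (pfn >> offset_bits) & SKETCH_LEVEL_MASK;

		printf("level %d: table index %lu\n", level, index);
	}
	return 0;
}
#endif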
148 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
149 are never going to work. */
150 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
152 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
155 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
157 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
159 static inline unsigned long page_to_dma_pfn(struct page *pg)
161 return mm_to_dma_pfn(page_to_pfn(pg));
163 static inline unsigned long virt_to_dma_pfn(void *p)
165 return page_to_dma_pfn(virt_to_page(p));
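/*
 * Illustrative sketch (not part of the driver): on x86 both PAGE_SHIFT and
 * VTD_PAGE_SHIFT are 12, so the conversions above are no-op shifts.  The
 * sketch uses a hypothetical 64KiB kernel page size to show the general
 * case, where one MM page spans several 4KiB VT-d pages.  Excluded from the
 * build.
 */
#if 0
#include <stdio.h>

#define SKETCH_VTD_PAGE_SHIFT	12	/* VT-d pages are always 4KiB */
#define SKETCH_MM_PAGE_SHIFT	16	/* hypothetical 64KiB kernel pages */

int main(void)
{
	unsigned long mm_pfn = 5;
	unsigned long dma_pfn =
		mm_pfn << (SKETCH_MM_PAGE_SHIFT - SKETCH_VTD_PAGE_SHIFT);

	/* one 64KiB MM page spans 16 contiguous 4KiB VT-d pages */
	printf("mm pfn %lu starts at dma pfn %lu\n", mm_pfn, dma_pfn);
	return 0;
}
#endif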
168 /* global iommu list, set NULL for ignored DMAR units */
169 static struct intel_iommu **g_iommus;
171 static void __init check_tylersburg_isoch(void);
172 static int rwbf_quirk;
175 * set to 1 to panic the kernel if VT-d cannot be successfully enabled
176 * (used when the kernel is launched with TXT)
178 static int force_on = 0;
183 * 12-63: Context Ptr (12 - (haw-1))
190 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
191 static inline bool root_present(struct root_entry *root)
193 return (root->val & 1);
195 static inline void set_root_present(struct root_entry *root)
199 static inline void set_root_value(struct root_entry *root, unsigned long value)
201 root->val &= ~VTD_PAGE_MASK;
202 root->val |= value & VTD_PAGE_MASK;
205 static inline struct context_entry *
206 get_context_addr_from_root(struct root_entry *root)
208 return (struct context_entry *)
209 (root_present(root)?phys_to_virt(
210 root->val & VTD_PAGE_MASK) :
217 * 1: fault processing disable
218 * 2-3: translation type
219 * 12-63: address space root
225 struct context_entry {
230 static inline bool context_present(struct context_entry *context)
232 return (context->lo & 1);
234 static inline void context_set_present(struct context_entry *context)
239 static inline void context_set_fault_enable(struct context_entry *context)
241 context->lo &= (((u64)-1) << 2) | 1;
244 static inline void context_set_translation_type(struct context_entry *context,
247 context->lo &= (((u64)-1) << 4) | 3;
248 context->lo |= (value & 3) << 2;
251 static inline void context_set_address_root(struct context_entry *context,
254 context->lo &= ~VTD_PAGE_MASK;
255 context->lo |= value & VTD_PAGE_MASK;
258 static inline void context_set_address_width(struct context_entry *context,
261 context->hi |= value & 7;
264 static inline void context_set_domain_id(struct context_entry *context,
267 context->hi |= (value & ((1 << 16) - 1)) << 8;
270 static inline void context_clear_entry(struct context_entry *context)
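/*
 * Illustrative sketch (not part of the driver): a context entry is two
 * 64-bit words, and the helpers above OR each field into its bit position.
 * The root address, address width and domain id below are made-up example
 * values.  Excluded from the build.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t lo = 0, hi = 0;
	uint64_t asr = 0x123456000ULL;	/* hypothetical 4KiB-aligned table root */

	lo |= asr & ~0xFFFULL;			/* bits 12-63: address space root */
	lo |= (uint64_t)(0 & 3) << 2;		/* bits 2-3: translation type */
	lo |= 1;				/* bit 0: present */
	hi |= 2 & 7;				/* bits 0-2: address width */
	hi |= (uint64_t)(42 & 0xFFFF) << 8;	/* bits 8-23: domain id */

	printf("context entry lo=%#llx hi=%#llx\n",
	       (unsigned long long)lo, (unsigned long long)hi);
	return 0;
}
#endif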
283 * 12-63: Host physical address
289 static inline void dma_clear_pte(struct dma_pte *pte)
294 static inline u64 dma_pte_addr(struct dma_pte *pte)
297 return pte->val & VTD_PAGE_MASK;
299 /* Must have a full atomic 64-bit read */
300 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
304 static inline bool dma_pte_present(struct dma_pte *pte)
306 return (pte->val & 3) != 0;
309 static inline bool dma_pte_superpage(struct dma_pte *pte)
311 return (pte->val & DMA_PTE_LARGE_PAGE);
314 static inline int first_pte_in_page(struct dma_pte *pte)
316 return !((unsigned long)pte & ~VTD_PAGE_MASK);
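/*
 * Illustrative sketch (not part of the driver): 512 eight-byte PTEs fill one
 * 4KiB table page, so first_pte_in_page() only has to test whether the PTE
 * pointer's low 12 bits are zero.  The table address below is hypothetical.
 * Excluded from the build.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uintptr_t table = 0x1000;	/* hypothetical 4KiB-aligned table page */
	int i;

	for (i = 0; i < 512; i += 511) {
		uintptr_t pte = table + (uintptr_t)i * 8;

		/* entry 0 is the first PTE in its page; entry 511 is not */
		printf("pte %3d first in page? %d\n", i, !(pte & 0xFFF));
	}
	return 0;
}
#endif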
320 * This domain is a static identity mapping domain.
321 * 1. This domain creates a static 1:1 mapping to all usable memory.
322 * 2. It maps to each iommu if successful.
323 * 3. Each iommu maps to this domain if successful.
325 static struct dmar_domain *si_domain;
326 static int hw_pass_through = 1;
328 /* domain represents a virtual machine; more than one device
329 * across iommus may be owned by one domain, e.g. a kvm guest.
331 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
333 /* si_domain contains multiple devices */
334 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
337 int id; /* domain id */
338 int nid; /* node id */
339 DECLARE_BITMAP(iommu_bmp, DMAR_UNITS_SUPPORTED);
340 /* bitmap of iommus this domain uses*/
342 struct list_head devices; /* all devices' list */
343 struct iova_domain iovad; /* iova's that belong to this domain */
345 struct dma_pte *pgd; /* virtual address */
346 int gaw; /* max guest address width */
348 /* adjusted guest address width, 0 is level 2 30-bit */
351 int flags; /* flags to find out type of domain */
353 int iommu_coherency;/* indicate coherency of iommu access */
354 int iommu_snooping; /* indicate snooping control feature*/
355 int iommu_count; /* reference count of iommu */
356 int iommu_superpage;/* Level of superpages supported:
357 0 == 4KiB (no superpages), 1 == 2MiB,
358 2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
359 spinlock_t iommu_lock; /* protect iommu set in domain */
360 u64 max_addr; /* maximum mapped address */
362 struct iommu_domain domain; /* generic domain data structure for
366 /* PCI domain-device relationship */
367 struct device_domain_info {
368 struct list_head link; /* link to domain siblings */
369 struct list_head global; /* link to global list */
370 u8 bus; /* PCI bus number */
371 u8 devfn; /* PCI devfn number */
372 struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
373 struct intel_iommu *iommu; /* IOMMU used by this device */
374 struct dmar_domain *domain; /* pointer to domain */
377 struct dmar_rmrr_unit {
378 struct list_head list; /* list of rmrr units */
379 struct acpi_dmar_header *hdr; /* ACPI header */
380 u64 base_address; /* reserved base address*/
381 u64 end_address; /* reserved end address */
382 struct dmar_dev_scope *devices; /* target devices */
383 int devices_cnt; /* target device count */
386 struct dmar_atsr_unit {
387 struct list_head list; /* list of ATSR units */
388 struct acpi_dmar_header *hdr; /* ACPI header */
389 struct dmar_dev_scope *devices; /* target devices */
390 int devices_cnt; /* target device count */
391 u8 include_all:1; /* include all ports */
394 static LIST_HEAD(dmar_atsr_units);
395 static LIST_HEAD(dmar_rmrr_units);
397 #define for_each_rmrr_units(rmrr) \
398 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
400 static void flush_unmaps_timeout(unsigned long data);
402 static DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
404 #define HIGH_WATER_MARK 250
405 struct deferred_flush_tables {
407 struct iova *iova[HIGH_WATER_MARK];
408 struct dmar_domain *domain[HIGH_WATER_MARK];
409 struct page *freelist[HIGH_WATER_MARK];
412 static struct deferred_flush_tables *deferred_flush;
414 /* bitmap for indexing intel_iommus */
415 static int g_num_of_iommus;
417 static DEFINE_SPINLOCK(async_umap_flush_lock);
418 static LIST_HEAD(unmaps_to_do);
421 static long list_size;
423 static void domain_exit(struct dmar_domain *domain);
424 static void domain_remove_dev_info(struct dmar_domain *domain);
425 static void domain_remove_one_dev_info(struct dmar_domain *domain,
427 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
429 static int domain_detach_iommu(struct dmar_domain *domain,
430 struct intel_iommu *iommu);
432 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
433 int dmar_disabled = 0;
435 int dmar_disabled = 1;
436 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
438 int intel_iommu_enabled = 0;
439 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
441 static int dmar_map_gfx = 1;
442 static int dmar_forcedac;
443 static int intel_iommu_strict;
444 static int intel_iommu_superpage = 1;
446 int intel_iommu_gfx_mapped;
447 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
449 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
450 static DEFINE_SPINLOCK(device_domain_lock);
451 static LIST_HEAD(device_domain_list);
453 static const struct iommu_ops intel_iommu_ops;
455 /* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
456 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
458 return container_of(dom, struct dmar_domain, domain);
461 static int __init intel_iommu_setup(char *str)
466 if (!strncmp(str, "on", 2)) {
468 printk(KERN_INFO "Intel-IOMMU: enabled\n");
469 } else if (!strncmp(str, "off", 3)) {
471 printk(KERN_INFO "Intel-IOMMU: disabled\n");
472 } else if (!strncmp(str, "igfx_off", 8)) {
475 "Intel-IOMMU: disable GFX device mapping\n");
476 } else if (!strncmp(str, "forcedac", 8)) {
478 "Intel-IOMMU: Forcing DAC for PCI devices\n");
480 } else if (!strncmp(str, "strict", 6)) {
482 "Intel-IOMMU: disable batched IOTLB flush\n");
483 intel_iommu_strict = 1;
484 } else if (!strncmp(str, "sp_off", 6)) {
486 "Intel-IOMMU: disable supported super page\n");
487 intel_iommu_superpage = 0;
490 str += strcspn(str, ",");
496 __setup("intel_iommu=", intel_iommu_setup);
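/*
 * Illustrative sketch (not part of the driver): the boot parameter takes
 * several comma-separated options, e.g. "intel_iommu=on,strict,sp_off"; the
 * loop below mirrors how the parser above uses strcspn() to step past each
 * option in turn.  Excluded from the build.
 */
#if 0
#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[] = "on,strict,sp_off";	/* hypothetical boot argument */
	char *str = buf;

	while (*str) {
		/* each option is matched against the remaining string ... */
		printf("remaining options: %s\n", str);
		/* ... then the cursor skips to just past the next comma */
		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
#endif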
498 static struct kmem_cache *iommu_domain_cache;
499 static struct kmem_cache *iommu_devinfo_cache;
501 static inline void *alloc_pgtable_page(int node)
506 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
508 vaddr = page_address(page);
512 static inline void free_pgtable_page(void *vaddr)
514 free_page((unsigned long)vaddr);
517 static inline void *alloc_domain_mem(void)
519 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
522 static void free_domain_mem(void *vaddr)
524 kmem_cache_free(iommu_domain_cache, vaddr);
527 static inline void * alloc_devinfo_mem(void)
529 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
532 static inline void free_devinfo_mem(void *vaddr)
534 kmem_cache_free(iommu_devinfo_cache, vaddr);
537 static inline int domain_type_is_vm(struct dmar_domain *domain)
539 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
542 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
544 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
545 DOMAIN_FLAG_STATIC_IDENTITY);
548 static inline int domain_pfn_supported(struct dmar_domain *domain,
551 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
553 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
556 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
561 sagaw = cap_sagaw(iommu->cap);
562 for (agaw = width_to_agaw(max_gaw);
564 if (test_bit(agaw, &sagaw))
572 * Calculate max SAGAW for each iommu.
574 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
576 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
580 * Calculate agaw for each iommu.
581 * "SAGAW" may be different across iommus; use a default agaw, and
582 * fall back to a smaller supported agaw for iommus that don't support the default.
584 int iommu_calculate_agaw(struct intel_iommu *iommu)
586 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
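/*
 * Illustrative sketch (not part of the driver): for the default 48-bit
 * width, width_to_agaw() yields agaw 2 (a 4-level table); the loop then
 * walks down the SAGAW capability bitmap until it finds a width the
 * hardware supports.  The SAGAW value below is made up.  Excluded from the
 * build.
 */
#if 0
#include <stdio.h>

static int sketch_width_to_agaw(int width)
{
	return (width - 30 + 8) / 9;	/* DIV_ROUND_UP(width - 30, LEVEL_STRIDE) */
}

int main(void)
{
	unsigned long sagaw = 0x4;	/* hypothetical: only agaw 2 supported */
	int agaw;

	for (agaw = sketch_width_to_agaw(48); agaw >= 0; agaw--)
		if (sagaw & (1UL << agaw))
			break;

	printf("chosen agaw: %d\n", agaw);	/* prints 2 */
	return 0;
}
#endif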
589 /* This function only returns a single iommu in a domain */
590 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
594 /* si_domain and vm domain should not get here. */
595 BUG_ON(domain_type_is_vm_or_si(domain));
596 iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
597 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
600 return g_iommus[iommu_id];
603 static void domain_update_iommu_coherency(struct dmar_domain *domain)
605 struct dmar_drhd_unit *drhd;
606 struct intel_iommu *iommu;
610 domain->iommu_coherency = 1;
612 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
614 if (!ecap_coherent(g_iommus[i]->ecap)) {
615 domain->iommu_coherency = 0;
622 /* No hardware attached; use lowest common denominator */
624 for_each_active_iommu(iommu, drhd) {
625 if (!ecap_coherent(iommu->ecap)) {
626 domain->iommu_coherency = 0;
633 static int domain_update_iommu_snooping(struct intel_iommu *skip)
635 struct dmar_drhd_unit *drhd;
636 struct intel_iommu *iommu;
640 for_each_active_iommu(iommu, drhd) {
642 if (!ecap_sc_support(iommu->ecap)) {
653 static int domain_update_iommu_superpage(struct intel_iommu *skip)
655 struct dmar_drhd_unit *drhd;
656 struct intel_iommu *iommu;
659 if (!intel_iommu_superpage) {
663 /* set iommu_superpage to the smallest common denominator */
665 for_each_active_iommu(iommu, drhd) {
667 mask &= cap_super_page_val(iommu->cap);
677 /* Some capabilities may be different across iommus */
678 static void domain_update_iommu_cap(struct dmar_domain *domain)
680 domain_update_iommu_coherency(domain);
681 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
682 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
685 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
687 struct dmar_drhd_unit *drhd = NULL;
688 struct intel_iommu *iommu;
690 struct pci_dev *ptmp, *pdev = NULL;
694 if (dev_is_pci(dev)) {
695 pdev = to_pci_dev(dev);
696 segment = pci_domain_nr(pdev->bus);
697 } else if (has_acpi_companion(dev))
698 dev = &ACPI_COMPANION(dev)->dev;
701 for_each_active_iommu(iommu, drhd) {
702 if (pdev && segment != drhd->segment)
705 for_each_active_dev_scope(drhd->devices,
706 drhd->devices_cnt, i, tmp) {
708 *bus = drhd->devices[i].bus;
709 *devfn = drhd->devices[i].devfn;
713 if (!pdev || !dev_is_pci(tmp))
716 ptmp = to_pci_dev(tmp);
717 if (ptmp->subordinate &&
718 ptmp->subordinate->number <= pdev->bus->number &&
719 ptmp->subordinate->busn_res.end >= pdev->bus->number)
723 if (pdev && drhd->include_all) {
725 *bus = pdev->bus->number;
726 *devfn = pdev->devfn;
737 static void domain_flush_cache(struct dmar_domain *domain,
738 void *addr, int size)
740 if (!domain->iommu_coherency)
741 clflush_cache_range(addr, size);
744 /* Gets context entry for a given bus and devfn */
745 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
748 struct root_entry *root;
749 struct context_entry *context;
750 unsigned long phy_addr;
753 spin_lock_irqsave(&iommu->lock, flags);
754 root = &iommu->root_entry[bus];
755 context = get_context_addr_from_root(root);
757 context = (struct context_entry *)
758 alloc_pgtable_page(iommu->node);
760 spin_unlock_irqrestore(&iommu->lock, flags);
763 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
764 phy_addr = virt_to_phys((void *)context);
765 set_root_value(root, phy_addr);
766 set_root_present(root);
767 __iommu_flush_cache(iommu, root, sizeof(*root));
769 spin_unlock_irqrestore(&iommu->lock, flags);
770 return &context[devfn];
773 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
775 struct root_entry *root;
776 struct context_entry *context;
780 spin_lock_irqsave(&iommu->lock, flags);
781 root = &iommu->root_entry[bus];
782 context = get_context_addr_from_root(root);
787 ret = context_present(&context[devfn]);
789 spin_unlock_irqrestore(&iommu->lock, flags);
793 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
795 struct root_entry *root;
796 struct context_entry *context;
799 spin_lock_irqsave(&iommu->lock, flags);
800 root = &iommu->root_entry[bus];
801 context = get_context_addr_from_root(root);
803 context_clear_entry(&context[devfn]);
804 __iommu_flush_cache(iommu, &context[devfn], \
807 spin_unlock_irqrestore(&iommu->lock, flags);
810 static void free_context_table(struct intel_iommu *iommu)
812 struct root_entry *root;
815 struct context_entry *context;
817 spin_lock_irqsave(&iommu->lock, flags);
818 if (!iommu->root_entry) {
821 for (i = 0; i < ROOT_ENTRY_NR; i++) {
822 root = &iommu->root_entry[i];
823 context = get_context_addr_from_root(root);
825 free_pgtable_page(context);
827 free_pgtable_page(iommu->root_entry);
828 iommu->root_entry = NULL;
830 spin_unlock_irqrestore(&iommu->lock, flags);
833 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
834 unsigned long pfn, int *target_level)
836 struct dma_pte *parent, *pte = NULL;
837 int level = agaw_to_level(domain->agaw);
840 BUG_ON(!domain->pgd);
842 if (!domain_pfn_supported(domain, pfn))
843 /* Address beyond IOMMU's addressing capabilities. */
846 parent = domain->pgd;
851 offset = pfn_level_offset(pfn, level);
852 pte = &parent[offset];
853 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
855 if (level == *target_level)
858 if (!dma_pte_present(pte)) {
861 tmp_page = alloc_pgtable_page(domain->nid);
866 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
867 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
868 if (cmpxchg64(&pte->val, 0ULL, pteval))
869 /* Someone else set it while we were thinking; use theirs. */
870 free_pgtable_page(tmp_page);
872 domain_flush_cache(domain, pte, sizeof(*pte));
877 parent = phys_to_virt(dma_pte_addr(pte));
882 *target_level = level;
888 /* return address's pte at specific level */
889 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
891 int level, int *large_page)
893 struct dma_pte *parent, *pte = NULL;
894 int total = agaw_to_level(domain->agaw);
897 parent = domain->pgd;
898 while (level <= total) {
899 offset = pfn_level_offset(pfn, total);
900 pte = &parent[offset];
904 if (!dma_pte_present(pte)) {
909 if (dma_pte_superpage(pte)) {
914 parent = phys_to_virt(dma_pte_addr(pte));
920 /* clear last level pte; a tlb flush should follow */
921 static void dma_pte_clear_range(struct dmar_domain *domain,
922 unsigned long start_pfn,
923 unsigned long last_pfn)
925 unsigned int large_page = 1;
926 struct dma_pte *first_pte, *pte;
928 BUG_ON(!domain_pfn_supported(domain, start_pfn));
929 BUG_ON(!domain_pfn_supported(domain, last_pfn));
930 BUG_ON(start_pfn > last_pfn);
932 /* we don't need lock here; nobody else touches the iova range */
935 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
937 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
942 start_pfn += lvl_to_nr_pages(large_page);
944 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
946 domain_flush_cache(domain, first_pte,
947 (void *)pte - (void *)first_pte);
949 } while (start_pfn && start_pfn <= last_pfn);
952 static void dma_pte_free_level(struct dmar_domain *domain, int level,
953 struct dma_pte *pte, unsigned long pfn,
954 unsigned long start_pfn, unsigned long last_pfn)
956 pfn = max(start_pfn, pfn);
957 pte = &pte[pfn_level_offset(pfn, level)];
960 unsigned long level_pfn;
961 struct dma_pte *level_pte;
963 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
966 level_pfn = pfn & level_mask(level - 1);
967 level_pte = phys_to_virt(dma_pte_addr(pte));
970 dma_pte_free_level(domain, level - 1, level_pte,
971 level_pfn, start_pfn, last_pfn);
973 /* If range covers entire pagetable, free it */
974 if (!(start_pfn > level_pfn ||
975 last_pfn < level_pfn + level_size(level) - 1)) {
977 domain_flush_cache(domain, pte, sizeof(*pte));
978 free_pgtable_page(level_pte);
981 pfn += level_size(level);
982 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
985 /* free page table pages. last level pte should already be cleared */
986 static void dma_pte_free_pagetable(struct dmar_domain *domain,
987 unsigned long start_pfn,
988 unsigned long last_pfn)
990 BUG_ON(!domain_pfn_supported(domain, start_pfn));
991 BUG_ON(!domain_pfn_supported(domain, last_pfn));
992 BUG_ON(start_pfn > last_pfn);
994 dma_pte_clear_range(domain, start_pfn, last_pfn);
996 /* We don't need lock here; nobody else touches the iova range */
997 dma_pte_free_level(domain, agaw_to_level(domain->agaw),
998 domain->pgd, 0, start_pfn, last_pfn);
1001 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1002 free_pgtable_page(domain->pgd);
1007 /* When a page at a given level is being unlinked from its parent, we don't
1008 need to *modify* it at all. All we need to do is make a list of all the
1009 pages which can be freed just as soon as we've flushed the IOTLB and we
1010 know the hardware page-walk will no longer touch them.
1011 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1013 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1014 int level, struct dma_pte *pte,
1015 struct page *freelist)
1019 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1020 pg->freelist = freelist;
1026 pte = page_address(pg);
1028 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1029 freelist = dma_pte_list_pagetables(domain, level - 1,
1032 } while (!first_pte_in_page(pte));
1037 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1038 struct dma_pte *pte, unsigned long pfn,
1039 unsigned long start_pfn,
1040 unsigned long last_pfn,
1041 struct page *freelist)
1043 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1045 pfn = max(start_pfn, pfn);
1046 pte = &pte[pfn_level_offset(pfn, level)];
1049 unsigned long level_pfn;
1051 if (!dma_pte_present(pte))
1054 level_pfn = pfn & level_mask(level);
1056 /* If range covers entire pagetable, free it */
1057 if (start_pfn <= level_pfn &&
1058 last_pfn >= level_pfn + level_size(level) - 1) {
1059 /* These subordinate page tables are going away entirely. Don't
1060 bother to clear them; we're just going to *free* them. */
1061 if (level > 1 && !dma_pte_superpage(pte))
1062 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1068 } else if (level > 1) {
1069 /* Recurse down into a level that isn't *entirely* obsolete */
1070 freelist = dma_pte_clear_level(domain, level - 1,
1071 phys_to_virt(dma_pte_addr(pte)),
1072 level_pfn, start_pfn, last_pfn,
1076 pfn += level_size(level);
1077 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1080 domain_flush_cache(domain, first_pte,
1081 (void *)++last_pte - (void *)first_pte);
1086 /* We can't just free the pages because the IOMMU may still be walking
1087 the page tables, and may have cached the intermediate levels. The
1088 pages can only be freed after the IOTLB flush has been done. */
1089 struct page *domain_unmap(struct dmar_domain *domain,
1090 unsigned long start_pfn,
1091 unsigned long last_pfn)
1093 struct page *freelist = NULL;
1095 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1096 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1097 BUG_ON(start_pfn > last_pfn);
1099 /* we don't need lock here; nobody else touches the iova range */
1100 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1101 domain->pgd, 0, start_pfn, last_pfn, NULL);
1104 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1105 struct page *pgd_page = virt_to_page(domain->pgd);
1106 pgd_page->freelist = freelist;
1107 freelist = pgd_page;
1115 void dma_free_pagelist(struct page *freelist)
1119 while ((pg = freelist)) {
1120 freelist = pg->freelist;
1121 free_pgtable_page(page_address(pg));
1125 /* iommu handling */
1126 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1128 struct root_entry *root;
1129 unsigned long flags;
1131 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1133 pr_err("IOMMU: allocating root entry for %s failed\n",
1138 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1140 spin_lock_irqsave(&iommu->lock, flags);
1141 iommu->root_entry = root;
1142 spin_unlock_irqrestore(&iommu->lock, flags);
1147 static void iommu_set_root_entry(struct intel_iommu *iommu)
1153 addr = iommu->root_entry;
1155 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1156 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
1158 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1160 /* Make sure hardware completes it */
1161 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1162 readl, (sts & DMA_GSTS_RTPS), sts);
1164 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1167 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1172 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1175 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1176 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1178 /* Make sure hardware completes it */
1179 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1180 readl, (!(val & DMA_GSTS_WBFS)), val);
1182 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1185 /* return value determines whether we need a write buffer flush */
1186 static void __iommu_flush_context(struct intel_iommu *iommu,
1187 u16 did, u16 source_id, u8 function_mask,
1194 case DMA_CCMD_GLOBAL_INVL:
1195 val = DMA_CCMD_GLOBAL_INVL;
1197 case DMA_CCMD_DOMAIN_INVL:
1198 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1200 case DMA_CCMD_DEVICE_INVL:
1201 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1202 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1207 val |= DMA_CCMD_ICC;
1209 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1210 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1212 /* Make sure hardware completes it */
1213 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1214 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1216 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1219 /* return value determines whether we need a write buffer flush */
1220 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1221 u64 addr, unsigned int size_order, u64 type)
1223 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1224 u64 val = 0, val_iva = 0;
1228 case DMA_TLB_GLOBAL_FLUSH:
1229 /* global flush doesn't need to set IVA_REG */
1230 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1232 case DMA_TLB_DSI_FLUSH:
1233 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1235 case DMA_TLB_PSI_FLUSH:
1236 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1237 /* IH bit is passed in as part of address */
1238 val_iva = size_order | addr;
1243 /* Note: set drain read/write */
1246 * This is probably only needed to be extra safe. It looks like we can
1247 * ignore it without any impact.
1249 if (cap_read_drain(iommu->cap))
1250 val |= DMA_TLB_READ_DRAIN;
1252 if (cap_write_drain(iommu->cap))
1253 val |= DMA_TLB_WRITE_DRAIN;
1255 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1256 /* Note: Only uses first TLB reg currently */
1258 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1259 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1261 /* Make sure hardware completes it */
1262 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1263 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1265 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1267 /* check IOTLB invalidation granularity */
1268 if (DMA_TLB_IAIG(val) == 0)
1269 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1270 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1271 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1272 (unsigned long long)DMA_TLB_IIRG(type),
1273 (unsigned long long)DMA_TLB_IAIG(val));
1276 static struct device_domain_info *
1277 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1281 unsigned long flags;
1282 struct device_domain_info *info;
1283 struct pci_dev *pdev;
1285 if (!ecap_dev_iotlb_support(iommu->ecap))
1291 spin_lock_irqsave(&device_domain_lock, flags);
1292 list_for_each_entry(info, &domain->devices, link)
1293 if (info->iommu == iommu && info->bus == bus &&
1294 info->devfn == devfn) {
1298 spin_unlock_irqrestore(&device_domain_lock, flags);
1300 if (!found || !info->dev || !dev_is_pci(info->dev))
1303 pdev = to_pci_dev(info->dev);
1305 if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1308 if (!dmar_find_matched_atsr_unit(pdev))
1314 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1316 if (!info || !dev_is_pci(info->dev))
1319 pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1322 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1324 if (!info->dev || !dev_is_pci(info->dev) ||
1325 !pci_ats_enabled(to_pci_dev(info->dev)))
1328 pci_disable_ats(to_pci_dev(info->dev));
1331 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1332 u64 addr, unsigned mask)
1335 unsigned long flags;
1336 struct device_domain_info *info;
1338 spin_lock_irqsave(&device_domain_lock, flags);
1339 list_for_each_entry(info, &domain->devices, link) {
1340 struct pci_dev *pdev;
1341 if (!info->dev || !dev_is_pci(info->dev))
1344 pdev = to_pci_dev(info->dev);
1345 if (!pci_ats_enabled(pdev))
1348 sid = info->bus << 8 | info->devfn;
1349 qdep = pci_ats_queue_depth(pdev);
1350 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1352 spin_unlock_irqrestore(&device_domain_lock, flags);
1355 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1356 unsigned long pfn, unsigned int pages, int ih, int map)
1358 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1359 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1366 * Fall back to domain-selective flush if there is no PSI support or the
1368 * size is too big. PSI requires the page size to be 2 ^ x, and the base
1369 * address to be naturally aligned to the size.
1371 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1372 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1375 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1379 * In caching mode, changes of pages from non-present to present require
1380 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1382 if (!cap_caching_mode(iommu->cap) || !map)
1383 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
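/*
 * Illustrative sketch (not part of the driver): the PSI address mask encodes
 * the number of pages to invalidate as a power of two, so a 5-page request
 * is rounded up to an aligned 8-page (mask = 3) flush.  VTD_PAGE_SHIFT is
 * assumed to be 12 and the pfn is a made-up, suitably aligned example.
 * Excluded from the build.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static unsigned int sketch_roundup_pow_of_two(unsigned int n)
{
	unsigned int p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

static unsigned int sketch_ilog2(unsigned int n)
{
	unsigned int l = 0;

	while (n > 1) {
		n >>= 1;
		l++;
	}
	return l;
}

int main(void)
{
	unsigned long pfn = 0x1230;	/* aligned to the rounded-up size */
	unsigned int pages = 5;
	unsigned int mask = sketch_ilog2(sketch_roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << 12;

	printf("PSI addr %#llx, mask %u (flushes %u pages)\n",
	       (unsigned long long)addr, mask, 1U << mask);
	return 0;
}
#endif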
1386 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1389 unsigned long flags;
1391 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1392 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1393 pmen &= ~DMA_PMEN_EPM;
1394 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1396 /* wait for the protected region status bit to clear */
1397 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1398 readl, !(pmen & DMA_PMEN_PRS), pmen);
1400 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1403 static void iommu_enable_translation(struct intel_iommu *iommu)
1406 unsigned long flags;
1408 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1409 iommu->gcmd |= DMA_GCMD_TE;
1410 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1412 /* Make sure hardware completes it */
1413 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1414 readl, (sts & DMA_GSTS_TES), sts);
1416 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1419 static void iommu_disable_translation(struct intel_iommu *iommu)
1424 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1425 iommu->gcmd &= ~DMA_GCMD_TE;
1426 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1428 /* Make sure hardware completes it */
1429 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1430 readl, (!(sts & DMA_GSTS_TES)), sts);
1432 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1436 static int iommu_init_domains(struct intel_iommu *iommu)
1438 unsigned long ndomains;
1439 unsigned long nlongs;
1441 ndomains = cap_ndoms(iommu->cap);
1442 pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1443 iommu->seq_id, ndomains);
1444 nlongs = BITS_TO_LONGS(ndomains);
1446 spin_lock_init(&iommu->lock);
1448 /* TBD: there might be 64K domains,
1449 * consider other allocation for future chip
1451 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1452 if (!iommu->domain_ids) {
1453 pr_err("IOMMU%d: allocating domain id array failed\n",
1457 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1459 if (!iommu->domains) {
1460 pr_err("IOMMU%d: allocating domain array failed\n",
1462 kfree(iommu->domain_ids);
1463 iommu->domain_ids = NULL;
1468 * If caching mode is set, then invalid translations are tagged
1469 * with domain id 0. Hence we need to pre-allocate it.
1471 if (cap_caching_mode(iommu->cap))
1472 set_bit(0, iommu->domain_ids);
1476 static void disable_dmar_iommu(struct intel_iommu *iommu)
1478 struct dmar_domain *domain;
1481 if ((iommu->domains) && (iommu->domain_ids)) {
1482 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1484 * Domain id 0 is reserved for invalid translation
1485 * if hardware supports caching mode.
1487 if (cap_caching_mode(iommu->cap) && i == 0)
1490 domain = iommu->domains[i];
1491 clear_bit(i, iommu->domain_ids);
1492 if (domain_detach_iommu(domain, iommu) == 0 &&
1493 !domain_type_is_vm(domain))
1494 domain_exit(domain);
1498 if (iommu->gcmd & DMA_GCMD_TE)
1499 iommu_disable_translation(iommu);
1502 static void free_dmar_iommu(struct intel_iommu *iommu)
1504 if ((iommu->domains) && (iommu->domain_ids)) {
1505 kfree(iommu->domains);
1506 kfree(iommu->domain_ids);
1507 iommu->domains = NULL;
1508 iommu->domain_ids = NULL;
1511 g_iommus[iommu->seq_id] = NULL;
1513 /* free context mapping */
1514 free_context_table(iommu);
1517 static struct dmar_domain *alloc_domain(int flags)
1519 /* domain id for virtual machine, it won't be set in context */
1520 static atomic_t vm_domid = ATOMIC_INIT(0);
1521 struct dmar_domain *domain;
1523 domain = alloc_domain_mem();
1527 memset(domain, 0, sizeof(*domain));
1529 domain->flags = flags;
1530 spin_lock_init(&domain->iommu_lock);
1531 INIT_LIST_HEAD(&domain->devices);
1532 if (flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1533 domain->id = atomic_inc_return(&vm_domid);
1538 static int __iommu_attach_domain(struct dmar_domain *domain,
1539 struct intel_iommu *iommu)
1542 unsigned long ndomains;
1544 ndomains = cap_ndoms(iommu->cap);
1545 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1546 if (num < ndomains) {
1547 set_bit(num, iommu->domain_ids);
1548 iommu->domains[num] = domain;
1556 static int iommu_attach_domain(struct dmar_domain *domain,
1557 struct intel_iommu *iommu)
1560 unsigned long flags;
1562 spin_lock_irqsave(&iommu->lock, flags);
1563 num = __iommu_attach_domain(domain, iommu);
1564 spin_unlock_irqrestore(&iommu->lock, flags);
1566 pr_err("IOMMU: no free domain ids\n");
1571 static int iommu_attach_vm_domain(struct dmar_domain *domain,
1572 struct intel_iommu *iommu)
1575 unsigned long ndomains;
1577 ndomains = cap_ndoms(iommu->cap);
1578 for_each_set_bit(num, iommu->domain_ids, ndomains)
1579 if (iommu->domains[num] == domain)
1582 return __iommu_attach_domain(domain, iommu);
1585 static void iommu_detach_domain(struct dmar_domain *domain,
1586 struct intel_iommu *iommu)
1588 unsigned long flags;
1591 spin_lock_irqsave(&iommu->lock, flags);
1592 if (domain_type_is_vm_or_si(domain)) {
1593 ndomains = cap_ndoms(iommu->cap);
1594 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1595 if (iommu->domains[num] == domain) {
1596 clear_bit(num, iommu->domain_ids);
1597 iommu->domains[num] = NULL;
1602 clear_bit(domain->id, iommu->domain_ids);
1603 iommu->domains[domain->id] = NULL;
1605 spin_unlock_irqrestore(&iommu->lock, flags);
1608 static void domain_attach_iommu(struct dmar_domain *domain,
1609 struct intel_iommu *iommu)
1611 unsigned long flags;
1613 spin_lock_irqsave(&domain->iommu_lock, flags);
1614 if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1615 domain->iommu_count++;
1616 if (domain->iommu_count == 1)
1617 domain->nid = iommu->node;
1618 domain_update_iommu_cap(domain);
1620 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1623 static int domain_detach_iommu(struct dmar_domain *domain,
1624 struct intel_iommu *iommu)
1626 unsigned long flags;
1627 int count = INT_MAX;
1629 spin_lock_irqsave(&domain->iommu_lock, flags);
1630 if (test_and_clear_bit(iommu->seq_id, domain->iommu_bmp)) {
1631 count = --domain->iommu_count;
1632 domain_update_iommu_cap(domain);
1634 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1639 static struct iova_domain reserved_iova_list;
1640 static struct lock_class_key reserved_rbtree_key;
1642 static int dmar_init_reserved_ranges(void)
1644 struct pci_dev *pdev = NULL;
1648 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1651 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1652 &reserved_rbtree_key);
1654 /* IOAPIC ranges shouldn't be accessed by DMA */
1655 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1656 IOVA_PFN(IOAPIC_RANGE_END));
1658 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1662 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1663 for_each_pci_dev(pdev) {
1666 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1667 r = &pdev->resource[i];
1668 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1670 iova = reserve_iova(&reserved_iova_list,
1674 printk(KERN_ERR "Reserve iova failed\n");
1682 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1684 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1687 static inline int guestwidth_to_adjustwidth(int gaw)
1690 int r = (gaw - 12) % 9;
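/*
 * Illustrative sketch (not part of the driver): each page-table level covers
 * 9 address bits on top of the 12-bit page offset, so the guest width is
 * rounded up to the next width that a whole number of levels can express
 * (assuming the usual round-up behaviour; the widths below are examples
 * only).  Excluded from the build.
 */
#if 0
#include <stdio.h>

static int sketch_adjust_width(int gaw)
{
	int r = (gaw - 12) % 9;

	return r == 0 ? gaw : gaw + 9 - r;
}

int main(void)
{
	/* 40 bits is not a whole number of levels above 12, so it widens to 48 */
	printf("gaw 40 -> adjusted width %d\n", sketch_adjust_width(40));
	printf("gaw 48 -> adjusted width %d\n", sketch_adjust_width(48));
	return 0;
}
#endif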
1701 static int domain_init(struct dmar_domain *domain, int guest_width)
1703 struct intel_iommu *iommu;
1704 int adjust_width, agaw;
1705 unsigned long sagaw;
1707 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1709 domain_reserve_special_ranges(domain);
1711 /* calculate AGAW */
1712 iommu = domain_get_iommu(domain);
1713 if (guest_width > cap_mgaw(iommu->cap))
1714 guest_width = cap_mgaw(iommu->cap);
1715 domain->gaw = guest_width;
1716 adjust_width = guestwidth_to_adjustwidth(guest_width);
1717 agaw = width_to_agaw(adjust_width);
1718 sagaw = cap_sagaw(iommu->cap);
1719 if (!test_bit(agaw, &sagaw)) {
1720 /* hardware doesn't support it, choose a bigger one */
1721 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1722 agaw = find_next_bit(&sagaw, 5, agaw);
1726 domain->agaw = agaw;
1728 if (ecap_coherent(iommu->ecap))
1729 domain->iommu_coherency = 1;
1731 domain->iommu_coherency = 0;
1733 if (ecap_sc_support(iommu->ecap))
1734 domain->iommu_snooping = 1;
1736 domain->iommu_snooping = 0;
1738 if (intel_iommu_superpage)
1739 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1741 domain->iommu_superpage = 0;
1743 domain->nid = iommu->node;
1745 /* always allocate the top pgd */
1746 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1749 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1753 static void domain_exit(struct dmar_domain *domain)
1755 struct page *freelist = NULL;
1758 /* Domain 0 is reserved, so don't process it */
1762 /* Flush any lazy unmaps that may reference this domain */
1763 if (!intel_iommu_strict)
1764 flush_unmaps_timeout(0);
1766 /* remove associated devices */
1767 domain_remove_dev_info(domain);
1770 put_iova_domain(&domain->iovad);
1772 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1774 /* clear attached or cached domains */
1776 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus)
1777 iommu_detach_domain(domain, g_iommus[i]);
1780 dma_free_pagelist(freelist);
1782 free_domain_mem(domain);
1785 static int domain_context_mapping_one(struct dmar_domain *domain,
1786 struct intel_iommu *iommu,
1787 u8 bus, u8 devfn, int translation)
1789 struct context_entry *context;
1790 unsigned long flags;
1791 struct dma_pte *pgd;
1794 struct device_domain_info *info = NULL;
1796 pr_debug("Set context mapping for %02x:%02x.%d\n",
1797 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1799 BUG_ON(!domain->pgd);
1800 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1801 translation != CONTEXT_TT_MULTI_LEVEL);
1803 context = device_to_context_entry(iommu, bus, devfn);
1806 spin_lock_irqsave(&iommu->lock, flags);
1807 if (context_present(context)) {
1808 spin_unlock_irqrestore(&iommu->lock, flags);
1815 if (domain_type_is_vm_or_si(domain)) {
1816 if (domain_type_is_vm(domain)) {
1817 id = iommu_attach_vm_domain(domain, iommu);
1819 spin_unlock_irqrestore(&iommu->lock, flags);
1820 pr_err("IOMMU: no free domain ids\n");
1825 /* Skip top levels of page tables for
1826 * an iommu which has less agaw than the default.
1827 * Unnecessary for PT mode.
1829 if (translation != CONTEXT_TT_PASS_THROUGH) {
1830 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1831 pgd = phys_to_virt(dma_pte_addr(pgd));
1832 if (!dma_pte_present(pgd)) {
1833 spin_unlock_irqrestore(&iommu->lock, flags);
1840 context_set_domain_id(context, id);
1842 if (translation != CONTEXT_TT_PASS_THROUGH) {
1843 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1844 translation = info ? CONTEXT_TT_DEV_IOTLB :
1845 CONTEXT_TT_MULTI_LEVEL;
1848 * In pass through mode, AW must be programmed to indicate the largest
1849 * AGAW value supported by hardware. And ASR is ignored by hardware.
1851 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1852 context_set_address_width(context, iommu->msagaw);
1854 context_set_address_root(context, virt_to_phys(pgd));
1855 context_set_address_width(context, iommu->agaw);
1858 context_set_translation_type(context, translation);
1859 context_set_fault_enable(context);
1860 context_set_present(context);
1861 domain_flush_cache(domain, context, sizeof(*context));
1864 * It's a non-present to present mapping. If hardware doesn't cache
1865 * non-present entries we only need to flush the write-buffer. If it
1866 * _does_ cache non-present entries, then it does so in the special
1867 * domain #0, which we have to flush:
1869 if (cap_caching_mode(iommu->cap)) {
1870 iommu->flush.flush_context(iommu, 0,
1871 (((u16)bus) << 8) | devfn,
1872 DMA_CCMD_MASK_NOBIT,
1873 DMA_CCMD_DEVICE_INVL);
1874 iommu->flush.flush_iotlb(iommu, id, 0, 0, DMA_TLB_DSI_FLUSH);
1876 iommu_flush_write_buffer(iommu);
1878 iommu_enable_dev_iotlb(info);
1879 spin_unlock_irqrestore(&iommu->lock, flags);
1881 domain_attach_iommu(domain, iommu);
1886 struct domain_context_mapping_data {
1887 struct dmar_domain *domain;
1888 struct intel_iommu *iommu;
1892 static int domain_context_mapping_cb(struct pci_dev *pdev,
1893 u16 alias, void *opaque)
1895 struct domain_context_mapping_data *data = opaque;
1897 return domain_context_mapping_one(data->domain, data->iommu,
1898 PCI_BUS_NUM(alias), alias & 0xff,
1903 domain_context_mapping(struct dmar_domain *domain, struct device *dev,
1906 struct intel_iommu *iommu;
1908 struct domain_context_mapping_data data;
1910 iommu = device_to_iommu(dev, &bus, &devfn);
1914 if (!dev_is_pci(dev))
1915 return domain_context_mapping_one(domain, iommu, bus, devfn,
1918 data.domain = domain;
1920 data.translation = translation;
1922 return pci_for_each_dma_alias(to_pci_dev(dev),
1923 &domain_context_mapping_cb, &data);
1926 static int domain_context_mapped_cb(struct pci_dev *pdev,
1927 u16 alias, void *opaque)
1929 struct intel_iommu *iommu = opaque;
1931 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
1934 static int domain_context_mapped(struct device *dev)
1936 struct intel_iommu *iommu;
1939 iommu = device_to_iommu(dev, &bus, &devfn);
1943 if (!dev_is_pci(dev))
1944 return device_context_mapped(iommu, bus, devfn);
1946 return !pci_for_each_dma_alias(to_pci_dev(dev),
1947 domain_context_mapped_cb, iommu);
1950 /* Returns a number of VTD pages, but aligned to MM page size */
1951 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1954 host_addr &= ~PAGE_MASK;
1955 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
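/*
 * Illustrative sketch (not part of the driver): the in-page offset of the
 * buffer is kept, so a 4KiB buffer that straddles a page boundary counts as
 * two VT-d pages.  4KiB pages are assumed for both MM and VT-d and the
 * addresses are made up.  Excluded from the build.
 */
#if 0
#include <stdio.h>

#define SKETCH_PAGE_SIZE	4096UL

int main(void)
{
	unsigned long host_addr = 0x12345800UL;
	unsigned long size = 0x1000UL;
	unsigned long offset = host_addr & (SKETCH_PAGE_SIZE - 1);
	unsigned long nrpages =
		((offset + size + SKETCH_PAGE_SIZE - 1) & ~(SKETCH_PAGE_SIZE - 1))
			>> 12;

	printf("nrpages = %lu\n", nrpages);	/* prints 2 */
	return 0;
}
#endif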
1958 /* Return largest possible superpage level for a given mapping */
1959 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1960 unsigned long iov_pfn,
1961 unsigned long phy_pfn,
1962 unsigned long pages)
1964 int support, level = 1;
1965 unsigned long pfnmerge;
1967 support = domain->iommu_superpage;
1969 /* To use a large page, the virtual *and* physical addresses
1970 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1971 of them will mean we have to use smaller pages. So just
1972 merge them and check both at once. */
1973 pfnmerge = iov_pfn | phy_pfn;
1975 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1976 pages >>= VTD_STRIDE_SHIFT;
1979 pfnmerge >>= VTD_STRIDE_SHIFT;
1986 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1987 struct scatterlist *sg, unsigned long phys_pfn,
1988 unsigned long nr_pages, int prot)
1990 struct dma_pte *first_pte = NULL, *pte = NULL;
1991 phys_addr_t uninitialized_var(pteval);
1992 unsigned long sg_res = 0;
1993 unsigned int largepage_lvl = 0;
1994 unsigned long lvl_pages = 0;
1996 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
1998 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2001 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2005 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2008 while (nr_pages > 0) {
2012 sg_res = aligned_nrpages(sg->offset, sg->length);
2013 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2014 sg->dma_length = sg->length;
2015 pteval = page_to_phys(sg_page(sg)) | prot;
2016 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2020 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2022 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2025 /* It is a large page */
2026 if (largepage_lvl > 1) {
2027 pteval |= DMA_PTE_LARGE_PAGE;
2028 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2030 * Ensure that old small page tables are
2031 * removed to make room for superpage,
2034 dma_pte_free_pagetable(domain, iov_pfn,
2035 iov_pfn + lvl_pages - 1);
2037 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2041 /* We don't need lock here, nobody else
2042 * touches the iova range
2044 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2046 static int dumps = 5;
2047 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2048 iov_pfn, tmp, (unsigned long long)pteval);
2051 debug_dma_dump_mappings(NULL);
2056 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2058 BUG_ON(nr_pages < lvl_pages);
2059 BUG_ON(sg_res < lvl_pages);
2061 nr_pages -= lvl_pages;
2062 iov_pfn += lvl_pages;
2063 phys_pfn += lvl_pages;
2064 pteval += lvl_pages * VTD_PAGE_SIZE;
2065 sg_res -= lvl_pages;
2067 /* If the next PTE would be the first in a new page, then we
2068 need to flush the cache on the entries we've just written.
2069 And then we'll need to recalculate 'pte', so clear it and
2070 let it get set again in the if (!pte) block above.
2072 If we're done (!nr_pages) we need to flush the cache too.
2074 Also if we've been setting superpages, we may need to
2075 recalculate 'pte' and switch back to smaller pages for the
2076 end of the mapping, if the trailing size is not enough to
2077 use another superpage (i.e. sg_res < lvl_pages). */
2079 if (!nr_pages || first_pte_in_page(pte) ||
2080 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2081 domain_flush_cache(domain, first_pte,
2082 (void *)pte - (void *)first_pte);
2086 if (!sg_res && nr_pages)
2092 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2093 struct scatterlist *sg, unsigned long nr_pages,
2096 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2099 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2100 unsigned long phys_pfn, unsigned long nr_pages,
2103 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2106 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2111 clear_context_table(iommu, bus, devfn);
2112 iommu->flush.flush_context(iommu, 0, 0, 0,
2113 DMA_CCMD_GLOBAL_INVL);
2114 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2117 static inline void unlink_domain_info(struct device_domain_info *info)
2119 assert_spin_locked(&device_domain_lock);
2120 list_del(&info->link);
2121 list_del(&info->global);
2123 info->dev->archdata.iommu = NULL;
2126 static void domain_remove_dev_info(struct dmar_domain *domain)
2128 struct device_domain_info *info, *tmp;
2129 unsigned long flags;
2131 spin_lock_irqsave(&device_domain_lock, flags);
2132 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
2133 unlink_domain_info(info);
2134 spin_unlock_irqrestore(&device_domain_lock, flags);
2136 iommu_disable_dev_iotlb(info);
2137 iommu_detach_dev(info->iommu, info->bus, info->devfn);
2139 if (domain_type_is_vm(domain)) {
2140 iommu_detach_dependent_devices(info->iommu, info->dev);
2141 domain_detach_iommu(domain, info->iommu);
2144 free_devinfo_mem(info);
2145 spin_lock_irqsave(&device_domain_lock, flags);
2147 spin_unlock_irqrestore(&device_domain_lock, flags);
2152 * Note: we use struct device->archdata.iommu to store the info
2154 static struct dmar_domain *find_domain(struct device *dev)
2156 struct device_domain_info *info;
2158 /* No lock here, assumes no domain exit in normal case */
2159 info = dev->archdata.iommu;
2161 return info->domain;
2165 static inline struct device_domain_info *
2166 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2168 struct device_domain_info *info;
2170 list_for_each_entry(info, &device_domain_list, global)
2171 if (info->iommu->segment == segment && info->bus == bus &&
2172 info->devfn == devfn)
2178 static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
2181 struct dmar_domain *domain)
2183 struct dmar_domain *found = NULL;
2184 struct device_domain_info *info;
2185 unsigned long flags;
2187 info = alloc_devinfo_mem();
2192 info->devfn = devfn;
2194 info->domain = domain;
2195 info->iommu = iommu;
2197 spin_lock_irqsave(&device_domain_lock, flags);
2199 found = find_domain(dev);
2201 struct device_domain_info *info2;
2202 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2204 found = info2->domain;
2207 spin_unlock_irqrestore(&device_domain_lock, flags);
2208 free_devinfo_mem(info);
2209 /* Caller must free the original domain */
2213 list_add(&info->link, &domain->devices);
2214 list_add(&info->global, &device_domain_list);
2216 dev->archdata.iommu = info;
2217 spin_unlock_irqrestore(&device_domain_lock, flags);
2222 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2224 *(u16 *)opaque = alias;
2228 /* domain is initialized */
2229 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2231 struct dmar_domain *domain, *tmp;
2232 struct intel_iommu *iommu;
2233 struct device_domain_info *info;
2235 unsigned long flags;
2238 domain = find_domain(dev);
2242 iommu = device_to_iommu(dev, &bus, &devfn);
2246 if (dev_is_pci(dev)) {
2247 struct pci_dev *pdev = to_pci_dev(dev);
2249 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2251 spin_lock_irqsave(&device_domain_lock, flags);
2252 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2253 PCI_BUS_NUM(dma_alias),
2256 iommu = info->iommu;
2257 domain = info->domain;
2259 spin_unlock_irqrestore(&device_domain_lock, flags);
2261 /* DMA alias already has a domain, use it */
2266 /* Allocate and initialize new domain for the device */
2267 domain = alloc_domain(0);
2270 domain->id = iommu_attach_domain(domain, iommu);
2271 if (domain->id < 0) {
2272 free_domain_mem(domain);
2275 domain_attach_iommu(domain, iommu);
2276 if (domain_init(domain, gaw)) {
2277 domain_exit(domain);
2281 /* register PCI DMA alias device */
2282 if (dev_is_pci(dev)) {
2283 tmp = dmar_insert_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2284 dma_alias & 0xff, NULL, domain);
2286 if (!tmp || tmp != domain) {
2287 domain_exit(domain);
2296 tmp = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2298 if (!tmp || tmp != domain) {
2299 domain_exit(domain);
2306 static int iommu_identity_mapping;
2307 #define IDENTMAP_ALL 1
2308 #define IDENTMAP_GFX 2
2309 #define IDENTMAP_AZALIA 4
2311 static int iommu_domain_identity_map(struct dmar_domain *domain,
2312 unsigned long long start,
2313 unsigned long long end)
2315 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2316 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2318 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2319 dma_to_mm_pfn(last_vpfn))) {
2320 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2324 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2325 start, end, domain->id);
2327 * The RMRR range might overlap with the physical memory range,
2330 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2332 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2333 last_vpfn - first_vpfn + 1,
2334 DMA_PTE_READ|DMA_PTE_WRITE);
2337 static int iommu_prepare_identity_map(struct device *dev,
2338 unsigned long long start,
2339 unsigned long long end)
2341 struct dmar_domain *domain;
2344 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2348 /* For _hardware_ passthrough, don't bother. But for software
2349 passthrough, we do it anyway -- it may indicate a memory
2350 range which is reserved in E820, and which therefore didn't get
2351 set up to start with in si_domain */
2352 if (domain == si_domain && hw_pass_through) {
2353 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2354 dev_name(dev), start, end);
2359 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2360 dev_name(dev), start, end);
2363 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2364 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2365 dmi_get_system_info(DMI_BIOS_VENDOR),
2366 dmi_get_system_info(DMI_BIOS_VERSION),
2367 dmi_get_system_info(DMI_PRODUCT_VERSION));
2372 if (end >> agaw_to_width(domain->agaw)) {
2373 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2374 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2375 agaw_to_width(domain->agaw),
2376 dmi_get_system_info(DMI_BIOS_VENDOR),
2377 dmi_get_system_info(DMI_BIOS_VERSION),
2378 dmi_get_system_info(DMI_PRODUCT_VERSION));
2383 ret = iommu_domain_identity_map(domain, start, end);
2387 /* context entry init */
2388 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2395 domain_exit(domain);
2399 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2402 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2404 return iommu_prepare_identity_map(dev, rmrr->base_address,
2408 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2409 static inline void iommu_prepare_isa(void)
2411 struct pci_dev *pdev;
2414 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2418 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2419 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2422 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2423 "floppy might not work\n");
2428 static inline void iommu_prepare_isa(void)
2432 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2434 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2436 static int __init si_domain_init(int hw)
2438 struct dmar_drhd_unit *drhd;
2439 struct intel_iommu *iommu;
2443 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2447 for_each_active_iommu(iommu, drhd) {
2448 ret = iommu_attach_domain(si_domain, iommu);
2450 domain_exit(si_domain);
2453 si_domain->id = ret;
2455 } else if (si_domain->id != ret) {
2456 domain_exit(si_domain);
2459 domain_attach_iommu(si_domain, iommu);
2462 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2463 domain_exit(si_domain);
2467 pr_debug("IOMMU: identity mapping domain is domain %d\n",
2473 for_each_online_node(nid) {
2474 unsigned long start_pfn, end_pfn;
2477 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2478 ret = iommu_domain_identity_map(si_domain,
2479 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
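/*
 * At this point si_domain identity-maps every usable RAM range of each
 * online node, so devices attached to it can DMA to any physical address
 * without further per-buffer setup.
 */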
2488 static int identity_mapping(struct device *dev)
2490 struct device_domain_info *info;
2492 if (likely(!iommu_identity_mapping))
2495 info = dev->archdata.iommu;
2496 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2497 return (info->domain == si_domain);
2502 static int domain_add_dev_info(struct dmar_domain *domain,
2503 struct device *dev, int translation)
2505 struct dmar_domain *ndomain;
2506 struct intel_iommu *iommu;
2510 iommu = device_to_iommu(dev, &bus, &devfn);
2514 ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2515 if (ndomain != domain)
2518 ret = domain_context_mapping(domain, dev, translation);
2520 domain_remove_one_dev_info(domain, dev);
2527 static bool device_has_rmrr(struct device *dev)
2529 struct dmar_rmrr_unit *rmrr;
2534 for_each_rmrr_units(rmrr) {
2536 * Return TRUE if this RMRR contains the device we are checking
2539 for_each_active_dev_scope(rmrr->devices,
2540 rmrr->devices_cnt, i, tmp)
2551 * There are a couple cases where we need to restrict the functionality of
2552 * devices associated with RMRRs. The first is when evaluating a device for
2553 * identity mapping because problems exist when devices are moved in and out
2554 * of domains and their respective RMRR information is lost. This means that
2555 * a device with associated RMRRs will never be in a "passthrough" domain.
2556 * The second is use of the device through the IOMMU API. This interface
2557 * expects to have full control of the IOVA space for the device. We cannot
2558 * satisfy both the requirement that RMRR access is maintained and have an
2559 * unencumbered IOVA space. We also have no ability to quiesce the device's
2560 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2561 * We therefore prevent devices associated with an RMRR from participating in
2562 * the IOMMU API, which eliminates them from device assignment.
2564 * In both cases we assume that PCI USB devices with RMRRs have them largely
2565 * for historical reasons and that the RMRR space is not actively used post
2566 * boot. This exclusion may change if vendors begin to abuse it.
2568 static bool device_is_rmrr_locked(struct device *dev)
2570 if (!device_has_rmrr(dev))
2573 if (dev_is_pci(dev)) {
2574 struct pci_dev *pdev = to_pci_dev(dev);
2576 if ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
2583 static int iommu_should_identity_map(struct device *dev, int startup)
2586 if (dev_is_pci(dev)) {
2587 struct pci_dev *pdev = to_pci_dev(dev);
2589 if (device_is_rmrr_locked(dev))
2592 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2595 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2598 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2602 * We want to start off with all devices in the 1:1 domain, and
2603 * take them out later if we find they can't access all of memory.
2605 * However, we can't do this for PCI devices behind bridges,
2606 * because all PCI devices behind the same bridge will end up
2607 * with the same source-id on their transactions.
2609 * Practically speaking, we can't change things around for these
2610 * devices at run-time, because we can't be sure there'll be no
2611 * DMA transactions in flight for any of their siblings.
2613 * So PCI devices (unless they're on the root bus) as well as
2614 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2615 * the 1:1 domain, just in _case_ one of their siblings turns out
2616 * not to be able to map all of memory.
2618 if (!pci_is_pcie(pdev)) {
2619 if (!pci_is_root_bus(pdev->bus))
2621 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2623 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2626 if (device_has_rmrr(dev))
2631 * At boot time, we don't yet know if devices will be 64-bit capable.
2632 * Assume that they will -- if they turn out not to be, then we can
2633 * take them out of the 1:1 domain later.
2637 * If the device's dma_mask is less than the system's memory
2638 * size then this is not a candidate for identity mapping.
2640 u64 dma_mask = *dev->dma_mask;
2642 if (dev->coherent_dma_mask &&
2643 dev->coherent_dma_mask < dma_mask)
2644 dma_mask = dev->coherent_dma_mask;
2646 return dma_mask >= dma_get_required_mask(dev);
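/*
 * Put @dev into the static identity (1:1) domain if policy says it should
 * be there, using a pass-through context entry when the hardware supports
 * it and ordinary multi-level translation otherwise.
 */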
2652 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2656 if (!iommu_should_identity_map(dev, 1))
2659 ret = domain_add_dev_info(si_domain, dev,
2660 hw ? CONTEXT_TT_PASS_THROUGH :
2661 CONTEXT_TT_MULTI_LEVEL);
2663 pr_info("IOMMU: %s identity mapping for device %s\n",
2664 hw ? "hardware" : "software", dev_name(dev));
2665 else if (ret == -ENODEV)
2666 /* device not associated with an iommu */
2673 static int __init iommu_prepare_static_identity_mapping(int hw)
2675 struct pci_dev *pdev = NULL;
2676 struct dmar_drhd_unit *drhd;
2677 struct intel_iommu *iommu;
2682 ret = si_domain_init(hw);
2686 for_each_pci_dev(pdev) {
2687 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2692 for_each_active_iommu(iommu, drhd)
2693 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2694 struct acpi_device_physical_node *pn;
2695 struct acpi_device *adev;
2697 if (dev->bus != &acpi_bus_type)
2700 adev = to_acpi_device(dev);
2701 mutex_lock(&adev->physical_node_lock);
2702 list_for_each_entry(pn, &adev->physical_node_list, node) {
2703 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2707 mutex_unlock(&adev->physical_node_lock);
2715 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2718 * Start from a sane iommu hardware state.
2719 * If queued invalidation was already initialized by us
2720 * (for example, while enabling interrupt remapping), then
2721 * things are already rolling from a sane state.
2725 * Clear any previous faults.
2727 dmar_fault(-1, iommu);
2729 * Disable queued invalidation if supported and already enabled
2730 * before OS handover.
2732 dmar_disable_qi(iommu);
2735 if (dmar_enable_qi(iommu)) {
2737 * Queued Invalidate not enabled, use Register Based Invalidate
2739 iommu->flush.flush_context = __iommu_flush_context;
2740 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2741 pr_info("IOMMU: %s using Register based invalidation\n",
2744 iommu->flush.flush_context = qi_flush_context;
2745 iommu->flush.flush_iotlb = qi_flush_iotlb;
2746 pr_info("IOMMU: %s using Queued invalidation\n", iommu->name);
2750 static int __init init_dmars(void)
2752 struct dmar_drhd_unit *drhd;
2753 struct dmar_rmrr_unit *rmrr;
2755 struct intel_iommu *iommu;
2761 * initialize and program root entry to not present
2764 for_each_drhd_unit(drhd) {
2766 * No lock needed: this is only incremented in the single-
2767 * threaded kernel __init code path; all other accesses are reads.
2770 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
2774 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2775 DMAR_UNITS_SUPPORTED);
2778 /* Preallocate enough resources for IOMMU hot-addition */
2779 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
2780 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
2782 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2785 printk(KERN_ERR "Allocating global iommu array failed\n");
2790 deferred_flush = kzalloc(g_num_of_iommus *
2791 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2792 if (!deferred_flush) {
2797 for_each_active_iommu(iommu, drhd) {
2798 g_iommus[iommu->seq_id] = iommu;
2800 ret = iommu_init_domains(iommu);
2806 * we could share the same root & context tables
2807 * among all IOMMUs. Need to split it later.
2809 ret = iommu_alloc_root_entry(iommu);
2812 if (!ecap_pass_through(iommu->ecap))
2813 hw_pass_through = 0;
2816 for_each_active_iommu(iommu, drhd)
2817 intel_iommu_init_qi(iommu);
2819 if (iommu_pass_through)
2820 iommu_identity_mapping |= IDENTMAP_ALL;
2822 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2823 iommu_identity_mapping |= IDENTMAP_GFX;
2826 check_tylersburg_isoch();
2829 * If pass through is not set or not enabled, set up context entries for
2830 * identity mappings for rmrr, gfx and isa, and possibly fall back to static
2831 * identity mapping if iommu_identity_mapping is set.
2833 if (iommu_identity_mapping) {
2834 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2836 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2842 * for each dev attached to rmrr
2844 * locate drhd for dev, alloc domain for dev
2845 * allocate free domain
2846 * allocate page table entries for rmrr
2847 * if context not allocated for bus
2848 * allocate and init context
2849 * set present in root table for this bus
2850 * init context with domain, translation etc
2854 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2855 for_each_rmrr_units(rmrr) {
2856 /* some BIOSes list non-existent devices in the DMAR table. */
2857 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2859 ret = iommu_prepare_rmrr_dev(rmrr, dev);
2862 "IOMMU: mapping reserved region failed\n");
2866 iommu_prepare_isa();
2871 * global invalidate context cache
2872 * global invalidate iotlb
2873 * enable translation
2875 for_each_iommu(iommu, drhd) {
2876 if (drhd->ignored) {
2878 * we always have to disable PMRs or DMA may fail on
2882 iommu_disable_protect_mem_regions(iommu);
2886 iommu_flush_write_buffer(iommu);
2888 ret = dmar_set_interrupt(iommu);
2892 iommu_set_root_entry(iommu);
2894 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2895 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2896 iommu_enable_translation(iommu);
2897 iommu_disable_protect_mem_regions(iommu);
2903 for_each_active_iommu(iommu, drhd) {
2904 disable_dmar_iommu(iommu);
2905 free_dmar_iommu(iommu);
2907 kfree(deferred_flush);
2914 /* This takes a number of _MM_ pages, not VTD pages */
2915 static struct iova *intel_alloc_iova(struct device *dev,
2916 struct dmar_domain *domain,
2917 unsigned long nrpages, uint64_t dma_mask)
2919 struct iova *iova = NULL;
2921 /* Restrict dma_mask to the width that the iommu can handle */
2922 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2924 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2926 * First try to allocate an io virtual address in
2927 * DMA_BIT_MASK(32) and if that fails then try allocating
2930 iova = alloc_iova(&domain->iovad, nrpages,
2931 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2935 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2936 if (unlikely(!iova)) {
2937 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2938 nrpages, dev_name(dev));
2945 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
2947 struct dmar_domain *domain;
2950 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2952 printk(KERN_ERR "Allocating domain for %s failed",
2957 /* make sure context mapping is ok */
2958 if (unlikely(!domain_context_mapped(dev))) {
2959 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2961 printk(KERN_ERR "Domain context map for %s failed",
2970 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
2972 struct device_domain_info *info;
2974 /* No lock here, assumes no domain exit in normal case */
2975 info = dev->archdata.iommu;
2977 return info->domain;
2979 return __get_valid_domain_for_dev(dev);
2982 static int iommu_dummy(struct device *dev)
2984 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2987 /* Check if the device needs to go through the non-identity map and unmap process. */
2988 static int iommu_no_mapping(struct device *dev)
2992 if (iommu_dummy(dev))
2995 if (!iommu_identity_mapping)
2998 found = identity_mapping(dev);
3000 if (iommu_should_identity_map(dev, 0))
3004 * A 32 bit DMA device is removed from si_domain and falls back
3005 * to non-identity mapping.
3007 domain_remove_one_dev_info(si_domain, dev);
3008 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
3014 * When a 64 bit DMA device has been detached from a VM, the device
3015 * is put back into si_domain for identity mapping.
3017 if (iommu_should_identity_map(dev, 0)) {
3019 ret = domain_add_dev_info(si_domain, dev,
3021 CONTEXT_TT_PASS_THROUGH :
3022 CONTEXT_TT_MULTI_LEVEL);
3024 printk(KERN_INFO "64bit %s uses identity mapping\n",
3034 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3035 size_t size, int dir, u64 dma_mask)
3037 struct dmar_domain *domain;
3038 phys_addr_t start_paddr;
3042 struct intel_iommu *iommu;
3043 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3045 BUG_ON(dir == DMA_NONE);
3047 if (iommu_no_mapping(dev))
3050 domain = get_valid_domain_for_dev(dev);
3054 iommu = domain_get_iommu(domain);
3055 size = aligned_nrpages(paddr, size);
3057 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3062 * Check if DMAR supports zero-length reads on write-only mappings
3065 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3066 !cap_zlr(iommu->cap))
3067 prot |= DMA_PTE_READ;
3068 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3069 prot |= DMA_PTE_WRITE;
3071 * paddr to (paddr + size) might span only part of a page; we should map the whole
3072 * page. Note: if two parts of one page are mapped separately, we
3073 * might have two guest addresses mapping to the same host paddr, but this
3074 * is not a big problem
3076 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3077 mm_to_dma_pfn(paddr_pfn), size, prot);
3081 /* it's a non-present to present mapping. Only flush if caching mode */
3082 if (cap_caching_mode(iommu->cap))
3083 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
3085 iommu_flush_write_buffer(iommu);
3087 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3088 start_paddr += paddr & ~PAGE_MASK;
3093 __free_iova(&domain->iovad, iova);
3094 printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
3095 dev_name(dev), size, (unsigned long long)paddr, dir);
3099 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3100 unsigned long offset, size_t size,
3101 enum dma_data_direction dir,
3102 struct dma_attrs *attrs)
3104 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3105 dir, *dev->dma_mask);
3108 static void flush_unmaps(void)
3114 /* just flush them all */
3115 for (i = 0; i < g_num_of_iommus; i++) {
3116 struct intel_iommu *iommu = g_iommus[i];
3120 if (!deferred_flush[i].next)
3123 /* In caching mode, global flushes turn emulation expensive */
3124 if (!cap_caching_mode(iommu->cap))
3125 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3126 DMA_TLB_GLOBAL_FLUSH);
3127 for (j = 0; j < deferred_flush[i].next; j++) {
3129 struct iova *iova = deferred_flush[i].iova[j];
3130 struct dmar_domain *domain = deferred_flush[i].domain[j];
3132 /* On real hardware multiple invalidations are expensive */
3133 if (cap_caching_mode(iommu->cap))
3134 iommu_flush_iotlb_psi(iommu, domain->id,
3135 iova->pfn_lo, iova_size(iova),
3136 !deferred_flush[i].freelist[j], 0);
3138 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3139 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3140 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3142 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3143 if (deferred_flush[i].freelist[j])
3144 dma_free_pagelist(deferred_flush[i].freelist[j]);
3146 deferred_flush[i].next = 0;
3152 static void flush_unmaps_timeout(unsigned long data)
3154 unsigned long flags;
3156 spin_lock_irqsave(&async_umap_flush_lock, flags);
3158 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3161 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3163 unsigned long flags;
3165 struct intel_iommu *iommu;
3167 spin_lock_irqsave(&async_umap_flush_lock, flags);
3168 if (list_size == HIGH_WATER_MARK)
3171 iommu = domain_get_iommu(dom);
3172 iommu_id = iommu->seq_id;
3174 next = deferred_flush[iommu_id].next;
3175 deferred_flush[iommu_id].domain[next] = dom;
3176 deferred_flush[iommu_id].iova[next] = iova;
3177 deferred_flush[iommu_id].freelist[next] = freelist;
3178 deferred_flush[iommu_id].next++;
3181 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3185 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3188 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3190 struct dmar_domain *domain;
3191 unsigned long start_pfn, last_pfn;
3193 struct intel_iommu *iommu;
3194 struct page *freelist;
3196 if (iommu_no_mapping(dev))
3199 domain = find_domain(dev);
3202 iommu = domain_get_iommu(domain);
3204 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3205 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3206 (unsigned long long)dev_addr))
3209 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3210 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3212 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3213 dev_name(dev), start_pfn, last_pfn);
3215 freelist = domain_unmap(domain, start_pfn, last_pfn);
3217 if (intel_iommu_strict) {
3218 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3219 last_pfn - start_pfn + 1, !freelist, 0);
3221 __free_iova(&domain->iovad, iova);
3222 dma_free_pagelist(freelist);
3224 add_unmap(domain, iova, freelist);
3226 * queue up the release of the unmap to save the roughly 1/6th of the
3227 * cpu time used up by the iotlb flush operation...
3232 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3233 size_t size, enum dma_data_direction dir,
3234 struct dma_attrs *attrs)
3236 intel_unmap(dev, dev_addr);
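/*
 * dma_alloc_coherent() backend: allocate zeroed pages, preferring the
 * contiguous memory allocator when the caller is allowed to sleep, then map
 * them with __intel_map_single() against the device's coherent DMA mask.
 */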
3239 static void *intel_alloc_coherent(struct device *dev, size_t size,
3240 dma_addr_t *dma_handle, gfp_t flags,
3241 struct dma_attrs *attrs)
3243 struct page *page = NULL;
3246 size = PAGE_ALIGN(size);
3247 order = get_order(size);
3249 if (!iommu_no_mapping(dev))
3250 flags &= ~(GFP_DMA | GFP_DMA32);
3251 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3252 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3258 if (flags & __GFP_WAIT) {
3259 unsigned int count = size >> PAGE_SHIFT;
3261 page = dma_alloc_from_contiguous(dev, count, order);
3262 if (page && iommu_no_mapping(dev) &&
3263 page_to_phys(page) + size > dev->coherent_dma_mask) {
3264 dma_release_from_contiguous(dev, page, count);
3270 page = alloc_pages(flags, order);
3273 memset(page_address(page), 0, size);
3275 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3277 dev->coherent_dma_mask);
3279 return page_address(page);
3280 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3281 __free_pages(page, order);
3286 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3287 dma_addr_t dma_handle, struct dma_attrs *attrs)
3290 struct page *page = virt_to_page(vaddr);
3292 size = PAGE_ALIGN(size);
3293 order = get_order(size);
3295 intel_unmap(dev, dma_handle);
3296 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3297 __free_pages(page, order);
3300 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3301 int nelems, enum dma_data_direction dir,
3302 struct dma_attrs *attrs)
3304 intel_unmap(dev, sglist[0].dma_address);
3307 static int intel_nontranslate_map_sg(struct device *hddev,
3308 struct scatterlist *sglist, int nelems, int dir)
3311 struct scatterlist *sg;
3313 for_each_sg(sglist, sg, nelems, i) {
3314 BUG_ON(!sg_page(sg));
3315 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3316 sg->dma_length = sg->length;
3321 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3322 enum dma_data_direction dir, struct dma_attrs *attrs)
3325 struct dmar_domain *domain;
3328 struct iova *iova = NULL;
3330 struct scatterlist *sg;
3331 unsigned long start_vpfn;
3332 struct intel_iommu *iommu;
3334 BUG_ON(dir == DMA_NONE);
3335 if (iommu_no_mapping(dev))
3336 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3338 domain = get_valid_domain_for_dev(dev);
3342 iommu = domain_get_iommu(domain);
3344 for_each_sg(sglist, sg, nelems, i)
3345 size += aligned_nrpages(sg->offset, sg->length);
3347 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3350 sglist->dma_length = 0;
3355 * Check if DMAR supports zero-length reads on write-only mappings
3358 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3359 !cap_zlr(iommu->cap))
3360 prot |= DMA_PTE_READ;
3361 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3362 prot |= DMA_PTE_WRITE;
3364 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3366 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3367 if (unlikely(ret)) {
3368 dma_pte_free_pagetable(domain, start_vpfn,
3369 start_vpfn + size - 1);
3370 __free_iova(&domain->iovad, iova);
3374 /* it's a non-present to present mapping. Only flush if caching mode */
3375 if (cap_caching_mode(iommu->cap))
3376 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3378 iommu_flush_write_buffer(iommu);
3383 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3388 struct dma_map_ops intel_dma_ops = {
3389 .alloc = intel_alloc_coherent,
3390 .free = intel_free_coherent,
3391 .map_sg = intel_map_sg,
3392 .unmap_sg = intel_unmap_sg,
3393 .map_page = intel_map_page,
3394 .unmap_page = intel_unmap_page,
3395 .mapping_error = intel_mapping_error,
3398 static inline int iommu_domain_cache_init(void)
3402 iommu_domain_cache = kmem_cache_create("iommu_domain",
3403 sizeof(struct dmar_domain),
3408 if (!iommu_domain_cache) {
3409 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3416 static inline int iommu_devinfo_cache_init(void)
3420 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3421 sizeof(struct device_domain_info),
3425 if (!iommu_devinfo_cache) {
3426 printk(KERN_ERR "Couldn't create devinfo cache\n");
3433 static int __init iommu_init_mempool(void)
3436 ret = iommu_iova_cache_init();
3440 ret = iommu_domain_cache_init();
3444 ret = iommu_devinfo_cache_init();
3448 kmem_cache_destroy(iommu_domain_cache);
3450 iommu_iova_cache_destroy();
3455 static void __init iommu_exit_mempool(void)
3457 kmem_cache_destroy(iommu_devinfo_cache);
3458 kmem_cache_destroy(iommu_domain_cache);
3459 iommu_iova_cache_destroy();
3462 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3464 struct dmar_drhd_unit *drhd;
3468 /* We know that this device on this chipset has its own IOMMU.
3469 * If we find it under a different IOMMU, then the BIOS is lying
3470 * to us. Hope that the IOMMU for this device is actually
3471 * disabled, and it needs no translation...
3473 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3475 /* "can't" happen */
3476 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3479 vtbar &= 0xffff0000;
3481 /* we know that this iommu should be at offset 0xa000 from vtbar */
3482 drhd = dmar_find_matched_drhd_unit(pdev);
3483 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3484 TAINT_FIRMWARE_WORKAROUND,
3485 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3486 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3488 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
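/*
 * Scan the DRHD units: a unit whose scope covers no devices at all is
 * ignored, and a unit covering only graphics devices is either bypassed or
 * recorded via intel_iommu_gfx_mapped, depending on whether graphics
 * mapping is enabled.
 */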
3490 static void __init init_no_remapping_devices(void)
3492 struct dmar_drhd_unit *drhd;
3496 for_each_drhd_unit(drhd) {
3497 if (!drhd->include_all) {
3498 for_each_active_dev_scope(drhd->devices,
3499 drhd->devices_cnt, i, dev)
3501 /* ignore DMAR unit if no devices exist */
3502 if (i == drhd->devices_cnt)
3507 for_each_active_drhd_unit(drhd) {
3508 if (drhd->include_all)
3511 for_each_active_dev_scope(drhd->devices,
3512 drhd->devices_cnt, i, dev)
3513 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3515 if (i < drhd->devices_cnt)
3518 /* This IOMMU has *only* gfx devices. Either bypass it or
3519 set the gfx_mapped flag, as appropriate */
3521 intel_iommu_gfx_mapped = 1;
3524 for_each_active_dev_scope(drhd->devices,
3525 drhd->devices_cnt, i, dev)
3526 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3531 #ifdef CONFIG_SUSPEND
3532 static int init_iommu_hw(void)
3534 struct dmar_drhd_unit *drhd;
3535 struct intel_iommu *iommu = NULL;
3537 for_each_active_iommu(iommu, drhd)
3539 dmar_reenable_qi(iommu);
3541 for_each_iommu(iommu, drhd) {
3542 if (drhd->ignored) {
3544 * we always have to disable PMRs or DMA may fail on
3548 iommu_disable_protect_mem_regions(iommu);
3552 iommu_flush_write_buffer(iommu);
3554 iommu_set_root_entry(iommu);
3556 iommu->flush.flush_context(iommu, 0, 0, 0,
3557 DMA_CCMD_GLOBAL_INVL);
3558 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3559 iommu_enable_translation(iommu);
3560 iommu_disable_protect_mem_regions(iommu);
3566 static void iommu_flush_all(void)
3568 struct dmar_drhd_unit *drhd;
3569 struct intel_iommu *iommu;
3571 for_each_active_iommu(iommu, drhd) {
3572 iommu->flush.flush_context(iommu, 0, 0, 0,
3573 DMA_CCMD_GLOBAL_INVL);
3574 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3575 DMA_TLB_GLOBAL_FLUSH);
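/*
 * Suspend path: save each active IOMMU's fault-event control, data and
 * address registers and disable translation; iommu_resume() restores the
 * saved state and re-enables translation via init_iommu_hw().
 */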
3579 static int iommu_suspend(void)
3581 struct dmar_drhd_unit *drhd;
3582 struct intel_iommu *iommu = NULL;
3585 for_each_active_iommu(iommu, drhd) {
3586 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3588 if (!iommu->iommu_state)
3594 for_each_active_iommu(iommu, drhd) {
3595 iommu_disable_translation(iommu);
3597 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3599 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3600 readl(iommu->reg + DMAR_FECTL_REG);
3601 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3602 readl(iommu->reg + DMAR_FEDATA_REG);
3603 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3604 readl(iommu->reg + DMAR_FEADDR_REG);
3605 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3606 readl(iommu->reg + DMAR_FEUADDR_REG);
3608 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3613 for_each_active_iommu(iommu, drhd)
3614 kfree(iommu->iommu_state);
3619 static void iommu_resume(void)
3621 struct dmar_drhd_unit *drhd;
3622 struct intel_iommu *iommu = NULL;
3625 if (init_iommu_hw()) {
3627 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3629 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3633 for_each_active_iommu(iommu, drhd) {
3635 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3637 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3638 iommu->reg + DMAR_FECTL_REG);
3639 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3640 iommu->reg + DMAR_FEDATA_REG);
3641 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3642 iommu->reg + DMAR_FEADDR_REG);
3643 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3644 iommu->reg + DMAR_FEUADDR_REG);
3646 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3649 for_each_active_iommu(iommu, drhd)
3650 kfree(iommu->iommu_state);
3653 static struct syscore_ops iommu_syscore_ops = {
3654 .resume = iommu_resume,
3655 .suspend = iommu_suspend,
3658 static void __init init_iommu_pm_ops(void)
3660 register_syscore_ops(&iommu_syscore_ops);
3664 static inline void init_iommu_pm_ops(void) {}
3665 #endif /* CONFIG_PM */
3668 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3670 struct acpi_dmar_reserved_memory *rmrr;
3671 struct dmar_rmrr_unit *rmrru;
3673 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3677 rmrru->hdr = header;
3678 rmrr = (struct acpi_dmar_reserved_memory *)header;
3679 rmrru->base_address = rmrr->base_address;
3680 rmrru->end_address = rmrr->end_address;
3681 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3682 ((void *)rmrr) + rmrr->header.length,
3683 &rmrru->devices_cnt);
3684 if (rmrru->devices_cnt && rmrru->devices == NULL) {
3689 list_add(&rmrru->list, &dmar_rmrr_units);
3694 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3696 struct dmar_atsr_unit *atsru;
3697 struct acpi_dmar_atsr *tmp;
3699 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3700 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3701 if (atsr->segment != tmp->segment)
3703 if (atsr->header.length != tmp->header.length)
3705 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3712 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3714 struct acpi_dmar_atsr *atsr;
3715 struct dmar_atsr_unit *atsru;
3717 if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
3720 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3721 atsru = dmar_find_atsr(atsr);
3725 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3730 * If memory is allocated from slab by ACPI _DSM method, we need to
3731 * copy the memory content because the memory buffer will be freed
3734 atsru->hdr = (void *)(atsru + 1);
3735 memcpy(atsru->hdr, hdr, hdr->length);
3736 atsru->include_all = atsr->flags & 0x1;
3737 if (!atsru->include_all) {
3738 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3739 (void *)atsr + atsr->header.length,
3740 &atsru->devices_cnt);
3741 if (atsru->devices_cnt && atsru->devices == NULL) {
3747 list_add_rcu(&atsru->list, &dmar_atsr_units);
3752 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3754 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3758 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3760 struct acpi_dmar_atsr *atsr;
3761 struct dmar_atsr_unit *atsru;
3763 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3764 atsru = dmar_find_atsr(atsr);
3766 list_del_rcu(&atsru->list);
3768 intel_iommu_free_atsr(atsru);
3774 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3778 struct acpi_dmar_atsr *atsr;
3779 struct dmar_atsr_unit *atsru;
3781 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3782 atsru = dmar_find_atsr(atsr);
3786 if (!atsru->include_all && atsru->devices && atsru->devices_cnt)
3787 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3794 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3797 struct intel_iommu *iommu = dmaru->iommu;
3799 if (g_iommus[iommu->seq_id])
3802 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3803 pr_warn("IOMMU: %s doesn't support hardware pass through.\n",
3807 if (!ecap_sc_support(iommu->ecap) &&
3808 domain_update_iommu_snooping(iommu)) {
3809 pr_warn("IOMMU: %s doesn't support snooping.\n",
3813 sp = domain_update_iommu_superpage(iommu) - 1;
3814 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3815 pr_warn("IOMMU: %s doesn't support large page.\n",
3821 * Disable translation if already enabled prior to OS handover.
3823 if (iommu->gcmd & DMA_GCMD_TE)
3824 iommu_disable_translation(iommu);
3826 g_iommus[iommu->seq_id] = iommu;
3827 ret = iommu_init_domains(iommu);
3829 ret = iommu_alloc_root_entry(iommu);
3833 if (dmaru->ignored) {
3835 * we always have to disable PMRs or DMA may fail on this device
3838 iommu_disable_protect_mem_regions(iommu);
3842 intel_iommu_init_qi(iommu);
3843 iommu_flush_write_buffer(iommu);
3844 ret = dmar_set_interrupt(iommu);
3848 iommu_set_root_entry(iommu);
3849 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3850 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3851 iommu_enable_translation(iommu);
3854 ret = iommu_attach_domain(si_domain, iommu);
3855 if (ret < 0 || si_domain->id != ret)
3857 domain_attach_iommu(si_domain, iommu);
3860 iommu_disable_protect_mem_regions(iommu);
3864 disable_dmar_iommu(iommu);
3866 free_dmar_iommu(iommu);
3870 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3873 struct intel_iommu *iommu = dmaru->iommu;
3875 if (!intel_iommu_enabled)
3881 ret = intel_iommu_add(dmaru);
3883 disable_dmar_iommu(iommu);
3884 free_dmar_iommu(iommu);
3890 static void intel_iommu_free_dmars(void)
3892 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3893 struct dmar_atsr_unit *atsru, *atsr_n;
3895 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3896 list_del(&rmrru->list);
3897 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3901 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3902 list_del(&atsru->list);
3903 intel_iommu_free_atsr(atsru);
3907 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3910 struct pci_bus *bus;
3911 struct pci_dev *bridge = NULL;
3913 struct acpi_dmar_atsr *atsr;
3914 struct dmar_atsr_unit *atsru;
3916 dev = pci_physfn(dev);
3917 for (bus = dev->bus; bus; bus = bus->parent) {
3919 if (!bridge || !pci_is_pcie(bridge) ||
3920 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3922 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3929 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3930 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3931 if (atsr->segment != pci_domain_nr(dev->bus))
3934 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3935 if (tmp == &bridge->dev)
3938 if (atsru->include_all)
3948 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3951 struct dmar_rmrr_unit *rmrru;
3952 struct dmar_atsr_unit *atsru;
3953 struct acpi_dmar_atsr *atsr;
3954 struct acpi_dmar_reserved_memory *rmrr;
3956 if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3959 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3960 rmrr = container_of(rmrru->hdr,
3961 struct acpi_dmar_reserved_memory, header);
3962 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3963 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3964 ((void *)rmrr) + rmrr->header.length,
3965 rmrr->segment, rmrru->devices,
3966 rmrru->devices_cnt);
3969 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3970 dmar_remove_dev_scope(info, rmrr->segment,
3971 rmrru->devices, rmrru->devices_cnt);
3975 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3976 if (atsru->include_all)
3979 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3980 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3981 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3982 (void *)atsr + atsr->header.length,
3983 atsr->segment, atsru->devices,
3984 atsru->devices_cnt);
3989 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3990 if (dmar_remove_dev_scope(info, atsr->segment,
3991 atsru->devices, atsru->devices_cnt))
4000 * Here we only respond to a device being unbound from its driver.
4002 * A newly added device is not attached to its DMAR domain here yet; that happens
4003 * when the device is first mapped to an iova.
4005 static int device_notifier(struct notifier_block *nb,
4006 unsigned long action, void *data)
4008 struct device *dev = data;
4009 struct dmar_domain *domain;
4011 if (iommu_dummy(dev))
4014 if (action != BUS_NOTIFY_REMOVED_DEVICE)
4017 domain = find_domain(dev);
4021 down_read(&dmar_global_lock);
4022 domain_remove_one_dev_info(domain, dev);
4023 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4024 domain_exit(domain);
4025 up_read(&dmar_global_lock);
4030 static struct notifier_block device_nb = {
4031 .notifier_call = device_notifier,
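/*
 * Memory hotplug notifier: extend the static identity domain when a range
 * goes online, and tear the corresponding mappings and IOVAs back down when
 * it goes offline (or fails to come online).
 */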
4034 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4035 unsigned long val, void *v)
4037 struct memory_notify *mhp = v;
4038 unsigned long long start, end;
4039 unsigned long start_vpfn, last_vpfn;
4042 case MEM_GOING_ONLINE:
4043 start = mhp->start_pfn << PAGE_SHIFT;
4044 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4045 if (iommu_domain_identity_map(si_domain, start, end)) {
4046 pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
4053 case MEM_CANCEL_ONLINE:
4054 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4055 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4056 while (start_vpfn <= last_vpfn) {
4058 struct dmar_drhd_unit *drhd;
4059 struct intel_iommu *iommu;
4060 struct page *freelist;
4062 iova = find_iova(&si_domain->iovad, start_vpfn);
4064 pr_debug("dmar: failed get IOVA for PFN %lx\n",
4069 iova = split_and_remove_iova(&si_domain->iovad, iova,
4070 start_vpfn, last_vpfn);
4072 pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
4073 start_vpfn, last_vpfn);
4077 freelist = domain_unmap(si_domain, iova->pfn_lo,
4081 for_each_active_iommu(iommu, drhd)
4082 iommu_flush_iotlb_psi(iommu, si_domain->id,
4083 iova->pfn_lo, iova_size(iova),
4086 dma_free_pagelist(freelist);
4088 start_vpfn = iova->pfn_hi + 1;
4089 free_iova_mem(iova);
4097 static struct notifier_block intel_iommu_memory_nb = {
4098 .notifier_call = intel_iommu_memory_notifier,
4103 static ssize_t intel_iommu_show_version(struct device *dev,
4104 struct device_attribute *attr,
4107 struct intel_iommu *iommu = dev_get_drvdata(dev);
4108 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4109 return sprintf(buf, "%d:%d\n",
4110 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4112 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4114 static ssize_t intel_iommu_show_address(struct device *dev,
4115 struct device_attribute *attr,
4118 struct intel_iommu *iommu = dev_get_drvdata(dev);
4119 return sprintf(buf, "%llx\n", iommu->reg_phys);
4121 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4123 static ssize_t intel_iommu_show_cap(struct device *dev,
4124 struct device_attribute *attr,
4127 struct intel_iommu *iommu = dev_get_drvdata(dev);
4128 return sprintf(buf, "%llx\n", iommu->cap);
4130 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4132 static ssize_t intel_iommu_show_ecap(struct device *dev,
4133 struct device_attribute *attr,
4136 struct intel_iommu *iommu = dev_get_drvdata(dev);
4137 return sprintf(buf, "%llx\n", iommu->ecap);
4139 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4141 static struct attribute *intel_iommu_attrs[] = {
4142 &dev_attr_version.attr,
4143 &dev_attr_address.attr,
4145 &dev_attr_ecap.attr,
4149 static struct attribute_group intel_iommu_group = {
4150 .name = "intel-iommu",
4151 .attrs = intel_iommu_attrs,
4154 const struct attribute_group *intel_iommu_groups[] = {
4159 int __init intel_iommu_init(void)
4162 struct dmar_drhd_unit *drhd;
4163 struct intel_iommu *iommu;
4165 /* VT-d is required for a TXT/tboot launch, so enforce that */
4166 force_on = tboot_force_iommu();
4168 if (iommu_init_mempool()) {
4170 panic("tboot: Failed to initialize iommu memory\n");
4174 down_write(&dmar_global_lock);
4175 if (dmar_table_init()) {
4177 panic("tboot: Failed to initialize DMAR table\n");
4182 * Disable translation if already enabled prior to OS handover.
4184 for_each_active_iommu(iommu, drhd)
4185 if (iommu->gcmd & DMA_GCMD_TE)
4186 iommu_disable_translation(iommu);
4188 if (dmar_dev_scope_init() < 0) {
4190 panic("tboot: Failed to initialize DMAR device scope\n");
4194 if (no_iommu || dmar_disabled)
4197 if (list_empty(&dmar_rmrr_units))
4198 printk(KERN_INFO "DMAR: No RMRR found\n");
4200 if (list_empty(&dmar_atsr_units))
4201 printk(KERN_INFO "DMAR: No ATSR found\n");
4203 if (dmar_init_reserved_ranges()) {
4205 panic("tboot: Failed to reserve iommu ranges\n");
4206 goto out_free_reserved_range;
4209 init_no_remapping_devices();
4214 panic("tboot: Failed to initialize DMARs\n");
4215 printk(KERN_ERR "IOMMU: dmar init failed\n");
4216 goto out_free_reserved_range;
4218 up_write(&dmar_global_lock);
4220 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
4222 init_timer(&unmap_timer);
4223 #ifdef CONFIG_SWIOTLB
4226 dma_ops = &intel_dma_ops;
4228 init_iommu_pm_ops();
4230 for_each_active_iommu(iommu, drhd)
4231 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4235 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4236 bus_register_notifier(&pci_bus_type, &device_nb);
4237 if (si_domain && !hw_pass_through)
4238 register_memory_notifier(&intel_iommu_memory_nb);
4240 intel_iommu_enabled = 1;
4244 out_free_reserved_range:
4245 put_iova_domain(&reserved_iova_list);
4247 intel_iommu_free_dmars();
4248 up_write(&dmar_global_lock);
4249 iommu_exit_mempool();
4253 static int iommu_detach_dev_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4255 struct intel_iommu *iommu = opaque;
4257 iommu_detach_dev(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4262 * NB - intel-iommu lacks any sort of reference counting for the users of
4263 * dependent devices. If multiple endpoints have intersecting dependent
4264 * devices, unbinding the driver from any one of them will possibly leave
4265 * the others unable to operate.
4267 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
4270 if (!iommu || !dev || !dev_is_pci(dev))
4273 pci_for_each_dma_alias(to_pci_dev(dev), &iommu_detach_dev_cb, iommu);
4276 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4279 struct device_domain_info *info, *tmp;
4280 struct intel_iommu *iommu;
4281 unsigned long flags;
4285 iommu = device_to_iommu(dev, &bus, &devfn);
4289 spin_lock_irqsave(&device_domain_lock, flags);
4290 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4291 if (info->iommu == iommu && info->bus == bus &&
4292 info->devfn == devfn) {
4293 unlink_domain_info(info);
4294 spin_unlock_irqrestore(&device_domain_lock, flags);
4296 iommu_disable_dev_iotlb(info);
4297 iommu_detach_dev(iommu, info->bus, info->devfn);
4298 iommu_detach_dependent_devices(iommu, dev);
4299 free_devinfo_mem(info);
4301 spin_lock_irqsave(&device_domain_lock, flags);
4309 /* If there are no other devices under the same iommu
4310 * owned by this domain, clear this iommu in iommu_bmp and
4311 * update the iommu count and coherency
4313 if (info->iommu == iommu)
4317 spin_unlock_irqrestore(&device_domain_lock, flags);
4320 domain_detach_iommu(domain, iommu);
4321 if (!domain_type_is_vm_or_si(domain))
4322 iommu_detach_domain(domain, iommu);
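/*
 * Minimal domain init used for domains created through the IOMMU API (and
 * for the static identity domain): set up the IOVA allocator, the address
 * widths and an empty top-level page directory, without programming any
 * hardware.
 */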
4326 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4330 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4332 domain_reserve_special_ranges(domain);
4334 /* calculate AGAW */
4335 domain->gaw = guest_width;
4336 adjust_width = guestwidth_to_adjustwidth(guest_width);
4337 domain->agaw = width_to_agaw(adjust_width);
4339 domain->iommu_coherency = 0;
4340 domain->iommu_snooping = 0;
4341 domain->iommu_superpage = 0;
4342 domain->max_addr = 0;
4344 /* always allocate the top pgd */
4345 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4348 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4352 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4354 struct dmar_domain *dmar_domain;
4355 struct iommu_domain *domain;
4357 if (type != IOMMU_DOMAIN_UNMANAGED)
4360 dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4363 "intel_iommu_domain_init: dmar_domain == NULL\n");
4366 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4368 "intel_iommu_domain_init() failed\n");
4369 domain_exit(dmar_domain);
4372 domain_update_iommu_cap(dmar_domain);
4374 domain = &dmar_domain->domain;
4375 domain->geometry.aperture_start = 0;
4376 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4377 domain->geometry.force_aperture = true;
4382 static void intel_iommu_domain_free(struct iommu_domain *domain)
4384 domain_exit(to_dmar_domain(domain));
4387 static int intel_iommu_attach_device(struct iommu_domain *domain,
4390 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4391 struct intel_iommu *iommu;
4395 if (device_is_rmrr_locked(dev)) {
4396 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4400 /* normally dev is not mapped */
4401 if (unlikely(domain_context_mapped(dev))) {
4402 struct dmar_domain *old_domain;
4404 old_domain = find_domain(dev);
4406 if (domain_type_is_vm_or_si(dmar_domain))
4407 domain_remove_one_dev_info(old_domain, dev);
4409 domain_remove_dev_info(old_domain);
4411 if (!domain_type_is_vm_or_si(old_domain) &&
4412 list_empty(&old_domain->devices))
4413 domain_exit(old_domain);
4417 iommu = device_to_iommu(dev, &bus, &devfn);
4421 /* check if this iommu agaw is sufficient for max mapped address */
4422 addr_width = agaw_to_width(iommu->agaw);
4423 if (addr_width > cap_mgaw(iommu->cap))
4424 addr_width = cap_mgaw(iommu->cap);
4426 if (dmar_domain->max_addr > (1LL << addr_width)) {
4427 printk(KERN_ERR "%s: iommu width (%d) is not "
4428 "sufficient for the mapped address (%llx)\n",
4429 __func__, addr_width, dmar_domain->max_addr);
4432 dmar_domain->gaw = addr_width;
4435 * Knock out extra levels of page tables if necessary
4437 while (iommu->agaw < dmar_domain->agaw) {
4438 struct dma_pte *pte;
4440 pte = dmar_domain->pgd;
4441 if (dma_pte_present(pte)) {
4442 dmar_domain->pgd = (struct dma_pte *)
4443 phys_to_virt(dma_pte_addr(pte));
4444 free_pgtable_page(pte);
4446 dmar_domain->agaw--;
4449 return domain_add_dev_info(dmar_domain, dev, CONTEXT_TT_MULTI_LEVEL);
4452 static void intel_iommu_detach_device(struct iommu_domain *domain,
4455 domain_remove_one_dev_info(to_dmar_domain(domain), dev);
4458 static int intel_iommu_map(struct iommu_domain *domain,
4459 unsigned long iova, phys_addr_t hpa,
4460 size_t size, int iommu_prot)
4462 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4467 if (iommu_prot & IOMMU_READ)
4468 prot |= DMA_PTE_READ;
4469 if (iommu_prot & IOMMU_WRITE)
4470 prot |= DMA_PTE_WRITE;
4471 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4472 prot |= DMA_PTE_SNP;
4474 max_addr = iova + size;
4475 if (dmar_domain->max_addr < max_addr) {
4478 /* check if minimum agaw is sufficient for mapped address */
4479 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4480 if (end < max_addr) {
4481 printk(KERN_ERR "%s: iommu width (%d) is not "
4482 "sufficient for the mapped address (%llx)\n",
4483 __func__, dmar_domain->gaw, max_addr);
4486 dmar_domain->max_addr = max_addr;
4488 /* Round up size to next multiple of PAGE_SIZE, if it and
4489 the low bits of hpa would take us onto the next page */
4490 size = aligned_nrpages(hpa, size);
4491 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4492 hpa >> VTD_PAGE_SHIFT, size, prot);
4496 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4497 unsigned long iova, size_t size)
4499 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4500 struct page *freelist = NULL;
4501 struct intel_iommu *iommu;
4502 unsigned long start_pfn, last_pfn;
4503 unsigned int npages;
4504 int iommu_id, num, ndomains, level = 0;
4506 /* Cope with horrid API which requires us to unmap more than the
4507 size argument if it happens to be a large-page mapping. */
4508 if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4511 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4512 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4514 start_pfn = iova >> VTD_PAGE_SHIFT;
4515 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4517 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4519 npages = last_pfn - start_pfn + 1;
4521 for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4522 iommu = g_iommus[iommu_id];
4525 * find bit position of dmar_domain
4527 ndomains = cap_ndoms(iommu->cap);
4528 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4529 if (iommu->domains[num] == dmar_domain)
4530 iommu_flush_iotlb_psi(iommu, num, start_pfn,
4531 npages, !freelist, 0);
4536 dma_free_pagelist(freelist);
4538 if (dmar_domain->max_addr == iova + size)
4539 dmar_domain->max_addr = iova;
4544 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4547 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4548 struct dma_pte *pte;
4552 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4554 phys = dma_pte_addr(pte);
4559 static bool intel_iommu_capable(enum iommu_cap cap)
4561 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4562 return domain_update_iommu_snooping(NULL) == 1;
4563 if (cap == IOMMU_CAP_INTR_REMAP)
4564 return irq_remapping_enabled == 1;
4569 static int intel_iommu_add_device(struct device *dev)
4571 struct intel_iommu *iommu;
4572 struct iommu_group *group;
4575 iommu = device_to_iommu(dev, &bus, &devfn);
4579 iommu_device_link(iommu->iommu_dev, dev);
4581 group = iommu_group_get_for_dev(dev);
4584 return PTR_ERR(group);
4586 iommu_group_put(group);
4590 static void intel_iommu_remove_device(struct device *dev)
4592 struct intel_iommu *iommu;
4595 iommu = device_to_iommu(dev, &bus, &devfn);
4599 iommu_group_remove_device(dev);
4601 iommu_device_unlink(iommu->iommu_dev, dev);
4604 static const struct iommu_ops intel_iommu_ops = {
4605 .capable = intel_iommu_capable,
4606 .domain_alloc = intel_iommu_domain_alloc,
4607 .domain_free = intel_iommu_domain_free,
4608 .attach_dev = intel_iommu_attach_device,
4609 .detach_dev = intel_iommu_detach_device,
4610 .map = intel_iommu_map,
4611 .unmap = intel_iommu_unmap,
4612 .map_sg = default_iommu_map_sg,
4613 .iova_to_phys = intel_iommu_iova_to_phys,
4614 .add_device = intel_iommu_add_device,
4615 .remove_device = intel_iommu_remove_device,
4616 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
4619 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4621 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4622 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4626 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4627 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4628 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4629 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4630 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4631 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4632 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4634 static void quirk_iommu_rwbf(struct pci_dev *dev)
4637 * Mobile 4 Series Chipset neglects to set RWBF capability,
4638 * but needs it. Same seems to hold for the desktop versions.
4640 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4644 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4645 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4646 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4647 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4648 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4649 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4650 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4653 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4654 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4655 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4656 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4657 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4658 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4659 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4660 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
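/*
 * Ironlake/Calpella graphics: if the BIOS allocated no VT-d-enabled stolen
 * memory there is no shadow GTT to remap through, so graphics DMA remapping
 * is disabled; when it is enabled, batched IOTLB flushing is avoided by
 * forcing strict flushing, since the GPU must be idle before a flush.
 */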
4662 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4666 if (pci_read_config_word(dev, GGC, &ggc))
4669 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4670 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4672 } else if (dmar_map_gfx) {
4673 /* we have to ensure the gfx device is idle before we flush */
4674 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4675 intel_iommu_strict = 1;
4678 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4679 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4680 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4681 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4683 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4684 ISOCH DMAR unit for the Azalia sound device, but not give it any
4685 TLB entries, which causes it to deadlock. Check for that. We do
4686 this in a function called from init_dmars(), instead of in a PCI
4687 quirk, because we don't want to print the obnoxious "BIOS broken"
4688 message if VT-d is actually disabled.
4690 static void __init check_tylersburg_isoch(void)
4692 struct pci_dev *pdev;
4693 uint32_t vtisochctrl;
4695 /* If there's no Azalia in the system anyway, forget it. */
4696 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4701 /* System Management Registers. Might be hidden, in which case
4702 we can't do the sanity check. But that's OK, because the
4703 known-broken BIOSes _don't_ actually hide it, so far. */
4704 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4708 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4715 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4716 if (vtisochctrl & 1)
4719 /* Drop all bits other than the number of TLB entries */
4720 vtisochctrl &= 0x1c;
4722 /* If we have the recommended number of TLB entries (16), fine. */
4723 if (vtisochctrl == 0x10)
4726 /* Zero TLB entries? You get to ride the short bus to school. */
4728 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4729 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4730 dmi_get_system_info(DMI_BIOS_VENDOR),
4731 dmi_get_system_info(DMI_BIOS_VERSION),
4732 dmi_get_system_info(DMI_PRODUCT_VERSION));
4733 iommu_identity_mapping |= IDENTMAP_AZALIA;
4737 printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",