2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
18 * Joerg Roedel <jroedel@suse.de>
21 #define pr_fmt(fmt) "DMAR: " fmt
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <linux/dma-contiguous.h>
46 #include <linux/crash_dump.h>
47 #include <asm/irq_remapping.h>
48 #include <asm/cacheflush.h>
49 #include <asm/iommu.h>
51 #include "irq_remapping.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
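/*
 * Illustrative arithmetic (not from the original source): with the default
 * gaw of 48, __DOMAIN_MAX_PFN(48) is (1ULL << 36) - 1, which fits in an
 * unsigned long on 64-bit builds; on a 32-bit build DOMAIN_MAX_PFN() clamps
 * the value to ULONG_MAX so PFN arithmetic never overflows.
 */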
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
83 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
84 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
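/*
 * For example (illustrative only): with 4KiB pages, DMA_32BIT_PFN is
 * DMA_BIT_MASK(32) >> 12 == 0xfffff, i.e. the last page frame below 4GiB.
 */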
86 /* page table handling */
87 #define LEVEL_STRIDE (9)
88 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
91 * This bitmap is used to advertise the page sizes our hardware supports
92 * to the IOMMU core, which will then use this information to split
93 * physically contiguous memory regions it is mapping into page sizes
96 * Traditionally the IOMMU core just handed us the mappings directly,
97 * after making sure the size is an order of a 4KiB page and that the
98 * mapping has natural alignment.
100 * To retain this behavior, we currently advertise that we support
101 * all page sizes that are an order of 4KiB.
103 * If at some point we'd like to utilize the IOMMU core's new behavior,
104 * we could change this to advertise the real page sizes we support.
106 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
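/*
 * Illustrative note: ~0xFFFUL leaves every bit from 12 upward set, so the
 * bitmap advertises 4KiB (bit 12), 8KiB (bit 13), ..., 2MiB (bit 21),
 * 1GiB (bit 30) and every other power-of-two size of at least 4KiB,
 * matching the "all page sizes that are an order of 4KiB" behaviour
 * described above.
 */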
108 static inline int agaw_to_level(int agaw)
113 static inline int agaw_to_width(int agaw)
115 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
118 static inline int width_to_agaw(int width)
120 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
123 static inline unsigned int level_to_offset_bits(int level)
125 return (level - 1) * LEVEL_STRIDE;
128 static inline int pfn_level_offset(unsigned long pfn, int level)
130 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
133 static inline unsigned long level_mask(int level)
135 return -1UL << level_to_offset_bits(level);
138 static inline unsigned long level_size(int level)
140 return 1UL << level_to_offset_bits(level);
143 static inline unsigned long align_to_level(unsigned long pfn, int level)
145 return (pfn + level_size(level) - 1) & level_mask(level);
148 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
150 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
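/*
 * Worked example (illustrative, not part of the driver): each level of the
 * page table consumes LEVEL_STRIDE (9) bits of the DMA pfn, so for a
 * hypothetical pfn of 0x12345 the helpers above give:
 *
 *	pfn_level_offset(0x12345, 3) == (0x12345 >> 18) & 0x1ff == 0x000
 *	pfn_level_offset(0x12345, 2) == (0x12345 >>  9) & 0x1ff == 0x091
 *	pfn_level_offset(0x12345, 1) == (0x12345 >>  0) & 0x1ff == 0x145
 *
 * i.e. the indices used to walk from the top of a 3-level table down to
 * the 4KiB leaf entry.
 */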
153 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
154 are never going to work. */
155 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
157 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
162 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
164 static inline unsigned long page_to_dma_pfn(struct page *pg)
166 return mm_to_dma_pfn(page_to_pfn(pg));
168 static inline unsigned long virt_to_dma_pfn(void *p)
170 return page_to_dma_pfn(virt_to_page(p));
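/*
 * Note (illustrative): on x86 both PAGE_SHIFT and VTD_PAGE_SHIFT are 12,
 * so mm_to_dma_pfn()/dma_to_mm_pfn() are identity conversions there; the
 * shifts only matter on configurations where the MM page size is larger
 * than the 4KiB VT-d page size.
 */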
173 /* global iommu list, set NULL for ignored DMAR units */
174 static struct intel_iommu **g_iommus;
176 static void __init check_tylersburg_isoch(void);
177 static int rwbf_quirk;
180 * set to 1 to panic the kernel if we can't successfully enable VT-d
181 * (used when kernel is launched w/ TXT)
183 static int force_on = 0;
188 * 12-63: Context Ptr (12 - (haw-1))
195 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
198 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
201 static phys_addr_t root_entry_lctp(struct root_entry *re)
206 return re->lo & VTD_PAGE_MASK;
210 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
213 static phys_addr_t root_entry_uctp(struct root_entry *re)
218 return re->hi & VTD_PAGE_MASK;
223 * 1: fault processing disable
224 * 2-3: translation type
225 * 12-63: address space root
231 struct context_entry {
236 static inline void context_clear_pasid_enable(struct context_entry *context)
238 context->lo &= ~(1ULL << 11);
241 static inline bool context_pasid_enabled(struct context_entry *context)
243 return !!(context->lo & (1ULL << 11));
246 static inline void context_set_copied(struct context_entry *context)
248 context->hi |= (1ull << 3);
251 static inline bool context_copied(struct context_entry *context)
253 return !!(context->hi & (1ULL << 3));
256 static inline bool __context_present(struct context_entry *context)
258 return (context->lo & 1);
261 static inline bool context_present(struct context_entry *context)
263 return context_pasid_enabled(context) ?
264 __context_present(context) :
265 __context_present(context) && !context_copied(context);
268 static inline void context_set_present(struct context_entry *context)
273 static inline void context_set_fault_enable(struct context_entry *context)
275 context->lo &= (((u64)-1) << 2) | 1;
278 static inline void context_set_translation_type(struct context_entry *context,
281 context->lo &= (((u64)-1) << 4) | 3;
282 context->lo |= (value & 3) << 2;
285 static inline void context_set_address_root(struct context_entry *context,
288 context->lo &= ~VTD_PAGE_MASK;
289 context->lo |= value & VTD_PAGE_MASK;
292 static inline void context_set_address_width(struct context_entry *context,
295 context->hi |= value & 7;
298 static inline void context_set_domain_id(struct context_entry *context,
301 context->hi |= (value & ((1 << 16) - 1)) << 8;
304 static inline int context_domain_id(struct context_entry *c)
306 return((c->hi >> 8) & 0xffff);
309 static inline void context_clear_entry(struct context_entry *context)
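/*
 * Illustrative example (not from the original source): given the helpers
 * above, context_set_domain_id(c, 42) ORs (42 & 0xffff) << 8 into c->hi,
 * and context_domain_id(c) then reads the same value back from bits 8-23;
 * the other setters pack the present bit, translation type and address
 * root into c->lo in the same way.
 */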
322 * 12-63: Host physical address
328 static inline void dma_clear_pte(struct dma_pte *pte)
333 static inline u64 dma_pte_addr(struct dma_pte *pte)
336 return pte->val & VTD_PAGE_MASK;
338 /* Must have a full atomic 64-bit read */
339 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
343 static inline bool dma_pte_present(struct dma_pte *pte)
345 return (pte->val & 3) != 0;
348 static inline bool dma_pte_superpage(struct dma_pte *pte)
350 return (pte->val & DMA_PTE_LARGE_PAGE);
353 static inline int first_pte_in_page(struct dma_pte *pte)
355 return !((unsigned long)pte & ~VTD_PAGE_MASK);
359 * This domain is a static identity mapping domain.
360 * 1. This domain creates a static 1:1 mapping to all usable memory.
361 * 2. It maps to each iommu if successful.
362 * 3. Each iommu maps to this domain if successful.
364 static struct dmar_domain *si_domain;
365 static int hw_pass_through = 1;
368 * Domain represents a virtual machine; more than one device
369 * across iommus may be owned by one domain, e.g. a kvm guest.
371 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
373 /* si_domain contains multiple devices */
374 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
376 #define for_each_domain_iommu(idx, domain) \
377 for (idx = 0; idx < g_num_of_iommus; idx++) \
378 if (domain->iommu_refcnt[idx])
381 int nid; /* node id */
383 unsigned iommu_refcnt[DMAR_UNITS_SUPPORTED];
384 /* Refcount of devices per iommu */
387 u16 iommu_did[DMAR_UNITS_SUPPORTED];
388 /* Domain ids per IOMMU. Use u16 since
389 * domain ids are 16 bits wide according
390 * to VT-d spec, section 9.3 */
392 struct list_head devices; /* all devices' list */
393 struct iova_domain iovad; /* iova's that belong to this domain */
395 struct dma_pte *pgd; /* virtual address */
396 int gaw; /* max guest address width */
398 /* adjusted guest address width, 0 is level 2 30-bit */
401 int flags; /* flags to find out type of domain */
403 int iommu_coherency;/* indicate coherency of iommu access */
404 int iommu_snooping; /* indicate snooping control feature*/
405 int iommu_count; /* reference count of iommu */
406 int iommu_superpage;/* Level of superpages supported:
407 0 == 4KiB (no superpages), 1 == 2MiB,
408 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
409 u64 max_addr; /* maximum mapped address */
411 struct iommu_domain domain; /* generic domain data structure for
415 /* PCI domain-device relationship */
416 struct device_domain_info {
417 struct list_head link; /* link to domain siblings */
418 struct list_head global; /* link to global list */
419 u8 bus; /* PCI bus number */
420 u8 devfn; /* PCI devfn number */
421 struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
422 struct intel_iommu *iommu; /* IOMMU used by this device */
423 struct dmar_domain *domain; /* pointer to domain */
426 struct dmar_rmrr_unit {
427 struct list_head list; /* list of rmrr units */
428 struct acpi_dmar_header *hdr; /* ACPI header */
429 u64 base_address; /* reserved base address*/
430 u64 end_address; /* reserved end address */
431 struct dmar_dev_scope *devices; /* target devices */
432 int devices_cnt; /* target device count */
435 struct dmar_atsr_unit {
436 struct list_head list; /* list of ATSR units */
437 struct acpi_dmar_header *hdr; /* ACPI header */
438 struct dmar_dev_scope *devices; /* target devices */
439 int devices_cnt; /* target device count */
440 u8 include_all:1; /* include all ports */
443 static LIST_HEAD(dmar_atsr_units);
444 static LIST_HEAD(dmar_rmrr_units);
446 #define for_each_rmrr_units(rmrr) \
447 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
449 static void flush_unmaps_timeout(unsigned long data);
451 static DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
453 #define HIGH_WATER_MARK 250
454 struct deferred_flush_tables {
456 struct iova *iova[HIGH_WATER_MARK];
457 struct dmar_domain *domain[HIGH_WATER_MARK];
458 struct page *freelist[HIGH_WATER_MARK];
461 static struct deferred_flush_tables *deferred_flush;
463 /* bitmap for indexing intel_iommus */
464 static int g_num_of_iommus;
466 static DEFINE_SPINLOCK(async_umap_flush_lock);
467 static LIST_HEAD(unmaps_to_do);
470 static long list_size;
472 static void domain_exit(struct dmar_domain *domain);
473 static void domain_remove_dev_info(struct dmar_domain *domain);
474 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
476 static void domain_context_clear(struct intel_iommu *iommu,
478 static void __dmar_remove_one_dev_info(struct dmar_domain *domain,
480 static int domain_detach_iommu(struct dmar_domain *domain,
481 struct intel_iommu *iommu);
483 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
484 int dmar_disabled = 0;
486 int dmar_disabled = 1;
487 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
489 int intel_iommu_enabled = 0;
490 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
492 static int dmar_map_gfx = 1;
493 static int dmar_forcedac;
494 static int intel_iommu_strict;
495 static int intel_iommu_superpage = 1;
496 static int intel_iommu_ecs = 1;
498 /* We only actually use ECS when PASID support (on the new bit 40)
499 * is also advertised. Some early implementations — the ones with
500 * PASID support on bit 28 — have issues even when we *only* use
501 * extended root/context tables. */
502 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
503 ecap_pasid(iommu->ecap))
505 int intel_iommu_gfx_mapped;
506 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
508 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
509 static DEFINE_SPINLOCK(device_domain_lock);
510 static LIST_HEAD(device_domain_list);
512 static const struct iommu_ops intel_iommu_ops;
514 static bool translation_pre_enabled(struct intel_iommu *iommu)
516 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
519 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
521 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
524 static void init_translation_status(struct intel_iommu *iommu)
528 gsts = readl(iommu->reg + DMAR_GSTS_REG);
529 if (gsts & DMA_GSTS_TES)
530 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
533 /* Convert generic 'struct iommu_domain' to private struct dmar_domain */
534 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
536 return container_of(dom, struct dmar_domain, domain);
539 static int __init intel_iommu_setup(char *str)
544 if (!strncmp(str, "on", 2)) {
546 pr_info("IOMMU enabled\n");
547 } else if (!strncmp(str, "off", 3)) {
549 pr_info("IOMMU disabled\n");
550 } else if (!strncmp(str, "igfx_off", 8)) {
552 pr_info("Disable GFX device mapping\n");
553 } else if (!strncmp(str, "forcedac", 8)) {
554 pr_info("Forcing DAC for PCI devices\n");
556 } else if (!strncmp(str, "strict", 6)) {
557 pr_info("Disable batched IOTLB flush\n");
558 intel_iommu_strict = 1;
559 } else if (!strncmp(str, "sp_off", 6)) {
560 pr_info("Disable supported super page\n");
561 intel_iommu_superpage = 0;
562 } else if (!strncmp(str, "ecs_off", 7)) {
564 "Intel-IOMMU: disable extended context table support\n");
568 str += strcspn(str, ",");
574 __setup("intel_iommu=", intel_iommu_setup);
576 static struct kmem_cache *iommu_domain_cache;
577 static struct kmem_cache *iommu_devinfo_cache;
579 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
581 struct dmar_domain **domains;
584 domains = iommu->domains[idx];
588 return domains[did & 0xff];
591 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
592 struct dmar_domain *domain)
594 struct dmar_domain **domains;
597 if (!iommu->domains[idx]) {
598 size_t size = 256 * sizeof(struct dmar_domain *);
599 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
602 domains = iommu->domains[idx];
603 if (WARN_ON(!domains))
606 domains[did & 0xff] = domain;
609 static inline void *alloc_pgtable_page(int node)
614 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
616 vaddr = page_address(page);
620 static inline void free_pgtable_page(void *vaddr)
622 free_page((unsigned long)vaddr);
625 static inline void *alloc_domain_mem(void)
627 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
630 static void free_domain_mem(void *vaddr)
632 kmem_cache_free(iommu_domain_cache, vaddr);
635 static inline void * alloc_devinfo_mem(void)
637 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
640 static inline void free_devinfo_mem(void *vaddr)
642 kmem_cache_free(iommu_devinfo_cache, vaddr);
645 static inline int domain_type_is_vm(struct dmar_domain *domain)
647 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
650 static inline int domain_type_is_si(struct dmar_domain *domain)
652 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
655 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
657 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
658 DOMAIN_FLAG_STATIC_IDENTITY);
661 static inline int domain_pfn_supported(struct dmar_domain *domain,
664 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
666 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
669 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
674 sagaw = cap_sagaw(iommu->cap);
675 for (agaw = width_to_agaw(max_gaw);
677 if (test_bit(agaw, &sagaw))
685 * Calculate max SAGAW for each iommu.
687 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
689 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
693 * calculate agaw for each iommu.
694 * "SAGAW" may be different across iommus, so use a default agaw, and
695 * fall back to a smaller supported agaw for iommus that don't support the default agaw.
697 int iommu_calculate_agaw(struct intel_iommu *iommu)
699 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
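/*
 * Illustrative arithmetic: with DEFAULT_DOMAIN_ADDRESS_WIDTH of 48,
 * width_to_agaw(48) == DIV_ROUND_UP(48 - 30, 9) == 2.  If the IOMMU's
 * SAGAW field advertises bit 2 that value is used directly; otherwise
 * __iommu_calculate_agaw() steps down until it finds an agaw the
 * hardware supports.
 */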
702 /* This function only returns a single iommu in a domain */
703 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
707 /* si_domain and vm domain should not get here. */
708 BUG_ON(domain_type_is_vm_or_si(domain));
709 for_each_domain_iommu(iommu_id, domain)
712 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
715 return g_iommus[iommu_id];
718 static void domain_update_iommu_coherency(struct dmar_domain *domain)
720 struct dmar_drhd_unit *drhd;
721 struct intel_iommu *iommu;
725 domain->iommu_coherency = 1;
727 for_each_domain_iommu(i, domain) {
729 if (!ecap_coherent(g_iommus[i]->ecap)) {
730 domain->iommu_coherency = 0;
737 /* No hardware attached; use lowest common denominator */
739 for_each_active_iommu(iommu, drhd) {
740 if (!ecap_coherent(iommu->ecap)) {
741 domain->iommu_coherency = 0;
748 static int domain_update_iommu_snooping(struct intel_iommu *skip)
750 struct dmar_drhd_unit *drhd;
751 struct intel_iommu *iommu;
755 for_each_active_iommu(iommu, drhd) {
757 if (!ecap_sc_support(iommu->ecap)) {
768 static int domain_update_iommu_superpage(struct intel_iommu *skip)
770 struct dmar_drhd_unit *drhd;
771 struct intel_iommu *iommu;
774 if (!intel_iommu_superpage) {
778 /* set iommu_superpage to the smallest common denominator */
780 for_each_active_iommu(iommu, drhd) {
782 mask &= cap_super_page_val(iommu->cap);
792 /* Some capabilities may be different across iommus */
793 static void domain_update_iommu_cap(struct dmar_domain *domain)
795 domain_update_iommu_coherency(domain);
796 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
797 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
800 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
801 u8 bus, u8 devfn, int alloc)
803 struct root_entry *root = &iommu->root_entry[bus];
804 struct context_entry *context;
807 if (ecs_enabled(iommu)) {
816 context = phys_to_virt(*entry & VTD_PAGE_MASK);
818 unsigned long phy_addr;
822 context = alloc_pgtable_page(iommu->node);
826 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
827 phy_addr = virt_to_phys((void *)context);
828 *entry = phy_addr | 1;
829 __iommu_flush_cache(iommu, entry, sizeof(*entry));
831 return &context[devfn];
834 static int iommu_dummy(struct device *dev)
836 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
839 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
841 struct dmar_drhd_unit *drhd = NULL;
842 struct intel_iommu *iommu;
844 struct pci_dev *ptmp, *pdev = NULL;
848 if (iommu_dummy(dev))
851 if (dev_is_pci(dev)) {
852 pdev = to_pci_dev(dev);
853 segment = pci_domain_nr(pdev->bus);
854 } else if (has_acpi_companion(dev))
855 dev = &ACPI_COMPANION(dev)->dev;
858 for_each_active_iommu(iommu, drhd) {
859 if (pdev && segment != drhd->segment)
862 for_each_active_dev_scope(drhd->devices,
863 drhd->devices_cnt, i, tmp) {
865 *bus = drhd->devices[i].bus;
866 *devfn = drhd->devices[i].devfn;
870 if (!pdev || !dev_is_pci(tmp))
873 ptmp = to_pci_dev(tmp);
874 if (ptmp->subordinate &&
875 ptmp->subordinate->number <= pdev->bus->number &&
876 ptmp->subordinate->busn_res.end >= pdev->bus->number)
880 if (pdev && drhd->include_all) {
882 *bus = pdev->bus->number;
883 *devfn = pdev->devfn;
894 static void domain_flush_cache(struct dmar_domain *domain,
895 void *addr, int size)
897 if (!domain->iommu_coherency)
898 clflush_cache_range(addr, size);
901 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
903 struct context_entry *context;
907 spin_lock_irqsave(&iommu->lock, flags);
908 context = iommu_context_addr(iommu, bus, devfn, 0);
910 ret = context_present(context);
911 spin_unlock_irqrestore(&iommu->lock, flags);
915 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
917 struct context_entry *context;
920 spin_lock_irqsave(&iommu->lock, flags);
921 context = iommu_context_addr(iommu, bus, devfn, 0);
923 context_clear_entry(context);
924 __iommu_flush_cache(iommu, context, sizeof(*context));
926 spin_unlock_irqrestore(&iommu->lock, flags);
929 static void free_context_table(struct intel_iommu *iommu)
933 struct context_entry *context;
935 spin_lock_irqsave(&iommu->lock, flags);
936 if (!iommu->root_entry) {
939 for (i = 0; i < ROOT_ENTRY_NR; i++) {
940 context = iommu_context_addr(iommu, i, 0, 0);
942 free_pgtable_page(context);
944 if (!ecs_enabled(iommu))
947 context = iommu_context_addr(iommu, i, 0x80, 0);
949 free_pgtable_page(context);
952 free_pgtable_page(iommu->root_entry);
953 iommu->root_entry = NULL;
955 spin_unlock_irqrestore(&iommu->lock, flags);
958 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
959 unsigned long pfn, int *target_level)
961 struct dma_pte *parent, *pte = NULL;
962 int level = agaw_to_level(domain->agaw);
965 BUG_ON(!domain->pgd);
967 if (!domain_pfn_supported(domain, pfn))
968 /* Address beyond IOMMU's addressing capabilities. */
971 parent = domain->pgd;
976 offset = pfn_level_offset(pfn, level);
977 pte = &parent[offset];
978 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
980 if (level == *target_level)
983 if (!dma_pte_present(pte)) {
986 tmp_page = alloc_pgtable_page(domain->nid);
991 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
992 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
993 if (cmpxchg64(&pte->val, 0ULL, pteval))
994 /* Someone else set it while we were thinking; use theirs. */
995 free_pgtable_page(tmp_page);
997 domain_flush_cache(domain, pte, sizeof(*pte));
1002 parent = phys_to_virt(dma_pte_addr(pte));
1007 *target_level = level;
1013 /* return the pte for an address at a specific level */
1014 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1016 int level, int *large_page)
1018 struct dma_pte *parent, *pte = NULL;
1019 int total = agaw_to_level(domain->agaw);
1022 parent = domain->pgd;
1023 while (level <= total) {
1024 offset = pfn_level_offset(pfn, total);
1025 pte = &parent[offset];
1029 if (!dma_pte_present(pte)) {
1030 *large_page = total;
1034 if (dma_pte_superpage(pte)) {
1035 *large_page = total;
1039 parent = phys_to_virt(dma_pte_addr(pte));
1045 /* clear last level pte; a tlb flush should follow */
1046 static void dma_pte_clear_range(struct dmar_domain *domain,
1047 unsigned long start_pfn,
1048 unsigned long last_pfn)
1050 unsigned int large_page = 1;
1051 struct dma_pte *first_pte, *pte;
1053 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1054 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1055 BUG_ON(start_pfn > last_pfn);
1057 /* we don't need lock here; nobody else touches the iova range */
1060 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1062 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1067 start_pfn += lvl_to_nr_pages(large_page);
1069 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1071 domain_flush_cache(domain, first_pte,
1072 (void *)pte - (void *)first_pte);
1074 } while (start_pfn && start_pfn <= last_pfn);
1077 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1078 struct dma_pte *pte, unsigned long pfn,
1079 unsigned long start_pfn, unsigned long last_pfn)
1081 pfn = max(start_pfn, pfn);
1082 pte = &pte[pfn_level_offset(pfn, level)];
1085 unsigned long level_pfn;
1086 struct dma_pte *level_pte;
1088 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1091 level_pfn = pfn & level_mask(level - 1);
1092 level_pte = phys_to_virt(dma_pte_addr(pte));
1095 dma_pte_free_level(domain, level - 1, level_pte,
1096 level_pfn, start_pfn, last_pfn);
1098 /* If range covers entire pagetable, free it */
1099 if (!(start_pfn > level_pfn ||
1100 last_pfn < level_pfn + level_size(level) - 1)) {
1102 domain_flush_cache(domain, pte, sizeof(*pte));
1103 free_pgtable_page(level_pte);
1106 pfn += level_size(level);
1107 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1110 /* free page table pages. last level pte should already be cleared */
1111 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1112 unsigned long start_pfn,
1113 unsigned long last_pfn)
1115 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1116 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1117 BUG_ON(start_pfn > last_pfn);
1119 dma_pte_clear_range(domain, start_pfn, last_pfn);
1121 /* We don't need lock here; nobody else touches the iova range */
1122 dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1123 domain->pgd, 0, start_pfn, last_pfn);
1126 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1127 free_pgtable_page(domain->pgd);
1132 /* When a page at a given level is being unlinked from its parent, we don't
1133 need to *modify* it at all. All we need to do is make a list of all the
1134 pages which can be freed just as soon as we've flushed the IOTLB and we
1135 know the hardware page-walk will no longer touch them.
1136 The 'pte' argument is the *parent* PTE, pointing to the page that is to be freed.
1138 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1139 int level, struct dma_pte *pte,
1140 struct page *freelist)
1144 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1145 pg->freelist = freelist;
1151 pte = page_address(pg);
1153 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1154 freelist = dma_pte_list_pagetables(domain, level - 1,
1157 } while (!first_pte_in_page(pte));
1162 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1163 struct dma_pte *pte, unsigned long pfn,
1164 unsigned long start_pfn,
1165 unsigned long last_pfn,
1166 struct page *freelist)
1168 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1170 pfn = max(start_pfn, pfn);
1171 pte = &pte[pfn_level_offset(pfn, level)];
1174 unsigned long level_pfn;
1176 if (!dma_pte_present(pte))
1179 level_pfn = pfn & level_mask(level);
1181 /* If range covers entire pagetable, free it */
1182 if (start_pfn <= level_pfn &&
1183 last_pfn >= level_pfn + level_size(level) - 1) {
1184 /* These subordinate page tables are going away entirely. Don't
1185 bother to clear them; we're just going to *free* them. */
1186 if (level > 1 && !dma_pte_superpage(pte))
1187 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1193 } else if (level > 1) {
1194 /* Recurse down into a level that isn't *entirely* obsolete */
1195 freelist = dma_pte_clear_level(domain, level - 1,
1196 phys_to_virt(dma_pte_addr(pte)),
1197 level_pfn, start_pfn, last_pfn,
1201 pfn += level_size(level);
1202 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1205 domain_flush_cache(domain, first_pte,
1206 (void *)++last_pte - (void *)first_pte);
1211 /* We can't just free the pages because the IOMMU may still be walking
1212 the page tables, and may have cached the intermediate levels. The
1213 pages can only be freed after the IOTLB flush has been done. */
1214 struct page *domain_unmap(struct dmar_domain *domain,
1215 unsigned long start_pfn,
1216 unsigned long last_pfn)
1218 struct page *freelist = NULL;
1220 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1221 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1222 BUG_ON(start_pfn > last_pfn);
1224 /* we don't need lock here; nobody else touches the iova range */
1225 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1226 domain->pgd, 0, start_pfn, last_pfn, NULL);
1229 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1230 struct page *pgd_page = virt_to_page(domain->pgd);
1231 pgd_page->freelist = freelist;
1232 freelist = pgd_page;
1240 void dma_free_pagelist(struct page *freelist)
1244 while ((pg = freelist)) {
1245 freelist = pg->freelist;
1246 free_pgtable_page(page_address(pg));
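/*
 * Typical (illustrative) usage of the two helpers above, assuming the
 * caller performs the IOTLB invalidation in between, as the comment
 * before domain_unmap() requires:
 *
 *	freelist = domain_unmap(domain, start_pfn, last_pfn);
 *	... invalidate the IOTLB for the unmapped range ...
 *	dma_free_pagelist(freelist);
 */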
1250 /* iommu handling */
1251 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1253 struct root_entry *root;
1254 unsigned long flags;
1256 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1258 pr_err("Allocating root entry for %s failed\n",
1263 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1265 spin_lock_irqsave(&iommu->lock, flags);
1266 iommu->root_entry = root;
1267 spin_unlock_irqrestore(&iommu->lock, flags);
1272 static void iommu_set_root_entry(struct intel_iommu *iommu)
1278 addr = virt_to_phys(iommu->root_entry);
1279 if (ecs_enabled(iommu))
1280 addr |= DMA_RTADDR_RTT;
1282 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1283 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1285 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1287 /* Make sure hardware completes it */
1288 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1289 readl, (sts & DMA_GSTS_RTPS), sts);
1291 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1294 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1299 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1302 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1303 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1305 /* Make sure hardware completes it */
1306 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1307 readl, (!(val & DMA_GSTS_WBFS)), val);
1309 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1312 /* return value determines whether we need a write buffer flush */
1313 static void __iommu_flush_context(struct intel_iommu *iommu,
1314 u16 did, u16 source_id, u8 function_mask,
1321 case DMA_CCMD_GLOBAL_INVL:
1322 val = DMA_CCMD_GLOBAL_INVL;
1324 case DMA_CCMD_DOMAIN_INVL:
1325 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1327 case DMA_CCMD_DEVICE_INVL:
1328 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1329 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1334 val |= DMA_CCMD_ICC;
1336 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1337 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1339 /* Make sure hardware completes it */
1340 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1341 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1343 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1346 /* return value determines whether we need a write buffer flush */
1347 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1348 u64 addr, unsigned int size_order, u64 type)
1350 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1351 u64 val = 0, val_iva = 0;
1355 case DMA_TLB_GLOBAL_FLUSH:
1356 /* global flush doesn't need to set IVA_REG */
1357 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1359 case DMA_TLB_DSI_FLUSH:
1360 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1362 case DMA_TLB_PSI_FLUSH:
1363 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1364 /* IH bit is passed in as part of address */
1365 val_iva = size_order | addr;
1370 /* Note: set drain read/write */
1373 * This is probably meant to be extra secure. It looks like we can
1374 * ignore it without any impact.
1376 if (cap_read_drain(iommu->cap))
1377 val |= DMA_TLB_READ_DRAIN;
1379 if (cap_write_drain(iommu->cap))
1380 val |= DMA_TLB_WRITE_DRAIN;
1382 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1383 /* Note: Only uses first TLB reg currently */
1385 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1386 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1388 /* Make sure hardware completes it */
1389 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1390 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1392 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1394 /* check IOTLB invalidation granularity */
1395 if (DMA_TLB_IAIG(val) == 0)
1396 pr_err("Flush IOTLB failed\n");
1397 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1398 pr_debug("TLB flush request %Lx, actual %Lx\n",
1399 (unsigned long long)DMA_TLB_IIRG(type),
1400 (unsigned long long)DMA_TLB_IAIG(val));
1403 static struct device_domain_info *
1404 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1408 struct device_domain_info *info;
1409 struct pci_dev *pdev;
1411 assert_spin_locked(&device_domain_lock);
1413 if (!ecap_dev_iotlb_support(iommu->ecap))
1419 list_for_each_entry(info, &domain->devices, link)
1420 if (info->iommu == iommu && info->bus == bus &&
1421 info->devfn == devfn) {
1426 if (!found || !info->dev || !dev_is_pci(info->dev))
1429 pdev = to_pci_dev(info->dev);
1431 if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1434 if (!dmar_find_matched_atsr_unit(pdev))
1440 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1442 if (!info || !dev_is_pci(info->dev))
1445 pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1448 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1450 if (!info->dev || !dev_is_pci(info->dev) ||
1451 !pci_ats_enabled(to_pci_dev(info->dev)))
1454 pci_disable_ats(to_pci_dev(info->dev));
1457 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1458 u64 addr, unsigned mask)
1461 unsigned long flags;
1462 struct device_domain_info *info;
1464 spin_lock_irqsave(&device_domain_lock, flags);
1465 list_for_each_entry(info, &domain->devices, link) {
1466 struct pci_dev *pdev;
1467 if (!info->dev || !dev_is_pci(info->dev))
1470 pdev = to_pci_dev(info->dev);
1471 if (!pci_ats_enabled(pdev))
1474 sid = info->bus << 8 | info->devfn;
1475 qdep = pci_ats_queue_depth(pdev);
1476 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1478 spin_unlock_irqrestore(&device_domain_lock, flags);
1481 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1482 struct dmar_domain *domain,
1483 unsigned long pfn, unsigned int pages,
1486 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1487 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1488 u16 did = domain->iommu_did[iommu->seq_id];
1495 * Fall back to domain selective flush if there is no PSI support, or if
1497 * the size is too big. PSI requires the page size to be 2 ^ x, and the
1498 * base address to be naturally aligned to the size
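 * (illustrative arithmetic: a request for pages == 3 gives
 * mask == ilog2(__roundup_pow_of_two(3)) == 2, i.e. a 4-page, 16KiB
 * invalidation).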
1500 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1501 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1504 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1508 * In caching mode, changes of pages from non-present to present require
1509 * a flush. However, the device IOTLB does not need to be flushed in this case.
1511 if (!cap_caching_mode(iommu->cap) || !map)
1512 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1516 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1519 unsigned long flags;
1521 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1522 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1523 pmen &= ~DMA_PMEN_EPM;
1524 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1526 /* wait for the protected region status bit to clear */
1527 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1528 readl, !(pmen & DMA_PMEN_PRS), pmen);
1530 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1533 static void iommu_enable_translation(struct intel_iommu *iommu)
1536 unsigned long flags;
1538 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1539 iommu->gcmd |= DMA_GCMD_TE;
1540 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1542 /* Make sure hardware completes it */
1543 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1544 readl, (sts & DMA_GSTS_TES), sts);
1546 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1549 static void iommu_disable_translation(struct intel_iommu *iommu)
1554 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1555 iommu->gcmd &= ~DMA_GCMD_TE;
1556 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1558 /* Make sure hardware completes it */
1559 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1560 readl, (!(sts & DMA_GSTS_TES)), sts);
1562 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1566 static int iommu_init_domains(struct intel_iommu *iommu)
1568 u32 ndomains, nlongs;
1571 ndomains = cap_ndoms(iommu->cap);
1572 pr_debug("%s: Number of Domains supported <%d>\n",
1573 iommu->name, ndomains);
1574 nlongs = BITS_TO_LONGS(ndomains);
1576 spin_lock_init(&iommu->lock);
1578 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1579 if (!iommu->domain_ids) {
1580 pr_err("%s: Allocating domain id array failed\n",
1585 size = ((ndomains >> 8) + 1) * sizeof(struct dmar_domain **);
1586 iommu->domains = kzalloc(size, GFP_KERNEL);
1588 if (iommu->domains) {
1589 size = 256 * sizeof(struct dmar_domain *);
1590 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1593 if (!iommu->domains || !iommu->domains[0]) {
1594 pr_err("%s: Allocating domain array failed\n",
1596 kfree(iommu->domain_ids);
1597 kfree(iommu->domains);
1598 iommu->domain_ids = NULL;
1599 iommu->domains = NULL;
1606 * If Caching mode is set, then invalid translations are tagged
1607 * with domain-id 0, hence we need to pre-allocate it. We also
1608 * use domain-id 0 as a marker for non-allocated domain-id, so
1609 * make sure it is not used for a real domain.
1611 set_bit(0, iommu->domain_ids);
1616 static void disable_dmar_iommu(struct intel_iommu *iommu)
1618 struct device_domain_info *info, *tmp;
1619 unsigned long flags;
1621 if (!iommu->domains || !iommu->domain_ids)
1624 spin_lock_irqsave(&device_domain_lock, flags);
1625 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1626 struct dmar_domain *domain;
1628 if (info->iommu != iommu)
1631 if (!info->dev || !info->domain)
1634 domain = info->domain;
1636 dmar_remove_one_dev_info(domain, info->dev);
1638 if (!domain_type_is_vm_or_si(domain))
1639 domain_exit(domain);
1641 spin_unlock_irqrestore(&device_domain_lock, flags);
1643 if (iommu->gcmd & DMA_GCMD_TE)
1644 iommu_disable_translation(iommu);
1647 static void free_dmar_iommu(struct intel_iommu *iommu)
1649 if ((iommu->domains) && (iommu->domain_ids)) {
1650 int elems = (cap_ndoms(iommu->cap) >> 8) + 1;
1653 for (i = 0; i < elems; i++)
1654 kfree(iommu->domains[i]);
1655 kfree(iommu->domains);
1656 kfree(iommu->domain_ids);
1657 iommu->domains = NULL;
1658 iommu->domain_ids = NULL;
1661 g_iommus[iommu->seq_id] = NULL;
1663 /* free context mapping */
1664 free_context_table(iommu);
1667 static struct dmar_domain *alloc_domain(int flags)
1669 struct dmar_domain *domain;
1671 domain = alloc_domain_mem();
1675 memset(domain, 0, sizeof(*domain));
1677 domain->flags = flags;
1678 INIT_LIST_HEAD(&domain->devices);
1683 /* Must be called with iommu->lock */
1684 static int domain_attach_iommu(struct dmar_domain *domain,
1685 struct intel_iommu *iommu)
1687 unsigned long ndomains;
1690 assert_spin_locked(&device_domain_lock);
1691 assert_spin_locked(&iommu->lock);
1693 domain->iommu_refcnt[iommu->seq_id] += 1;
1694 domain->iommu_count += 1;
1695 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1696 ndomains = cap_ndoms(iommu->cap);
1697 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1699 if (num >= ndomains) {
1700 pr_err("%s: No free domain ids\n", iommu->name);
1701 domain->iommu_refcnt[iommu->seq_id] -= 1;
1702 domain->iommu_count -= 1;
1706 set_bit(num, iommu->domain_ids);
1707 set_iommu_domain(iommu, num, domain);
1709 domain->iommu_did[iommu->seq_id] = num;
1710 domain->nid = iommu->node;
1712 domain_update_iommu_cap(domain);
1718 static int domain_detach_iommu(struct dmar_domain *domain,
1719 struct intel_iommu *iommu)
1721 int num, count = INT_MAX;
1723 assert_spin_locked(&device_domain_lock);
1724 assert_spin_locked(&iommu->lock);
1726 domain->iommu_refcnt[iommu->seq_id] -= 1;
1727 count = --domain->iommu_count;
1728 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1729 num = domain->iommu_did[iommu->seq_id];
1730 clear_bit(num, iommu->domain_ids);
1731 set_iommu_domain(iommu, num, NULL);
1733 domain_update_iommu_cap(domain);
1734 domain->iommu_did[iommu->seq_id] = 0;
1740 static struct iova_domain reserved_iova_list;
1741 static struct lock_class_key reserved_rbtree_key;
1743 static int dmar_init_reserved_ranges(void)
1745 struct pci_dev *pdev = NULL;
1749 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1752 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1753 &reserved_rbtree_key);
1755 /* IOAPIC ranges shouldn't be accessed by DMA */
1756 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1757 IOVA_PFN(IOAPIC_RANGE_END));
1759 pr_err("Reserve IOAPIC range failed\n");
1763 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1764 for_each_pci_dev(pdev) {
1767 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1768 r = &pdev->resource[i];
1769 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1771 iova = reserve_iova(&reserved_iova_list,
1775 pr_err("Reserve iova failed\n");
1783 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1785 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1788 static inline int guestwidth_to_adjustwidth(int gaw)
1791 int r = (gaw - 12) % 9;
1802 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1805 int adjust_width, agaw;
1806 unsigned long sagaw;
1808 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1810 domain_reserve_special_ranges(domain);
1812 /* calculate AGAW */
1813 if (guest_width > cap_mgaw(iommu->cap))
1814 guest_width = cap_mgaw(iommu->cap);
1815 domain->gaw = guest_width;
1816 adjust_width = guestwidth_to_adjustwidth(guest_width);
1817 agaw = width_to_agaw(adjust_width);
1818 sagaw = cap_sagaw(iommu->cap);
1819 if (!test_bit(agaw, &sagaw)) {
1820 /* hardware doesn't support it, choose a bigger one */
1821 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1822 agaw = find_next_bit(&sagaw, 5, agaw);
1826 domain->agaw = agaw;
1828 if (ecap_coherent(iommu->ecap))
1829 domain->iommu_coherency = 1;
1831 domain->iommu_coherency = 0;
1833 if (ecap_sc_support(iommu->ecap))
1834 domain->iommu_snooping = 1;
1836 domain->iommu_snooping = 0;
1838 if (intel_iommu_superpage)
1839 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1841 domain->iommu_superpage = 0;
1843 domain->nid = iommu->node;
1845 /* always allocate the top pgd */
1846 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1849 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1853 static void domain_exit(struct dmar_domain *domain)
1855 struct page *freelist = NULL;
1857 /* Domain 0 is reserved, so don't process it */
1861 /* Flush any lazy unmaps that may reference this domain */
1862 if (!intel_iommu_strict)
1863 flush_unmaps_timeout(0);
1865 /* Remove associated devices and clear attached or cached domains */
1867 domain_remove_dev_info(domain);
1871 put_iova_domain(&domain->iovad);
1873 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1875 dma_free_pagelist(freelist);
1877 free_domain_mem(domain);
1880 static int domain_context_mapping_one(struct dmar_domain *domain,
1881 struct intel_iommu *iommu,
1884 u16 did = domain->iommu_did[iommu->seq_id];
1885 int translation = CONTEXT_TT_MULTI_LEVEL;
1886 struct device_domain_info *info = NULL;
1887 struct context_entry *context;
1888 unsigned long flags;
1889 struct dma_pte *pgd;
1894 if (hw_pass_through && domain_type_is_si(domain))
1895 translation = CONTEXT_TT_PASS_THROUGH;
1897 pr_debug("Set context mapping for %02x:%02x.%d\n",
1898 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1900 BUG_ON(!domain->pgd);
1902 spin_lock_irqsave(&device_domain_lock, flags);
1903 spin_lock(&iommu->lock);
1906 context = iommu_context_addr(iommu, bus, devfn, 1);
1911 if (context_present(context))
1916 context_clear_entry(context);
1917 context_set_domain_id(context, did);
1920 * Skip top levels of page tables for an iommu which has a smaller agaw
1921 * than the default. Unnecessary for PT mode.
1923 if (translation != CONTEXT_TT_PASS_THROUGH) {
1924 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1926 pgd = phys_to_virt(dma_pte_addr(pgd));
1927 if (!dma_pte_present(pgd))
1931 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1932 translation = info ? CONTEXT_TT_DEV_IOTLB :
1933 CONTEXT_TT_MULTI_LEVEL;
1935 context_set_address_root(context, virt_to_phys(pgd));
1936 context_set_address_width(context, iommu->agaw);
1939 * In pass through mode, AW must be programmed to
1940 * indicate the largest AGAW value supported by
1941 * hardware. And ASR is ignored by hardware.
1943 context_set_address_width(context, iommu->msagaw);
1946 context_set_translation_type(context, translation);
1947 context_set_fault_enable(context);
1948 context_set_present(context);
1949 domain_flush_cache(domain, context, sizeof(*context));
1952 * It's a non-present to present mapping. If hardware doesn't cache
1953 * non-present entries we only need to flush the write-buffer. If it
1954 * _does_ cache non-present entries, then it does so in the special
1955 * domain #0, which we have to flush:
1957 if (cap_caching_mode(iommu->cap)) {
1958 iommu->flush.flush_context(iommu, 0,
1959 (((u16)bus) << 8) | devfn,
1960 DMA_CCMD_MASK_NOBIT,
1961 DMA_CCMD_DEVICE_INVL);
1962 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1964 iommu_flush_write_buffer(iommu);
1966 iommu_enable_dev_iotlb(info);
1971 spin_unlock(&iommu->lock);
1972 spin_unlock_irqrestore(&device_domain_lock, flags);
1977 struct domain_context_mapping_data {
1978 struct dmar_domain *domain;
1979 struct intel_iommu *iommu;
1982 static int domain_context_mapping_cb(struct pci_dev *pdev,
1983 u16 alias, void *opaque)
1985 struct domain_context_mapping_data *data = opaque;
1987 return domain_context_mapping_one(data->domain, data->iommu,
1988 PCI_BUS_NUM(alias), alias & 0xff);
1992 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1994 struct intel_iommu *iommu;
1996 struct domain_context_mapping_data data;
1998 iommu = device_to_iommu(dev, &bus, &devfn);
2002 if (!dev_is_pci(dev))
2003 return domain_context_mapping_one(domain, iommu, bus, devfn);
2005 data.domain = domain;
2008 return pci_for_each_dma_alias(to_pci_dev(dev),
2009 &domain_context_mapping_cb, &data);
2012 static int domain_context_mapped_cb(struct pci_dev *pdev,
2013 u16 alias, void *opaque)
2015 struct intel_iommu *iommu = opaque;
2017 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2020 static int domain_context_mapped(struct device *dev)
2022 struct intel_iommu *iommu;
2025 iommu = device_to_iommu(dev, &bus, &devfn);
2029 if (!dev_is_pci(dev))
2030 return device_context_mapped(iommu, bus, devfn);
2032 return !pci_for_each_dma_alias(to_pci_dev(dev),
2033 domain_context_mapped_cb, iommu);
2036 /* Returns a number of VTD pages, but aligned to MM page size */
2037 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2040 host_addr &= ~PAGE_MASK;
2041 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
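/*
 * Worked example (illustrative): host_addr == 0x1800 and size == 0x2000
 * with 4KiB MM pages gives (0x800 + 0x2000) rounded up to 0x3000, i.e.
 * three VT-d pages, so the partial pages at both ends are counted.
 */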
2044 /* Return largest possible superpage level for a given mapping */
2045 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2046 unsigned long iov_pfn,
2047 unsigned long phy_pfn,
2048 unsigned long pages)
2050 int support, level = 1;
2051 unsigned long pfnmerge;
2053 support = domain->iommu_superpage;
2055 /* To use a large page, the virtual *and* physical addresses
2056 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2057 of them will mean we have to use smaller pages. So just
2058 merge them and check both at once. */
2059 pfnmerge = iov_pfn | phy_pfn;
2061 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2062 pages >>= VTD_STRIDE_SHIFT;
2065 pfnmerge >>= VTD_STRIDE_SHIFT;
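/*
 * Illustrative example: mapping 512 pages at iov_pfn 0x200 to phy_pfn
 * 0x1400 leaves the low 9 bits of the merged pfn clear, so the loop
 * above promotes the mapping to level 2 (a 2MiB superpage); a single
 * misaligned bit in either pfn would keep it at level 1 (4KiB pages).
 */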
2072 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2073 struct scatterlist *sg, unsigned long phys_pfn,
2074 unsigned long nr_pages, int prot)
2076 struct dma_pte *first_pte = NULL, *pte = NULL;
2077 phys_addr_t uninitialized_var(pteval);
2078 unsigned long sg_res = 0;
2079 unsigned int largepage_lvl = 0;
2080 unsigned long lvl_pages = 0;
2082 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2084 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2087 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2091 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2094 while (nr_pages > 0) {
2098 sg_res = aligned_nrpages(sg->offset, sg->length);
2099 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2100 sg->dma_length = sg->length;
2101 pteval = page_to_phys(sg_page(sg)) | prot;
2102 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2106 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2108 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2111 /* It is a large page */
2112 if (largepage_lvl > 1) {
2113 pteval |= DMA_PTE_LARGE_PAGE;
2114 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2116 * Ensure that old small page tables are
2117 * removed to make room for superpage,
2120 dma_pte_free_pagetable(domain, iov_pfn,
2121 iov_pfn + lvl_pages - 1);
2123 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2127 /* We don't need lock here, nobody else
2128 * touches the iova range
2130 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2132 static int dumps = 5;
2133 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2134 iov_pfn, tmp, (unsigned long long)pteval);
2137 debug_dma_dump_mappings(NULL);
2142 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2144 BUG_ON(nr_pages < lvl_pages);
2145 BUG_ON(sg_res < lvl_pages);
2147 nr_pages -= lvl_pages;
2148 iov_pfn += lvl_pages;
2149 phys_pfn += lvl_pages;
2150 pteval += lvl_pages * VTD_PAGE_SIZE;
2151 sg_res -= lvl_pages;
2153 /* If the next PTE would be the first in a new page, then we
2154 need to flush the cache on the entries we've just written.
2155 And then we'll need to recalculate 'pte', so clear it and
2156 let it get set again in the if (!pte) block above.
2158 If we're done (!nr_pages) we need to flush the cache too.
2160 Also if we've been setting superpages, we may need to
2161 recalculate 'pte' and switch back to smaller pages for the
2162 end of the mapping, if the trailing size is not enough to
2163 use another superpage (i.e. sg_res < lvl_pages). */
2165 if (!nr_pages || first_pte_in_page(pte) ||
2166 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2167 domain_flush_cache(domain, first_pte,
2168 (void *)pte - (void *)first_pte);
2172 if (!sg_res && nr_pages)
2178 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2179 struct scatterlist *sg, unsigned long nr_pages,
2182 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2185 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2186 unsigned long phys_pfn, unsigned long nr_pages,
2189 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2192 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2197 clear_context_table(iommu, bus, devfn);
2198 iommu->flush.flush_context(iommu, 0, 0, 0,
2199 DMA_CCMD_GLOBAL_INVL);
2200 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2203 static inline void unlink_domain_info(struct device_domain_info *info)
2205 assert_spin_locked(&device_domain_lock);
2206 list_del(&info->link);
2207 list_del(&info->global);
2209 info->dev->archdata.iommu = NULL;
2212 static void domain_remove_dev_info(struct dmar_domain *domain)
2214 struct device_domain_info *info, *tmp;
2215 unsigned long flags;
2217 spin_lock_irqsave(&device_domain_lock, flags);
2218 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2219 __dmar_remove_one_dev_info(domain, info->dev);
2220 spin_unlock_irqrestore(&device_domain_lock, flags);
2225 * Note: we use struct device->archdata.iommu to store the info
2227 static struct dmar_domain *find_domain(struct device *dev)
2229 struct device_domain_info *info;
2231 /* No lock here, assumes no domain exit in normal case */
2232 info = dev->archdata.iommu;
2234 return info->domain;
2238 static inline struct device_domain_info *
2239 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2241 struct device_domain_info *info;
2243 list_for_each_entry(info, &device_domain_list, global)
2244 if (info->iommu->segment == segment && info->bus == bus &&
2245 info->devfn == devfn)
2251 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2254 struct dmar_domain *domain)
2256 struct dmar_domain *found = NULL;
2257 struct device_domain_info *info;
2258 unsigned long flags;
2261 info = alloc_devinfo_mem();
2266 info->devfn = devfn;
2268 info->domain = domain;
2269 info->iommu = iommu;
2271 spin_lock_irqsave(&device_domain_lock, flags);
2273 found = find_domain(dev);
2275 struct device_domain_info *info2;
2276 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2278 found = info2->domain;
2281 spin_unlock_irqrestore(&device_domain_lock, flags);
2282 free_devinfo_mem(info);
2283 /* Caller must free the original domain */
2287 spin_lock(&iommu->lock);
2288 ret = domain_attach_iommu(domain, iommu);
2289 spin_unlock(&iommu->lock);
2292 spin_unlock_irqrestore(&device_domain_lock, flags);
2296 list_add(&info->link, &domain->devices);
2297 list_add(&info->global, &device_domain_list);
2299 dev->archdata.iommu = info;
2300 spin_unlock_irqrestore(&device_domain_lock, flags);
2302 if (dev && domain_context_mapping(domain, dev)) {
2303 pr_err("Domain context map for %s failed\n", dev_name(dev));
2304 dmar_remove_one_dev_info(domain, dev);
2311 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2313 *(u16 *)opaque = alias;
2317 /* domain is initialized */
2318 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2320 struct device_domain_info *info = NULL;
2321 struct dmar_domain *domain, *tmp;
2322 struct intel_iommu *iommu;
2323 unsigned long flags;
2327 domain = find_domain(dev);
2331 iommu = device_to_iommu(dev, &bus, &devfn);
2335 if (dev_is_pci(dev)) {
2336 struct pci_dev *pdev = to_pci_dev(dev);
2338 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2340 spin_lock_irqsave(&device_domain_lock, flags);
2341 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2342 PCI_BUS_NUM(dma_alias),
2345 iommu = info->iommu;
2346 domain = info->domain;
2348 spin_unlock_irqrestore(&device_domain_lock, flags);
2350 /* DMA alias already has a domain, use it */
2355 /* Allocate and initialize new domain for the device */
2356 domain = alloc_domain(0);
2359 if (domain_init(domain, iommu, gaw)) {
2360 domain_exit(domain);
2364 /* register PCI DMA alias device */
2365 if (dev_is_pci(dev)) {
2366 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2367 dma_alias & 0xff, NULL, domain);
2369 if (!tmp || tmp != domain) {
2370 domain_exit(domain);
2379 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2381 if (!tmp || tmp != domain) {
2382 domain_exit(domain);
2389 static int iommu_identity_mapping;
2390 #define IDENTMAP_ALL 1
2391 #define IDENTMAP_GFX 2
2392 #define IDENTMAP_AZALIA 4
2394 static int iommu_domain_identity_map(struct dmar_domain *domain,
2395 unsigned long long start,
2396 unsigned long long end)
2398 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2399 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2401 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2402 dma_to_mm_pfn(last_vpfn))) {
2403 pr_err("Reserving iova failed\n");
2407 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2409 * RMRR range might have overlap with the physical memory range; clear it first.
2412 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2414 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2415 last_vpfn - first_vpfn + 1,
2416 DMA_PTE_READ|DMA_PTE_WRITE);
2419 static int iommu_prepare_identity_map(struct device *dev,
2420 unsigned long long start,
2421 unsigned long long end)
2423 struct dmar_domain *domain;
2426 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2430 /* For _hardware_ passthrough, don't bother. But for software
2431 passthrough, we do it anyway -- it may indicate a memory
2432 range which is reserved in E820 and therefore didn't get set
2433 up in si_domain to start with */
2434 if (domain == si_domain && hw_pass_through) {
2435 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2436 dev_name(dev), start, end);
2440 pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2441 dev_name(dev), start, end);
2444 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2445 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2446 dmi_get_system_info(DMI_BIOS_VENDOR),
2447 dmi_get_system_info(DMI_BIOS_VERSION),
2448 dmi_get_system_info(DMI_PRODUCT_VERSION));
2453 if (end >> agaw_to_width(domain->agaw)) {
2454 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2455 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2456 agaw_to_width(domain->agaw),
2457 dmi_get_system_info(DMI_BIOS_VENDOR),
2458 dmi_get_system_info(DMI_BIOS_VERSION),
2459 dmi_get_system_info(DMI_PRODUCT_VERSION));
2464 ret = iommu_domain_identity_map(domain, start, end);
2471 domain_exit(domain);
2475 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2478 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2480 return iommu_prepare_identity_map(dev, rmrr->base_address,
2484 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2485 static inline void iommu_prepare_isa(void)
2487 struct pci_dev *pdev;
2490 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2494 pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2495 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2498 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2503 static inline void iommu_prepare_isa(void)
2507 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2509 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2511 static int __init si_domain_init(int hw)
2515 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2519 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2520 domain_exit(si_domain);
2524 pr_debug("Identity mapping domain allocated\n");
2529 for_each_online_node(nid) {
2530 unsigned long start_pfn, end_pfn;
2533 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2534 ret = iommu_domain_identity_map(si_domain,
2535 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2544 static int identity_mapping(struct device *dev)
2546 struct device_domain_info *info;
2548 if (likely(!iommu_identity_mapping))
2551 info = dev->archdata.iommu;
2552 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2553 return (info->domain == si_domain);
2558 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2560 struct dmar_domain *ndomain;
2561 struct intel_iommu *iommu;
2564 iommu = device_to_iommu(dev, &bus, &devfn);
2568 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2569 if (ndomain != domain)
2575 static bool device_has_rmrr(struct device *dev)
2577 struct dmar_rmrr_unit *rmrr;
2582 for_each_rmrr_units(rmrr) {
2584 * Return TRUE if this RMRR contains the device that
2587 for_each_active_dev_scope(rmrr->devices,
2588 rmrr->devices_cnt, i, tmp)
2599 * There are a couple cases where we need to restrict the functionality of
2600 * devices associated with RMRRs. The first is when evaluating a device for
2601 * identity mapping because problems exist when devices are moved in and out
2602 * of domains and their respective RMRR information is lost. This means that
2603 * a device with associated RMRRs will never be in a "passthrough" domain.
2604 * The second is use of the device through the IOMMU API. This interface
2605 * expects to have full control of the IOVA space for the device. We cannot
2606 * satisfy both the requirement that RMRR access is maintained and have an
2607 * unencumbered IOVA space. We also have no ability to quiesce the device's
2608 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2609 * We therefore prevent devices associated with an RMRR from participating in
2610 * the IOMMU API, which eliminates them from device assignment.
2612 * In both cases we assume that PCI USB devices with RMRRs have them largely
2613 * for historical reasons and that the RMRR space is not actively used post
2614 * boot. This exclusion may change if vendors begin to abuse it.
2616 * The same exception is made for graphics devices, with the requirement that
2617 * any use of the RMRR regions will be torn down before assigning the device
2620 static bool device_is_rmrr_locked(struct device *dev)
2622 if (!device_has_rmrr(dev))
2625 if (dev_is_pci(dev)) {
2626 struct pci_dev *pdev = to_pci_dev(dev);
2628 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2635 static int iommu_should_identity_map(struct device *dev, int startup)
2638 if (dev_is_pci(dev)) {
2639 struct pci_dev *pdev = to_pci_dev(dev);
2641 if (device_is_rmrr_locked(dev))
2644 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2647 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2650 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2654 * We want to start off with all devices in the 1:1 domain, and
2655 * take them out later if we find they can't access all of memory.
2657 * However, we can't do this for PCI devices behind bridges,
2658 * because all PCI devices behind the same bridge will end up
2659 * with the same source-id on their transactions.
2661 * Practically speaking, we can't change things around for these
2662 * devices at run-time, because we can't be sure there'll be no
2663 * DMA transactions in flight for any of their siblings.
2665 * So PCI devices (unless they're on the root bus) as well as
2666 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2667 * the 1:1 domain, just in _case_ one of their siblings turns out
2668 * not to be able to map all of memory.
2670 if (!pci_is_pcie(pdev)) {
2671 if (!pci_is_root_bus(pdev->bus))
2673 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2675 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2678 if (device_has_rmrr(dev))
2683 * At boot time, we don't yet know if devices will be 64-bit capable.
2684 * Assume that they will — if they turn out not to be, then we can
2685 * take them out of the 1:1 domain later.
2689 * If the device's dma_mask is less than the system's memory
2690 * size then this is not a candidate for identity mapping.
2692 u64 dma_mask = *dev->dma_mask;
2694 if (dev->coherent_dma_mask &&
2695 dev->coherent_dma_mask < dma_mask)
2696 dma_mask = dev->coherent_dma_mask;
2698 return dma_mask >= dma_get_required_mask(dev);
2704 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2708 if (!iommu_should_identity_map(dev, 1))
2711 ret = domain_add_dev_info(si_domain, dev);
2713 pr_info("%s identity mapping for device %s\n",
2714 hw ? "Hardware" : "Software", dev_name(dev));
2715 else if (ret == -ENODEV)
2716 /* device not associated with an iommu */
2723 static int __init iommu_prepare_static_identity_mapping(int hw)
2725 struct pci_dev *pdev = NULL;
2726 struct dmar_drhd_unit *drhd;
2727 struct intel_iommu *iommu;
2732 for_each_pci_dev(pdev) {
2733 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2738 for_each_active_iommu(iommu, drhd)
2739 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2740 struct acpi_device_physical_node *pn;
2741 struct acpi_device *adev;
2743 if (dev->bus != &acpi_bus_type)
2746 adev = to_acpi_device(dev);
2747 mutex_lock(&adev->physical_node_lock);
2748 list_for_each_entry(pn, &adev->physical_node_list, node) {
2749 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2753 mutex_unlock(&adev->physical_node_lock);
2761 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2764 * Start from a sane iommu hardware state.
2765 * If queued invalidation was already initialized by us
2766 * (for example, while enabling interrupt remapping) then
2767 * things are already rolling from a sane state.
2771 * Clear any previous faults.
2773 dmar_fault(-1, iommu);
2775 * Disable queued invalidation if supported and already enabled
2776 * before OS handover.
2778 dmar_disable_qi(iommu);
2781 if (dmar_enable_qi(iommu)) {
2783 * Queued Invalidate not enabled, use Register Based Invalidate
2785 iommu->flush.flush_context = __iommu_flush_context;
2786 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2787 pr_info("%s: Using Register based invalidation\n",
2790 iommu->flush.flush_context = qi_flush_context;
2791 iommu->flush.flush_iotlb = qi_flush_iotlb;
2792 pr_info("%s: Using Queued invalidation\n", iommu->name);
2796 static int copy_context_table(struct intel_iommu *iommu,
2797 struct root_entry *old_re,
2798 struct context_entry **tbl,
2801 struct context_entry *old_ce = NULL, *new_ce = NULL, ce;
2802 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2803 phys_addr_t old_ce_phys;
2805 tbl_idx = ext ? bus * 2 : bus;
2807 for (devfn = 0; devfn < 256; devfn++) {
2808 /* First calculate the correct index */
2809 idx = (ext ? devfn * 2 : devfn) % 256;
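/*
 * Worked example (editor's illustration, not from the original source):
 * with the extended format each context entry is 256 bits, so a 4KiB page
 * holds only 128 of a bus's 256 entries. tbl_idx is therefore bus * 2,
 * and devfn * 2 wraps at 256: devfn 0x00-0x7f fills the first page and
 * devfn 0x80-0xff spills into the second.
 */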
2812 /* First save what we may have and clean up */
2814 tbl[tbl_idx] = new_ce;
2815 __iommu_flush_cache(iommu, new_ce,
2825 old_ce_phys = root_entry_lctp(old_re);
2827 old_ce_phys = root_entry_uctp(old_re);
2830 if (ext && devfn == 0) {
2831 /* No LCTP, try UCTP */
2840 old_ce = ioremap_cache(old_ce_phys, PAGE_SIZE);
2844 new_ce = alloc_pgtable_page(iommu->node);
2851 /* Now copy the context entry */
2854 if (!__context_present(&ce))
2857 did = context_domain_id(&ce);
2858 if (did >= 0 && did < cap_ndoms(iommu->cap))
2859 set_bit(did, iommu->domain_ids);
2862 * We need a marker for copied context entries. This
2863 * marker needs to work for the old format as well as
2864 * for extended context entries.
2866 * Bit 67 of the context entry is used. In the old
2867 * format this bit is available to software, in the
2868 * extended format it is the PGE bit, but PGE is ignored
2869 * by HW if PASIDs are disabled (and thus still
2872 * So disable PASIDs first and then mark the entry
2873 * copied. This means that we don't copy PASID
2874 * translations from the old kernel, but this is fine as
2875 * faults there are not fatal.
2877 context_clear_pasid_enable(&ce);
2878 context_set_copied(&ce);
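/*
 * Illustrative sketch (editor's assumption about the entry layout, not
 * the driver's helpers): a context entry is two u64 words {lo, hi}, so
 * bit 67 of the entry is bit 3 of 'hi'. Marking and testing the "copied"
 * flag could then look roughly like:
 *
 *	ce.hi |= 1ULL << 3;                     // mark entry as copied
 *	bool copied = ce.hi & (1ULL << 3);      // test the marker later
 */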
2883 tbl[tbl_idx + pos] = new_ce;
2885 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2894 static int copy_translation_tables(struct intel_iommu *iommu)
2896 struct context_entry **ctxt_tbls;
2897 struct root_entry *old_rt;
2898 phys_addr_t old_rt_phys;
2899 int ctxt_table_entries;
2900 unsigned long flags;
2905 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2906 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
2907 new_ext = !!ecap_ecs(iommu->ecap);
2910 * The RTT bit can only be changed when translation is disabled,
2911 * but disabling translation means opening a window for data
2912 * corruption. So bail out and don't copy anything if we would
2913 * have to change the bit.
2918 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2922 old_rt = ioremap_cache(old_rt_phys, PAGE_SIZE);
2926 /* This is too big for the stack - allocate it from slab */
2927 ctxt_table_entries = ext ? 512 : 256;
2929 ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
2933 for (bus = 0; bus < 256; bus++) {
2934 ret = copy_context_table(iommu, &old_rt[bus],
2935 ctxt_tbls, bus, ext);
2937 pr_err("%s: Failed to copy context table for bus %d\n",
2943 spin_lock_irqsave(&iommu->lock, flags);
2945 /* Context tables are copied, now write them to the root_entry table */
2946 for (bus = 0; bus < 256; bus++) {
2947 int idx = ext ? bus * 2 : bus;
2950 if (ctxt_tbls[idx]) {
2951 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2952 iommu->root_entry[bus].lo = val;
2955 if (!ext || !ctxt_tbls[idx + 1])
2958 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2959 iommu->root_entry[bus].hi = val;
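/*
 * Note (editor's illustration): with the extended root-table format the
 * low half of a root entry points at the context table for devfn 0-127
 * and the high half at the table for devfn 128-255; the "| 1" above sets
 * the present bit in each half.
 */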
2962 spin_unlock_irqrestore(&iommu->lock, flags);
2966 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2976 static int __init init_dmars(void)
2978 struct dmar_drhd_unit *drhd;
2979 struct dmar_rmrr_unit *rmrr;
2980 bool copied_tables = false;
2982 struct intel_iommu *iommu;
2988 * initialize and program root entry to not present
2991 for_each_drhd_unit(drhd) {
2993 * lock not needed as this is only incremented in the single
2994 * threaded kernel __init code path; all other accesses are read
2997 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3001 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3004 /* Preallocate enough resources for IOMMU hot-addition */
3005 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3006 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3008 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3011 pr_err("Allocating global iommu array failed\n");
3016 deferred_flush = kzalloc(g_num_of_iommus *
3017 sizeof(struct deferred_flush_tables), GFP_KERNEL);
3018 if (!deferred_flush) {
3023 for_each_active_iommu(iommu, drhd) {
3024 g_iommus[iommu->seq_id] = iommu;
3026 intel_iommu_init_qi(iommu);
3028 ret = iommu_init_domains(iommu);
3032 init_translation_status(iommu);
3034 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3035 iommu_disable_translation(iommu);
3036 clear_translation_pre_enabled(iommu);
3037 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3043 * we could share the same root & context tables
3044 * among all IOMMUs. Need to split them later.
3046 ret = iommu_alloc_root_entry(iommu);
3050 if (translation_pre_enabled(iommu)) {
3051 pr_info("Translation already enabled - trying to copy translation structures\n");
3053 ret = copy_translation_tables(iommu);
3056 * We found the IOMMU with translation
3057 * enabled - but failed to copy over the
3058 * old root-entry table. Try to proceed
3059 * by disabling translation now and
3060 * allocating a clean root-entry table.
3061 * This might cause DMAR faults, but
3062 * probably the dump will still succeed.
3064 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3066 iommu_disable_translation(iommu);
3067 clear_translation_pre_enabled(iommu);
3069 pr_info("Copied translation tables from previous kernel for %s\n",
3071 copied_tables = true;
3075 iommu_flush_write_buffer(iommu);
3076 iommu_set_root_entry(iommu);
3077 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3078 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
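/*
 * Design note (editor's summary of the sequence above): after the root
 * entry pointer is (re)programmed, the context-cache is invalidated
 * globally and then the IOTLB is flushed globally, so the hardware cannot
 * keep using stale cached translations from a previous root table. The
 * same sequence is repeated on resume and on IOMMU hot-add below.
 */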
3080 if (!ecap_pass_through(iommu->ecap))
3081 hw_pass_through = 0;
3084 if (iommu_pass_through)
3085 iommu_identity_mapping |= IDENTMAP_ALL;
3087 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3088 iommu_identity_mapping |= IDENTMAP_GFX;
3091 if (iommu_identity_mapping) {
3092 ret = si_domain_init(hw_pass_through);
3097 check_tylersburg_isoch();
3100 * If we copied translations from a previous kernel in the kdump
3101 * case, we cannot assign the devices to domains now, as that
3102 * would eliminate the old mappings. So skip this part and defer
3103 * the assignment to device driver initialization time.
3109 * If pass-through is not set or not enabled, set up context entries for
3110 * identity mappings for rmrr, gfx and isa, and possibly fall back to static
3111 * identity mapping if iommu_identity_mapping is set.
3113 if (iommu_identity_mapping) {
3114 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3116 pr_crit("Failed to setup IOMMU pass-through\n");
3122 * for each dev attached to rmrr
3124 * locate drhd for dev, alloc domain for dev
3125 * allocate free domain
3126 * allocate page table entries for rmrr
3127 * if context not allocated for bus
3128 * allocate and init context
3129 * set present in root table for this bus
3130 * init context with domain, translation etc
3134 pr_info("Setting RMRR:\n");
3135 for_each_rmrr_units(rmrr) {
3136 /* some BIOSes list non-existent devices in the DMAR table. */
3137 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3139 ret = iommu_prepare_rmrr_dev(rmrr, dev);
3141 pr_err("Mapping reserved region failed\n");
3145 iommu_prepare_isa();
3152 * global invalidate context cache
3153 * global invalidate iotlb
3154 * enable translation
3156 for_each_iommu(iommu, drhd) {
3157 if (drhd->ignored) {
3159 * we always have to disable PMRs or DMA may fail on
3163 iommu_disable_protect_mem_regions(iommu);
3167 iommu_flush_write_buffer(iommu);
3169 ret = dmar_set_interrupt(iommu);
3173 if (!translation_pre_enabled(iommu))
3174 iommu_enable_translation(iommu);
3176 iommu_disable_protect_mem_regions(iommu);
3182 for_each_active_iommu(iommu, drhd) {
3183 disable_dmar_iommu(iommu);
3184 free_dmar_iommu(iommu);
3186 kfree(deferred_flush);
3193 /* This takes a number of _MM_ pages, not VTD pages */
3194 static struct iova *intel_alloc_iova(struct device *dev,
3195 struct dmar_domain *domain,
3196 unsigned long nrpages, uint64_t dma_mask)
3198 struct iova *iova = NULL;
3200 /* Restrict dma_mask to the width that the iommu can handle */
3201 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3203 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3205 * First try to allocate an io virtual address in
3206 * DMA_BIT_MASK(32) and if that fails then try allocating
3209 iova = alloc_iova(&domain->iovad, nrpages,
3210 IOVA_PFN(DMA_BIT_MASK(32)), 1);
3214 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
3215 if (unlikely(!iova)) {
3216 pr_err("Allocating %ld-page iova for %s failed",
3217 nrpages, dev_name(dev));
3224 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
3226 struct dmar_domain *domain;
3228 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3230 pr_err("Allocating domain for %s failed\n",
3238 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3240 struct device_domain_info *info;
3242 /* No lock here, assumes no domain exit in normal case */
3243 info = dev->archdata.iommu;
3245 return info->domain;
3247 return __get_valid_domain_for_dev(dev);
3250 /* Check if the dev needs to go through non-identity map and unmap process. */
3251 static int iommu_no_mapping(struct device *dev)
3255 if (iommu_dummy(dev))
3258 if (!iommu_identity_mapping)
3261 found = identity_mapping(dev);
3263 if (iommu_should_identity_map(dev, 0))
3267 * 32-bit DMA is removed from si_domain and falls back
3268 * to non-identity mapping.
3270 dmar_remove_one_dev_info(si_domain, dev);
3271 pr_info("32bit %s uses non-identity mapping\n",
3277 * In case a 64-bit DMA device is detached from a VM, the device
3278 * is put into si_domain for identity mapping.
3280 if (iommu_should_identity_map(dev, 0)) {
3282 ret = domain_add_dev_info(si_domain, dev);
3284 pr_info("64bit %s uses identity mapping\n",
3294 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3295 size_t size, int dir, u64 dma_mask)
3297 struct dmar_domain *domain;
3298 phys_addr_t start_paddr;
3302 struct intel_iommu *iommu;
3303 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3305 BUG_ON(dir == DMA_NONE);
3307 if (iommu_no_mapping(dev))
3310 domain = get_valid_domain_for_dev(dev);
3314 iommu = domain_get_iommu(domain);
3315 size = aligned_nrpages(paddr, size);
3317 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3322 * Check if DMAR supports zero-length reads on write only
3325 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3326 !cap_zlr(iommu->cap))
3327 prot |= DMA_PTE_READ;
3328 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3329 prot |= DMA_PTE_WRITE;
3331 * The range paddr .. (paddr + size) might cover a partial page, so we
3332 * should map the whole page. Note: if two parts of one page are mapped
3333 * separately, we might have two guest addresses mapping to the same host
3334 * paddr, but this is not a big problem
3336 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3337 mm_to_dma_pfn(paddr_pfn), size, prot);
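/*
 * Worked example (editor's illustration): for paddr = 0x1ffc and
 * size = 8, the buffer straddles two 4KiB pages, so aligned_nrpages()
 * above yields 2 and both page frames are mapped even though only a few
 * bytes of each are actually used.
 */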
3341 /* it's a non-present to present mapping. Only flush if caching mode */
3342 if (cap_caching_mode(iommu->cap))
3343 iommu_flush_iotlb_psi(iommu, domain,
3344 mm_to_dma_pfn(iova->pfn_lo),
3347 iommu_flush_write_buffer(iommu);
3349 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3350 start_paddr += paddr & ~PAGE_MASK;
3355 __free_iova(&domain->iovad, iova);
3356 pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3357 dev_name(dev), size, (unsigned long long)paddr, dir);
3361 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3362 unsigned long offset, size_t size,
3363 enum dma_data_direction dir,
3364 struct dma_attrs *attrs)
3366 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3367 dir, *dev->dma_mask);
3370 static void flush_unmaps(void)
3376 /* just flush them all */
3377 for (i = 0; i < g_num_of_iommus; i++) {
3378 struct intel_iommu *iommu = g_iommus[i];
3382 if (!deferred_flush[i].next)
3385 /* In caching mode, global flushes make emulation expensive */
3386 if (!cap_caching_mode(iommu->cap))
3387 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3388 DMA_TLB_GLOBAL_FLUSH);
3389 for (j = 0; j < deferred_flush[i].next; j++) {
3391 struct iova *iova = deferred_flush[i].iova[j];
3392 struct dmar_domain *domain = deferred_flush[i].domain[j];
3394 /* On real hardware multiple invalidations are expensive */
3395 if (cap_caching_mode(iommu->cap))
3396 iommu_flush_iotlb_psi(iommu, domain,
3397 iova->pfn_lo, iova_size(iova),
3398 !deferred_flush[i].freelist[j], 0);
3400 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3401 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3402 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3404 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3405 if (deferred_flush[i].freelist[j])
3406 dma_free_pagelist(deferred_flush[i].freelist[j]);
3408 deferred_flush[i].next = 0;
3414 static void flush_unmaps_timeout(unsigned long data)
3416 unsigned long flags;
3418 spin_lock_irqsave(&async_umap_flush_lock, flags);
3420 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3423 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3425 unsigned long flags;
3427 struct intel_iommu *iommu;
3429 spin_lock_irqsave(&async_umap_flush_lock, flags);
3430 if (list_size == HIGH_WATER_MARK)
3433 iommu = domain_get_iommu(dom);
3434 iommu_id = iommu->seq_id;
3436 next = deferred_flush[iommu_id].next;
3437 deferred_flush[iommu_id].domain[next] = dom;
3438 deferred_flush[iommu_id].iova[next] = iova;
3439 deferred_flush[iommu_id].freelist[next] = freelist;
3440 deferred_flush[iommu_id].next++;
3443 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3447 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
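/*
 * Design note (editor's summary): unmaps are batched per IOMMU in the
 * deferred_flush tables above; flush_unmaps() runs either from the
 * 10ms unmap_timer armed here or immediately once HIGH_WATER_MARK
 * pending entries accumulate, trading a small window of stale IOTLB
 * entries for far fewer invalidations.
 */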
3450 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3452 struct dmar_domain *domain;
3453 unsigned long start_pfn, last_pfn;
3455 struct intel_iommu *iommu;
3456 struct page *freelist;
3458 if (iommu_no_mapping(dev))
3461 domain = find_domain(dev);
3464 iommu = domain_get_iommu(domain);
3466 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3467 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3468 (unsigned long long)dev_addr))
3471 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3472 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3474 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3475 dev_name(dev), start_pfn, last_pfn);
3477 freelist = domain_unmap(domain, start_pfn, last_pfn);
3479 if (intel_iommu_strict) {
3480 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3481 last_pfn - start_pfn + 1, !freelist, 0);
3483 __free_iova(&domain->iovad, iova);
3484 dma_free_pagelist(freelist);
3486 add_unmap(domain, iova, freelist);
3488 * queue up the release of the unmap to save the 1/6th of the
3489 * cpu used up by the iotlb flush operation...
3494 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3495 size_t size, enum dma_data_direction dir,
3496 struct dma_attrs *attrs)
3498 intel_unmap(dev, dev_addr);
3501 static void *intel_alloc_coherent(struct device *dev, size_t size,
3502 dma_addr_t *dma_handle, gfp_t flags,
3503 struct dma_attrs *attrs)
3505 struct page *page = NULL;
3508 size = PAGE_ALIGN(size);
3509 order = get_order(size);
3511 if (!iommu_no_mapping(dev))
3512 flags &= ~(GFP_DMA | GFP_DMA32);
3513 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3514 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3520 if (flags & __GFP_WAIT) {
3521 unsigned int count = size >> PAGE_SHIFT;
3523 page = dma_alloc_from_contiguous(dev, count, order);
3524 if (page && iommu_no_mapping(dev) &&
3525 page_to_phys(page) + size > dev->coherent_dma_mask) {
3526 dma_release_from_contiguous(dev, page, count);
3532 page = alloc_pages(flags, order);
3535 memset(page_address(page), 0, size);
3537 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3539 dev->coherent_dma_mask);
3541 return page_address(page);
3542 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3543 __free_pages(page, order);
3548 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3549 dma_addr_t dma_handle, struct dma_attrs *attrs)
3552 struct page *page = virt_to_page(vaddr);
3554 size = PAGE_ALIGN(size);
3555 order = get_order(size);
3557 intel_unmap(dev, dma_handle);
3558 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3559 __free_pages(page, order);
3562 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3563 int nelems, enum dma_data_direction dir,
3564 struct dma_attrs *attrs)
3566 intel_unmap(dev, sglist[0].dma_address);
3569 static int intel_nontranslate_map_sg(struct device *hddev,
3570 struct scatterlist *sglist, int nelems, int dir)
3573 struct scatterlist *sg;
3575 for_each_sg(sglist, sg, nelems, i) {
3576 BUG_ON(!sg_page(sg));
3577 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3578 sg->dma_length = sg->length;
3583 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3584 enum dma_data_direction dir, struct dma_attrs *attrs)
3587 struct dmar_domain *domain;
3590 struct iova *iova = NULL;
3592 struct scatterlist *sg;
3593 unsigned long start_vpfn;
3594 struct intel_iommu *iommu;
3596 BUG_ON(dir == DMA_NONE);
3597 if (iommu_no_mapping(dev))
3598 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3600 domain = get_valid_domain_for_dev(dev);
3604 iommu = domain_get_iommu(domain);
3606 for_each_sg(sglist, sg, nelems, i)
3607 size += aligned_nrpages(sg->offset, sg->length);
3609 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3612 sglist->dma_length = 0;
3617 * Check if DMAR supports zero-length reads on write only
3620 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3621 !cap_zlr(iommu->cap))
3622 prot |= DMA_PTE_READ;
3623 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3624 prot |= DMA_PTE_WRITE;
3626 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3628 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3629 if (unlikely(ret)) {
3630 dma_pte_free_pagetable(domain, start_vpfn,
3631 start_vpfn + size - 1);
3632 __free_iova(&domain->iovad, iova);
3636 /* it's a non-present to present mapping. Only flush if caching mode */
3637 if (cap_caching_mode(iommu->cap))
3638 iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3640 iommu_flush_write_buffer(iommu);
3645 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3650 struct dma_map_ops intel_dma_ops = {
3651 .alloc = intel_alloc_coherent,
3652 .free = intel_free_coherent,
3653 .map_sg = intel_map_sg,
3654 .unmap_sg = intel_unmap_sg,
3655 .map_page = intel_map_page,
3656 .unmap_page = intel_unmap_page,
3657 .mapping_error = intel_mapping_error,
3660 static inline int iommu_domain_cache_init(void)
3664 iommu_domain_cache = kmem_cache_create("iommu_domain",
3665 sizeof(struct dmar_domain),
3670 if (!iommu_domain_cache) {
3671 pr_err("Couldn't create iommu_domain cache\n");
3678 static inline int iommu_devinfo_cache_init(void)
3682 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3683 sizeof(struct device_domain_info),
3687 if (!iommu_devinfo_cache) {
3688 pr_err("Couldn't create devinfo cache\n");
3695 static int __init iommu_init_mempool(void)
3698 ret = iommu_iova_cache_init();
3702 ret = iommu_domain_cache_init();
3706 ret = iommu_devinfo_cache_init();
3710 kmem_cache_destroy(iommu_domain_cache);
3712 iommu_iova_cache_destroy();
3717 static void __init iommu_exit_mempool(void)
3719 kmem_cache_destroy(iommu_devinfo_cache);
3720 kmem_cache_destroy(iommu_domain_cache);
3721 iommu_iova_cache_destroy();
3724 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3726 struct dmar_drhd_unit *drhd;
3730 /* We know that this device on this chipset has its own IOMMU.
3731 * If we find it under a different IOMMU, then the BIOS is lying
3732 * to us. Hope that the IOMMU for this device is actually
3733 * disabled, and it needs no translation...
3735 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3737 /* "can't" happen */
3738 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3741 vtbar &= 0xffff0000;
3743 /* we know that this iommu should be at offset 0xa000 from vtbar */
3744 drhd = dmar_find_matched_drhd_unit(pdev);
3745 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3746 TAINT_FIRMWARE_WORKAROUND,
3747 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3748 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3750 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3752 static void __init init_no_remapping_devices(void)
3754 struct dmar_drhd_unit *drhd;
3758 for_each_drhd_unit(drhd) {
3759 if (!drhd->include_all) {
3760 for_each_active_dev_scope(drhd->devices,
3761 drhd->devices_cnt, i, dev)
3763 /* ignore DMAR unit if no devices exist */
3764 if (i == drhd->devices_cnt)
3769 for_each_active_drhd_unit(drhd) {
3770 if (drhd->include_all)
3773 for_each_active_dev_scope(drhd->devices,
3774 drhd->devices_cnt, i, dev)
3775 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3777 if (i < drhd->devices_cnt)
3780 /* This IOMMU has *only* gfx devices. Either bypass it or
3781 set the gfx_mapped flag, as appropriate */
3783 intel_iommu_gfx_mapped = 1;
3786 for_each_active_dev_scope(drhd->devices,
3787 drhd->devices_cnt, i, dev)
3788 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3793 #ifdef CONFIG_SUSPEND
3794 static int init_iommu_hw(void)
3796 struct dmar_drhd_unit *drhd;
3797 struct intel_iommu *iommu = NULL;
3799 for_each_active_iommu(iommu, drhd)
3801 dmar_reenable_qi(iommu);
3803 for_each_iommu(iommu, drhd) {
3804 if (drhd->ignored) {
3806 * we always have to disable PMRs or DMA may fail on
3810 iommu_disable_protect_mem_regions(iommu);
3814 iommu_flush_write_buffer(iommu);
3816 iommu_set_root_entry(iommu);
3818 iommu->flush.flush_context(iommu, 0, 0, 0,
3819 DMA_CCMD_GLOBAL_INVL);
3820 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3821 iommu_enable_translation(iommu);
3822 iommu_disable_protect_mem_regions(iommu);
3828 static void iommu_flush_all(void)
3830 struct dmar_drhd_unit *drhd;
3831 struct intel_iommu *iommu;
3833 for_each_active_iommu(iommu, drhd) {
3834 iommu->flush.flush_context(iommu, 0, 0, 0,
3835 DMA_CCMD_GLOBAL_INVL);
3836 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3837 DMA_TLB_GLOBAL_FLUSH);
3841 static int iommu_suspend(void)
3843 struct dmar_drhd_unit *drhd;
3844 struct intel_iommu *iommu = NULL;
3847 for_each_active_iommu(iommu, drhd) {
3848 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3850 if (!iommu->iommu_state)
3856 for_each_active_iommu(iommu, drhd) {
3857 iommu_disable_translation(iommu);
3859 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3861 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3862 readl(iommu->reg + DMAR_FECTL_REG);
3863 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3864 readl(iommu->reg + DMAR_FEDATA_REG);
3865 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3866 readl(iommu->reg + DMAR_FEADDR_REG);
3867 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3868 readl(iommu->reg + DMAR_FEUADDR_REG);
3870 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3875 for_each_active_iommu(iommu, drhd)
3876 kfree(iommu->iommu_state);
3881 static void iommu_resume(void)
3883 struct dmar_drhd_unit *drhd;
3884 struct intel_iommu *iommu = NULL;
3887 if (init_iommu_hw()) {
3889 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3891 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3895 for_each_active_iommu(iommu, drhd) {
3897 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3899 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3900 iommu->reg + DMAR_FECTL_REG);
3901 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3902 iommu->reg + DMAR_FEDATA_REG);
3903 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3904 iommu->reg + DMAR_FEADDR_REG);
3905 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3906 iommu->reg + DMAR_FEUADDR_REG);
3908 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3911 for_each_active_iommu(iommu, drhd)
3912 kfree(iommu->iommu_state);
3915 static struct syscore_ops iommu_syscore_ops = {
3916 .resume = iommu_resume,
3917 .suspend = iommu_suspend,
3920 static void __init init_iommu_pm_ops(void)
3922 register_syscore_ops(&iommu_syscore_ops);
3926 static inline void init_iommu_pm_ops(void) {}
3927 #endif /* CONFIG_PM */
3930 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3932 struct acpi_dmar_reserved_memory *rmrr;
3933 struct dmar_rmrr_unit *rmrru;
3935 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3939 rmrru->hdr = header;
3940 rmrr = (struct acpi_dmar_reserved_memory *)header;
3941 rmrru->base_address = rmrr->base_address;
3942 rmrru->end_address = rmrr->end_address;
3943 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3944 ((void *)rmrr) + rmrr->header.length,
3945 &rmrru->devices_cnt);
3946 if (rmrru->devices_cnt && rmrru->devices == NULL) {
3951 list_add(&rmrru->list, &dmar_rmrr_units);
3956 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3958 struct dmar_atsr_unit *atsru;
3959 struct acpi_dmar_atsr *tmp;
3961 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3962 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3963 if (atsr->segment != tmp->segment)
3965 if (atsr->header.length != tmp->header.length)
3967 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3974 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3976 struct acpi_dmar_atsr *atsr;
3977 struct dmar_atsr_unit *atsru;
3979 if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
3982 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3983 atsru = dmar_find_atsr(atsr);
3987 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3992 * If memory is allocated from slab by ACPI _DSM method, we need to
3993 * copy the memory content because the memory buffer will be freed
3996 atsru->hdr = (void *)(atsru + 1);
3997 memcpy(atsru->hdr, hdr, hdr->length);
3998 atsru->include_all = atsr->flags & 0x1;
3999 if (!atsru->include_all) {
4000 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4001 (void *)atsr + atsr->header.length,
4002 &atsru->devices_cnt);
4003 if (atsru->devices_cnt && atsru->devices == NULL) {
4009 list_add_rcu(&atsru->list, &dmar_atsr_units);
4014 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4016 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4020 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4022 struct acpi_dmar_atsr *atsr;
4023 struct dmar_atsr_unit *atsru;
4025 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4026 atsru = dmar_find_atsr(atsr);
4028 list_del_rcu(&atsru->list);
4030 intel_iommu_free_atsr(atsru);
4036 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4040 struct acpi_dmar_atsr *atsr;
4041 struct dmar_atsr_unit *atsru;
4043 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4044 atsru = dmar_find_atsr(atsr);
4048 if (!atsru->include_all && atsru->devices && atsru->devices_cnt)
4049 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4056 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4059 struct intel_iommu *iommu = dmaru->iommu;
4061 if (g_iommus[iommu->seq_id])
4064 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4065 pr_warn("%s: Doesn't support hardware pass through.\n",
4069 if (!ecap_sc_support(iommu->ecap) &&
4070 domain_update_iommu_snooping(iommu)) {
4071 pr_warn("%s: Doesn't support snooping.\n",
4075 sp = domain_update_iommu_superpage(iommu) - 1;
4076 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4077 pr_warn("%s: Doesn't support large page.\n",
4083 * Disable translation if already enabled prior to OS handover.
4085 if (iommu->gcmd & DMA_GCMD_TE)
4086 iommu_disable_translation(iommu);
4088 g_iommus[iommu->seq_id] = iommu;
4089 ret = iommu_init_domains(iommu);
4091 ret = iommu_alloc_root_entry(iommu);
4095 if (dmaru->ignored) {
4097 * we always have to disable PMRs or DMA may fail on this device
4100 iommu_disable_protect_mem_regions(iommu);
4104 intel_iommu_init_qi(iommu);
4105 iommu_flush_write_buffer(iommu);
4106 ret = dmar_set_interrupt(iommu);
4110 iommu_set_root_entry(iommu);
4111 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4112 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4113 iommu_enable_translation(iommu);
4115 iommu_disable_protect_mem_regions(iommu);
4119 disable_dmar_iommu(iommu);
4121 free_dmar_iommu(iommu);
4125 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4128 struct intel_iommu *iommu = dmaru->iommu;
4130 if (!intel_iommu_enabled)
4136 ret = intel_iommu_add(dmaru);
4138 disable_dmar_iommu(iommu);
4139 free_dmar_iommu(iommu);
4145 static void intel_iommu_free_dmars(void)
4147 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4148 struct dmar_atsr_unit *atsru, *atsr_n;
4150 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4151 list_del(&rmrru->list);
4152 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4156 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4157 list_del(&atsru->list);
4158 intel_iommu_free_atsr(atsru);
4162 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4165 struct pci_bus *bus;
4166 struct pci_dev *bridge = NULL;
4168 struct acpi_dmar_atsr *atsr;
4169 struct dmar_atsr_unit *atsru;
4171 dev = pci_physfn(dev);
4172 for (bus = dev->bus; bus; bus = bus->parent) {
4174 if (!bridge || !pci_is_pcie(bridge) ||
4175 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4177 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4184 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4185 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4186 if (atsr->segment != pci_domain_nr(dev->bus))
4189 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4190 if (tmp == &bridge->dev)
4193 if (atsru->include_all)
4203 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4206 struct dmar_rmrr_unit *rmrru;
4207 struct dmar_atsr_unit *atsru;
4208 struct acpi_dmar_atsr *atsr;
4209 struct acpi_dmar_reserved_memory *rmrr;
4211 if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
4214 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4215 rmrr = container_of(rmrru->hdr,
4216 struct acpi_dmar_reserved_memory, header);
4217 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4218 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4219 ((void *)rmrr) + rmrr->header.length,
4220 rmrr->segment, rmrru->devices,
4221 rmrru->devices_cnt);
4224 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4225 dmar_remove_dev_scope(info, rmrr->segment,
4226 rmrru->devices, rmrru->devices_cnt);
4230 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4231 if (atsru->include_all)
4234 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4235 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4236 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4237 (void *)atsr + atsr->header.length,
4238 atsr->segment, atsru->devices,
4239 atsru->devices_cnt);
4244 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4245 if (dmar_remove_dev_scope(info, atsr->segment,
4246 atsru->devices, atsru->devices_cnt))
4255 * Here we only respond to the action of a device being unbound from its driver.
4257 * A newly added device is not attached to its DMAR domain here yet. That will
4258 * happen when the device is mapped to an iova.
4260 static int device_notifier(struct notifier_block *nb,
4261 unsigned long action, void *data)
4263 struct device *dev = data;
4264 struct dmar_domain *domain;
4266 if (iommu_dummy(dev))
4269 if (action != BUS_NOTIFY_REMOVED_DEVICE)
4272 domain = find_domain(dev);
4276 dmar_remove_one_dev_info(domain, dev);
4277 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4278 domain_exit(domain);
4283 static struct notifier_block device_nb = {
4284 .notifier_call = device_notifier,
4287 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4288 unsigned long val, void *v)
4290 struct memory_notify *mhp = v;
4291 unsigned long long start, end;
4292 unsigned long start_vpfn, last_vpfn;
4295 case MEM_GOING_ONLINE:
4296 start = mhp->start_pfn << PAGE_SHIFT;
4297 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4298 if (iommu_domain_identity_map(si_domain, start, end)) {
4299 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4306 case MEM_CANCEL_ONLINE:
4307 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4308 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4309 while (start_vpfn <= last_vpfn) {
4311 struct dmar_drhd_unit *drhd;
4312 struct intel_iommu *iommu;
4313 struct page *freelist;
4315 iova = find_iova(&si_domain->iovad, start_vpfn);
4317 pr_debug("Failed get IOVA for PFN %lx\n",
4322 iova = split_and_remove_iova(&si_domain->iovad, iova,
4323 start_vpfn, last_vpfn);
4325 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4326 start_vpfn, last_vpfn);
4330 freelist = domain_unmap(si_domain, iova->pfn_lo,
4334 for_each_active_iommu(iommu, drhd)
4335 iommu_flush_iotlb_psi(iommu, si_domain,
4336 iova->pfn_lo, iova_size(iova),
4339 dma_free_pagelist(freelist);
4341 start_vpfn = iova->pfn_hi + 1;
4342 free_iova_mem(iova);
4350 static struct notifier_block intel_iommu_memory_nb = {
4351 .notifier_call = intel_iommu_memory_notifier,
4356 static ssize_t intel_iommu_show_version(struct device *dev,
4357 struct device_attribute *attr,
4360 struct intel_iommu *iommu = dev_get_drvdata(dev);
4361 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4362 return sprintf(buf, "%d:%d\n",
4363 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4365 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4367 static ssize_t intel_iommu_show_address(struct device *dev,
4368 struct device_attribute *attr,
4371 struct intel_iommu *iommu = dev_get_drvdata(dev);
4372 return sprintf(buf, "%llx\n", iommu->reg_phys);
4374 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4376 static ssize_t intel_iommu_show_cap(struct device *dev,
4377 struct device_attribute *attr,
4380 struct intel_iommu *iommu = dev_get_drvdata(dev);
4381 return sprintf(buf, "%llx\n", iommu->cap);
4383 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4385 static ssize_t intel_iommu_show_ecap(struct device *dev,
4386 struct device_attribute *attr,
4389 struct intel_iommu *iommu = dev_get_drvdata(dev);
4390 return sprintf(buf, "%llx\n", iommu->ecap);
4392 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4394 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4395 struct device_attribute *attr,
4398 struct intel_iommu *iommu = dev_get_drvdata(dev);
4399 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4401 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4403 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4404 struct device_attribute *attr,
4407 struct intel_iommu *iommu = dev_get_drvdata(dev);
4408 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4409 cap_ndoms(iommu->cap)));
4411 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4413 static struct attribute *intel_iommu_attrs[] = {
4414 &dev_attr_version.attr,
4415 &dev_attr_address.attr,
4417 &dev_attr_ecap.attr,
4418 &dev_attr_domains_supported.attr,
4419 &dev_attr_domains_used.attr,
4423 static struct attribute_group intel_iommu_group = {
4424 .name = "intel-iommu",
4425 .attrs = intel_iommu_attrs,
4428 const struct attribute_group *intel_iommu_groups[] = {
4433 int __init intel_iommu_init(void)
4436 struct dmar_drhd_unit *drhd;
4437 struct intel_iommu *iommu;
4439 /* VT-d is required for a TXT/tboot launch, so enforce that */
4440 force_on = tboot_force_iommu();
4442 if (iommu_init_mempool()) {
4444 panic("tboot: Failed to initialize iommu memory\n");
4448 down_write(&dmar_global_lock);
4449 if (dmar_table_init()) {
4451 panic("tboot: Failed to initialize DMAR table\n");
4455 if (dmar_dev_scope_init() < 0) {
4457 panic("tboot: Failed to initialize DMAR device scope\n");
4461 if (no_iommu || dmar_disabled)
4464 if (list_empty(&dmar_rmrr_units))
4465 pr_info("No RMRR found\n");
4467 if (list_empty(&dmar_atsr_units))
4468 pr_info("No ATSR found\n");
4470 if (dmar_init_reserved_ranges()) {
4472 panic("tboot: Failed to reserve iommu ranges\n");
4473 goto out_free_reserved_range;
4476 init_no_remapping_devices();
4481 panic("tboot: Failed to initialize DMARs\n");
4482 pr_err("Initialization failed\n");
4483 goto out_free_reserved_range;
4485 up_write(&dmar_global_lock);
4486 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4488 init_timer(&unmap_timer);
4489 #ifdef CONFIG_SWIOTLB
4492 dma_ops = &intel_dma_ops;
4494 init_iommu_pm_ops();
4496 for_each_active_iommu(iommu, drhd)
4497 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4501 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4502 bus_register_notifier(&pci_bus_type, &device_nb);
4503 if (si_domain && !hw_pass_through)
4504 register_memory_notifier(&intel_iommu_memory_nb);
4506 intel_iommu_enabled = 1;
4510 out_free_reserved_range:
4511 put_iova_domain(&reserved_iova_list);
4513 intel_iommu_free_dmars();
4514 up_write(&dmar_global_lock);
4515 iommu_exit_mempool();
4519 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4521 struct intel_iommu *iommu = opaque;
4523 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4528 * NB - intel-iommu lacks any sort of reference counting for the users of
4529 * dependent devices. If multiple endpoints have intersecting dependent
4530 * devices, unbinding the driver from any one of them will possibly leave
4531 * the others unable to operate.
4533 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4535 if (!iommu || !dev || !dev_is_pci(dev))
4538 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4541 static void __dmar_remove_one_dev_info(struct dmar_domain *domain,
4544 struct device_domain_info *info;
4545 struct intel_iommu *iommu;
4546 unsigned long flags;
4549 assert_spin_locked(&device_domain_lock);
4551 iommu = device_to_iommu(dev, &bus, &devfn);
4555 info = dev->archdata.iommu;
4560 unlink_domain_info(info);
4562 iommu_disable_dev_iotlb(info);
4563 domain_context_clear(iommu, dev);
4564 free_devinfo_mem(info);
4566 spin_lock_irqsave(&iommu->lock, flags);
4567 domain_detach_iommu(domain, iommu);
4568 spin_unlock_irqrestore(&iommu->lock, flags);
4571 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4574 unsigned long flags;
4576 spin_lock_irqsave(&device_domain_lock, flags);
4577 __dmar_remove_one_dev_info(domain, dev);
4578 spin_unlock_irqrestore(&device_domain_lock, flags);
4581 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4585 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4587 domain_reserve_special_ranges(domain);
4589 /* calculate AGAW */
4590 domain->gaw = guest_width;
4591 adjust_width = guestwidth_to_adjustwidth(guest_width);
4592 domain->agaw = width_to_agaw(adjust_width);
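/*
 * Worked example (editor's illustration, assuming the usual VT-d AGAW
 * encoding where width = 30 + 9 * agaw): the default 48-bit guest width
 * needs no adjustment, giving agaw = 2, i.e. a 4-level page table.
 */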
4594 domain->iommu_coherency = 0;
4595 domain->iommu_snooping = 0;
4596 domain->iommu_superpage = 0;
4597 domain->max_addr = 0;
4599 /* always allocate the top pgd */
4600 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4603 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4607 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4609 struct dmar_domain *dmar_domain;
4610 struct iommu_domain *domain;
4612 if (type != IOMMU_DOMAIN_UNMANAGED)
4615 dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4617 pr_err("Can't allocate dmar_domain\n");
4620 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4621 pr_err("Domain initialization failed\n");
4622 domain_exit(dmar_domain);
4625 domain_update_iommu_cap(dmar_domain);
4627 domain = &dmar_domain->domain;
4628 domain->geometry.aperture_start = 0;
4629 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4630 domain->geometry.force_aperture = true;
4635 static void intel_iommu_domain_free(struct iommu_domain *domain)
4637 domain_exit(to_dmar_domain(domain));
4640 static int intel_iommu_attach_device(struct iommu_domain *domain,
4643 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4644 struct intel_iommu *iommu;
4648 if (device_is_rmrr_locked(dev)) {
4649 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4653 /* normally dev is not mapped */
4654 if (unlikely(domain_context_mapped(dev))) {
4655 struct dmar_domain *old_domain;
4657 old_domain = find_domain(dev);
4660 dmar_remove_one_dev_info(old_domain, dev);
4663 if (!domain_type_is_vm_or_si(old_domain) &&
4664 list_empty(&old_domain->devices))
4665 domain_exit(old_domain);
4669 iommu = device_to_iommu(dev, &bus, &devfn);
4673 /* check if this iommu agaw is sufficient for max mapped address */
4674 addr_width = agaw_to_width(iommu->agaw);
4675 if (addr_width > cap_mgaw(iommu->cap))
4676 addr_width = cap_mgaw(iommu->cap);
4678 if (dmar_domain->max_addr > (1LL << addr_width)) {
4679 pr_err("%s: iommu width (%d) is not "
4680 "sufficient for the mapped address (%llx)\n",
4681 __func__, addr_width, dmar_domain->max_addr);
4684 dmar_domain->gaw = addr_width;
4687 * Knock out extra levels of page tables if necessary
4689 while (iommu->agaw < dmar_domain->agaw) {
4690 struct dma_pte *pte;
4692 pte = dmar_domain->pgd;
4693 if (dma_pte_present(pte)) {
4694 dmar_domain->pgd = (struct dma_pte *)
4695 phys_to_virt(dma_pte_addr(pte));
4696 free_pgtable_page(pte);
4698 dmar_domain->agaw--;
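/*
 * Example (editor's illustration): if the domain was built with a
 * 48-bit, 4-level table but this IOMMU only supports a smaller AGAW,
 * the loop above promotes the single present entry of the old top
 * level to be the new pgd, shrinking the table one level at a time.
 */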
4701 return domain_add_dev_info(dmar_domain, dev);
4704 static void intel_iommu_detach_device(struct iommu_domain *domain,
4707 dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
4710 static int intel_iommu_map(struct iommu_domain *domain,
4711 unsigned long iova, phys_addr_t hpa,
4712 size_t size, int iommu_prot)
4714 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4719 if (iommu_prot & IOMMU_READ)
4720 prot |= DMA_PTE_READ;
4721 if (iommu_prot & IOMMU_WRITE)
4722 prot |= DMA_PTE_WRITE;
4723 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4724 prot |= DMA_PTE_SNP;
4726 max_addr = iova + size;
4727 if (dmar_domain->max_addr < max_addr) {
4730 /* check if minimum agaw is sufficient for mapped address */
4731 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4732 if (end < max_addr) {
4733 pr_err("%s: iommu width (%d) is not "
4734 "sufficient for the mapped address (%llx)\n",
4735 __func__, dmar_domain->gaw, max_addr);
4738 dmar_domain->max_addr = max_addr;
4740 /* Round up size to next multiple of PAGE_SIZE, if it and
4741 the low bits of hpa would take us onto the next page */
4742 size = aligned_nrpages(hpa, size);
4743 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4744 hpa >> VTD_PAGE_SHIFT, size, prot);
4748 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4749 unsigned long iova, size_t size)
4751 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4752 struct page *freelist = NULL;
4753 struct intel_iommu *iommu;
4754 unsigned long start_pfn, last_pfn;
4755 unsigned int npages;
4756 int iommu_id, level = 0;
4758 /* Cope with horrid API which requires us to unmap more than the
4759 size argument if it happens to be a large-page mapping. */
4760 if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4763 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4764 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
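/*
 * Worked example (editor's illustration): if the IOVA is covered by a
 * 2MiB superpage PTE, 'level' comes back as 2 and the requested size is
 * rounded up to 2MiB here, so the whole superpage is unmapped even when
 * the caller asked for a single 4KiB page.
 */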
4766 start_pfn = iova >> VTD_PAGE_SHIFT;
4767 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4769 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4771 npages = last_pfn - start_pfn + 1;
4773 for_each_domain_iommu(iommu_id, dmar_domain) {
4774 iommu = g_iommus[iommu_id];
4776 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
4777 start_pfn, npages, !freelist, 0);
4780 dma_free_pagelist(freelist);
4782 if (dmar_domain->max_addr == iova + size)
4783 dmar_domain->max_addr = iova;
4788 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4791 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4792 struct dma_pte *pte;
4796 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4798 phys = dma_pte_addr(pte);
4803 static bool intel_iommu_capable(enum iommu_cap cap)
4805 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4806 return domain_update_iommu_snooping(NULL) == 1;
4807 if (cap == IOMMU_CAP_INTR_REMAP)
4808 return irq_remapping_enabled == 1;
4813 static int intel_iommu_add_device(struct device *dev)
4815 struct intel_iommu *iommu;
4816 struct iommu_group *group;
4819 iommu = device_to_iommu(dev, &bus, &devfn);
4823 iommu_device_link(iommu->iommu_dev, dev);
4825 group = iommu_group_get_for_dev(dev);
4828 return PTR_ERR(group);
4830 iommu_group_put(group);
4834 static void intel_iommu_remove_device(struct device *dev)
4836 struct intel_iommu *iommu;
4839 iommu = device_to_iommu(dev, &bus, &devfn);
4843 iommu_group_remove_device(dev);
4845 iommu_device_unlink(iommu->iommu_dev, dev);
4848 static const struct iommu_ops intel_iommu_ops = {
4849 .capable = intel_iommu_capable,
4850 .domain_alloc = intel_iommu_domain_alloc,
4851 .domain_free = intel_iommu_domain_free,
4852 .attach_dev = intel_iommu_attach_device,
4853 .detach_dev = intel_iommu_detach_device,
4854 .map = intel_iommu_map,
4855 .unmap = intel_iommu_unmap,
4856 .map_sg = default_iommu_map_sg,
4857 .iova_to_phys = intel_iommu_iova_to_phys,
4858 .add_device = intel_iommu_add_device,
4859 .remove_device = intel_iommu_remove_device,
4860 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
4863 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4865 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4866 pr_info("Disabling IOMMU for graphics on this chipset\n");
4870 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4871 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4872 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4873 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4874 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4875 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4876 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4878 static void quirk_iommu_rwbf(struct pci_dev *dev)
4881 * Mobile 4 Series Chipset neglects to set RWBF capability,
4882 * but needs it. Same seems to hold for the desktop versions.
4884 pr_info("Forcing write-buffer flush capability\n");
4888 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4889 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4890 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4891 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4892 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4893 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4894 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4897 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4898 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4899 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4900 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4901 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4902 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4903 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4904 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4906 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4910 if (pci_read_config_word(dev, GGC, &ggc))
4913 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4914 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4916 } else if (dmar_map_gfx) {
4917 /* we have to ensure the gfx device is idle before we flush */
4918 pr_info("Disabling batched IOTLB flush on Ironlake\n");
4919 intel_iommu_strict = 1;
4922 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4923 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4924 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4925 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4927 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4928 ISOCH DMAR unit for the Azalia sound device, but not give it any
4929 TLB entries, which causes it to deadlock. Check for that. We do
4930 this in a function called from init_dmars(), instead of in a PCI
4931 quirk, because we don't want to print the obnoxious "BIOS broken"
4932 message if VT-d is actually disabled.
4934 static void __init check_tylersburg_isoch(void)
4936 struct pci_dev *pdev;
4937 uint32_t vtisochctrl;
4939 /* If there's no Azalia in the system anyway, forget it. */
4940 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4945 /* System Management Registers. Might be hidden, in which case
4946 we can't do the sanity check. But that's OK, because the
4947 known-broken BIOSes _don't_ actually hide it, so far. */
4948 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4952 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4959 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4960 if (vtisochctrl & 1)
4963 /* Drop all bits other than the number of TLB entries */
4964 vtisochctrl &= 0x1c;
4966 /* If we have the recommended number of TLB entries (16), fine. */
4967 if (vtisochctrl == 0x10)
4970 /* Zero TLB entries? You get to ride the short bus to school. */
4972 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4973 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4974 dmi_get_system_info(DMI_BIOS_VENDOR),
4975 dmi_get_system_info(DMI_BIOS_VERSION),
4976 dmi_get_system_info(DMI_PRODUCT_VERSION));
4977 iommu_identity_mapping |= IDENTMAP_AZALIA;
4981 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",