/*
 * Copyright (c) 2006, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 * Copyright (C) 2006-2008 Intel Corporation
 * Author: Ashok Raj <ashok.raj@intel.com>
 * Author: Shaohua Li <shaohua.li@intel.com>
 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
 * Author: Fenghua Yu <fenghua.yu@intel.com>
 */

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/intel-iommu.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#define ROOT_SIZE	VTD_PAGE_SIZE
#define CONTEXT_SIZE	VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)

#define IOVA_PFN(addr)	((addr) >> PAGE_SHIFT)
#define DMA_32BIT_PFN	IOVA_PFN(DMA_32BIT_MASK)
#define DMA_64BIT_PFN	IOVA_PFN(DMA_64BIT_MASK)
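
/*
 * Worked example (illustrative, not from the original source): with
 * 4KiB pages (PAGE_SHIFT == 12), IOVA_PFN(DMA_32BIT_MASK) is
 * 0xffffffff >> 12 == 0xfffff, so DMA_32BIT_PFN is the highest page
 * frame reachable through a 32-bit DMA mask; iova allocations at or
 * below it avoid dual-address-cycle (DAC) addressing.
 */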

/*
 * Root entry layout (128 bits per entry):
 * 0: Present
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
	u64	val;
	u64	rsvd1;
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
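
/*
 * Worked example (illustrative): a root entry is 16 bytes, so with
 * VTD_PAGE_SIZE == 4096 the root table holds ROOT_ENTRY_NR == 256
 * entries, one per PCI bus number.
 */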

static inline bool root_present(struct root_entry *root)
{
	return (root->val & 1);
}
static inline void set_root_present(struct root_entry *root)
{
	root->val |= 1;
}
static inline void set_root_value(struct root_entry *root, unsigned long value)
{
	root->val |= value & VTD_PAGE_MASK;
}

static inline struct context_entry *
get_context_addr_from_root(struct root_entry *root)
{
	return (struct context_entry *)
		(root_present(root) ? phys_to_virt(
		root->val & VTD_PAGE_MASK) :
		NULL);
}

/*
 * Context entry, low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 */
struct context_entry {
	u64 lo;
	u64 hi;
};

static inline bool context_present(struct context_entry *context)
{
	return (context->lo & 1);
}
static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}
static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

#define CONTEXT_TT_MULTI_LEVEL 0

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}
static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo |= value & VTD_PAGE_MASK;
}
static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}
static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}
static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
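
/*
 * Worked example (illustrative): programming a context entry for
 * domain 5 with a page table rooted at 0x12340000 and address width
 * value 1 yields lo == 0x12340001 (address root | present, fault
 * processing enabled, translation type 0 == multi-level) and
 * hi == 0x501 (domain id 5 << 8 | address width 1).
 */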

/*
 * Page-table entry, bits:
 * 0: readable
 * 1: writable
 * 12-63: Host physical address
 */
struct dma_pte {
	u64 val;
};

static inline void dma_clear_pte(struct dma_pte *pte)
{
	pte->val = 0;
}
static inline void dma_set_pte_readable(struct dma_pte *pte)
{
	pte->val |= DMA_PTE_READ;
}
static inline void dma_set_pte_writable(struct dma_pte *pte)
{
	pte->val |= DMA_PTE_WRITE;
}
static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
{
	pte->val = (pte->val & ~3) | (prot & 3);
}
static inline u64 dma_pte_addr(struct dma_pte *pte)
{
	return (pte->val & VTD_PAGE_MASK);
}
static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
{
	pte->val |= (addr & VTD_PAGE_MASK);
}
static inline bool dma_pte_present(struct dma_pte *pte)
{
	return (pte->val & 3) != 0;
}

struct dmar_domain {
	int	id;			/* domain id */
	struct intel_iommu *iommu;	/* back pointer to owning iommu */

	struct list_head devices;	/* all devices' list */
	struct iova_domain iovad;	/* iova's that belong to this domain */

	struct dma_pte	*pgd;		/* virtual address */
	spinlock_t	mapping_lock;	/* page table lock */
	int		gaw;		/* max guest address width */

	/* adjusted guest address width, 0 is level 2 30-bit */
	int		agaw;
	int		flags;
};

#define DOMAIN_FLAG_MULTIPLE_DEVICES 1

/* PCI domain-device relationship */
struct device_domain_info {
	struct list_head link;	/* link to domain siblings */
	struct list_head global; /* link to global list */
	u8 bus;			/* PCI bus number */
	u8 devfn;		/* PCI devfn number */
	struct pci_dev *dev;	/* it's NULL for PCIE-to-PCI bridge */
	struct dmar_domain *domain; /* pointer to domain */
};

static void flush_unmaps_timeout(unsigned long data);

DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);

#define HIGH_WATER_MARK 250
struct deferred_flush_tables {
	int next;
	struct iova *iova[HIGH_WATER_MARK];
	struct dmar_domain *domain[HIGH_WATER_MARK];
};

static struct deferred_flush_tables *deferred_flush;

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static DEFINE_SPINLOCK(async_umap_flush_lock);
static LIST_HEAD(unmaps_to_do);

static int timer_on;
static long list_size;

static void domain_remove_dev_info(struct dmar_domain *domain);

static int __initdata dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			printk(KERN_INFO "Intel-IOMMU: disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			printk(KERN_INFO
				"Intel-IOMMU: disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			printk(KERN_INFO
				"Intel-IOMMU: Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
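
/*
 * Example (illustrative): booting with "intel_iommu=igfx_off,strict"
 * leaves the IOMMU enabled but skips mapping the graphics device and
 * flushes the IOTLB synchronously on every unmap instead of batching.
 */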

static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;

static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
{
	unsigned int flags;
	void *vaddr;

	/* trying to avoid low memory issues */
	flags = current->flags & PF_MEMALLOC;
	current->flags |= PF_MEMALLOC;
	vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
	current->flags &= (~PF_MEMALLOC | flags);
	return vaddr;
}

static inline void *alloc_pgtable_page(void)
{
	unsigned int flags;
	void *vaddr;

	/* trying to avoid low memory issues */
	flags = current->flags & PF_MEMALLOC;
	current->flags |= PF_MEMALLOC;
	vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
	current->flags &= (~PF_MEMALLOC | flags);
	return vaddr;
}

static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_domain_cache);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_devinfo_cache);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

struct iova *alloc_iova_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_iova_cache);
}

void free_iova_mem(struct iova *iova)
{
	kmem_cache_free(iommu_iova_cache, iova);
}

/* Gets the context entry for a given bus and devfn */
static struct context_entry *device_to_context_entry(struct intel_iommu *iommu,
		u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long phy_addr;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (!context) {
		context = (struct context_entry *)alloc_pgtable_page();
		if (!context) {
			spin_unlock_irqrestore(&iommu->lock, flags);
			return NULL;
		}
		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		set_root_value(root, phy_addr);
		set_root_present(root);
		__iommu_flush_cache(iommu, root, sizeof(*root));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
	return &context[devfn];
}

static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	int ret;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (!context) {
		ret = 0;
		goto out;
	}
	ret = context_present(&context[devfn]);
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}

static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (context) {
		context_clear_entry(&context[devfn]);
		__iommu_flush_cache(iommu, &context[devfn],
				    sizeof(*context));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
}

static void free_context_table(struct intel_iommu *iommu)
{
	struct root_entry *root;
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry)
		goto out;
	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		root = &iommu->root_entry[i];
		context = get_context_addr_from_root(root);
		if (context)
			free_pgtable_page(context);
	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return 30 + agaw * LEVEL_STRIDE;
}

static inline int width_to_agaw(int width)
{
	return (width - 30) / LEVEL_STRIDE;
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (12 + (level - 1) * LEVEL_STRIDE);
}

static inline int address_level_offset(u64 addr, int level)
{
	return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
}

static inline u64 level_mask(int level)
{
	return ((u64)-1 << level_to_offset_bits(level));
}

static inline u64 level_size(int level)
{
	return ((u64)1 << level_to_offset_bits(level));
}

static inline u64 align_to_level(u64 addr, int level)
{
	return ((addr + level_size(level) - 1) & level_mask(level));
}
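
/*
 * Worked example (illustrative): DEFAULT_DOMAIN_ADDRESS_WIDTH (48)
 * gives width_to_agaw(48) == 2 and agaw_to_level(2) == 4, i.e. a
 * four-level page table; a level-1 entry maps 4KiB (offset bits
 * 12-20), a level-2 entry 2MiB, level-3 1GiB and level-4 512GiB.
 */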

static struct dma_pte *addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
{
	int addr_width = agaw_to_width(domain->agaw);
	struct dma_pte *parent, *pte = NULL;
	int level = agaw_to_level(domain->agaw);
	int offset;
	unsigned long flags;

	BUG_ON(!domain->pgd);

	addr &= (((u64)1) << addr_width) - 1;
	parent = domain->pgd;

	spin_lock_irqsave(&domain->mapping_lock, flags);
	while (level > 0) {
		void *tmp_page;

		offset = address_level_offset(addr, level);
		pte = &parent[offset];
		if (level == 1)
			break;

		if (!dma_pte_present(pte)) {
			tmp_page = alloc_pgtable_page();
			if (!tmp_page) {
				spin_unlock_irqrestore(&domain->mapping_lock,
					flags);
				return NULL;
			}
			__iommu_flush_cache(domain->iommu, tmp_page,
					PAGE_SIZE);
			dma_set_pte_addr(pte, virt_to_phys(tmp_page));
			/*
			 * higher-level tables always set r/w; the last-level
			 * page table controls read/write
			 */
			dma_set_pte_readable(pte);
			dma_set_pte_writable(pte);
			__iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
		}
		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	spin_unlock_irqrestore(&domain->mapping_lock, flags);
	return pte;
}

/* return the address's pte at a specific level */
static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
		int level)
{
	struct dma_pte *parent, *pte = NULL;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = address_level_offset(addr, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte))
			break;
		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}

/* clear one page's page table */
static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
{
	struct dma_pte *pte = NULL;

	/* get last level pte */
	pte = dma_addr_level_pte(domain, addr, 1);

	if (pte) {
		dma_clear_pte(pte);
		__iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
	}
}

/* clear last level ptes; a TLB flush should follow */
static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
{
	int addr_width = agaw_to_width(domain->agaw);

	start &= (((u64)1) << addr_width) - 1;
	end &= (((u64)1) << addr_width) - 1;
	/* in case it's a partial page */
	start = PAGE_ALIGN(start);
	end &= PAGE_MASK;

	/* we don't need lock here, nobody else touches the iova range */
	while (start < end) {
		dma_pte_clear_one(domain, start);
		start += VTD_PAGE_SIZE;
	}
}

/* free page table pages. last level ptes should already be cleared */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
	u64 start, u64 end)
{
	int addr_width = agaw_to_width(domain->agaw);
	struct dma_pte *pte;
	int total = agaw_to_level(domain->agaw);
	int level;
	u64 tmp;

	start &= (((u64)1) << addr_width) - 1;
	end &= (((u64)1) << addr_width) - 1;

	/* we don't need lock here, nobody else touches the iova range */
	level = 2;
	while (level <= total) {
		tmp = align_to_level(start, level);
		if (tmp >= end || (tmp + level_size(level) > end))
			return;

		while (tmp < end) {
			pte = dma_addr_level_pte(domain, tmp, level);
			if (pte) {
				free_pgtable_page(
					phys_to_virt(dma_pte_addr(pte)));
				dma_clear_pte(pte);
				__iommu_flush_cache(domain->iommu,
						pte, sizeof(*pte));
			}
			tmp += level_size(level);
		}
		level++;
	}
	/* free pgd */
	if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}

static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page();
	if (!root)
		return -ENOMEM;

	__iommu_flush_cache(iommu, root, ROOT_SIZE);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	void *addr;
	u32 cmd, sts;
	unsigned long flag;

	addr = iommu->root_entry;

	spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));

	cmd = iommu->gcmd | DMA_GCMD_SRTP;
	writel(cmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (sts & DMA_GSTS_RTPS), sts);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

static void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!cap_rwbf(iommu->cap))
		return;
	val = iommu->gcmd | DMA_GCMD_WBF;

	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(val, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
			readl, (!(val & DMA_GSTS_WBFS)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* the return value determines whether we need a write buffer flush */
static int __iommu_flush_context(struct intel_iommu *iommu,
	u16 did, u16 source_id, u8 function_mask, u64 type,
	int non_present_entry_flush)
{
	u64 val = 0;
	unsigned long flag;

	/*
	 * In the non-present entry flush case, if hardware doesn't cache
	 * non-present entries we do nothing; if hardware does cache
	 * non-present entries, we flush entries of domain 0 (the domain id
	 * is used to cache any non-present entries)
	 */
	if (non_present_entry_flush) {
		if (!cap_caching_mode(iommu->cap))
			return 1;
		else
			did = 0;
	}

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* flushing a context entry implicitly flushes the write buffer */
	return 0;
}

/* the return value determines whether we need a write buffer flush */
static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
	u64 addr, unsigned int size_order, u64 type,
	int non_present_entry_flush)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	/*
	 * In the non-present entry flush case, if hardware doesn't cache
	 * non-present entries we do nothing; if hardware does cache
	 * non-present entries, we flush entries of domain 0 (the domain id
	 * is used to cache any non-present entries)
	 */
	if (non_present_entry_flush) {
		if (!cap_caching_mode(iommu->cap))
			return 1;
		else
			did = 0;
	}

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* a global flush doesn't need to set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* Note: always flush non-leaf currently */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
	/*
	 * This is probably only meant to be extra safe; it looks like we
	 * could ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
	/* flushing an iotlb entry implicitly flushes the write buffer */
	return 0;
}

static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
	u64 addr, unsigned int pages, int non_present_entry_flush)
{
	unsigned int mask;

	BUG_ON(addr & (~VTD_PAGE_MASK));
	BUG_ON(pages == 0);

	/* Fall back to domain-selective flush if no PSI support */
	if (!cap_pgsel_inv(iommu->cap))
		return iommu->flush.flush_iotlb(iommu, did, 0, 0,
						DMA_TLB_DSI_FLUSH,
						non_present_entry_flush);

	/*
	 * PSI requires the page count to be 2 ^ x, and the base address
	 * to be naturally aligned to the size
	 */
	mask = ilog2(__roundup_pow_of_two(pages));
	/* Fall back to domain-selective flush if the size is too big */
	if (mask > cap_max_amask_val(iommu->cap))
		return iommu->flush.flush_iotlb(iommu, did, 0, 0,
			DMA_TLB_DSI_FLUSH, non_present_entry_flush);

	return iommu->flush.flush_iotlb(iommu, did, addr, mask,
					DMA_TLB_PSI_FLUSH,
					non_present_entry_flush);
}
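
/*
 * Worked example (illustrative): unmapping 5 pages yields
 * mask = ilog2(__roundup_pow_of_two(5)) = ilog2(8) = 3, so the
 * hardware invalidates an aligned 8-page (32KiB) region that covers
 * the whole request.
 */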

static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static int iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	spin_lock_irqsave(&iommu->register_lock, flags);
	writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (sts & DMA_GSTS_TES), sts);

	iommu->gcmd |= DMA_GCMD_TE;
	spin_unlock_irqrestore(&iommu->register_lock, flags);
	return 0;
}

static int iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (!(sts & DMA_GSTS_TES)), sts);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
	return 0;
}

/* iommu interrupt handling. Most of it is MSI-like. */

static const char *fault_reason_strings[] =
{
	"Software",
	"Present bit in root entry is clear",
	"Present bit in context entry is clear",
	"Invalid context entry",
	"Access beyond MGAW",
	"PTE Write access is not set",
	"PTE Read access is not set",
	"Next page table ptr is invalid",
	"Root table address invalid",
	"Context table ptr is invalid",
	"non-zero reserved fields in RTP",
	"non-zero reserved fields in CTP",
	"non-zero reserved fields in PTE",
	"Unknown"
};
#define MAX_FAULT_REASON_IDX	(ARRAY_SIZE(fault_reason_strings) - 1)

const char *dmar_get_fault_reason(u8 fault_reason)
{
	if (fault_reason > MAX_FAULT_REASON_IDX)
		return fault_reason_strings[MAX_FAULT_REASON_IDX];
	else
		return fault_reason_strings[fault_reason];
}

void dmar_msi_unmask(unsigned int irq)
{
	struct intel_iommu *iommu = get_irq_data(irq);
	unsigned long flag;

	/* unmask it */
	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(0, iommu->reg + DMAR_FECTL_REG);
	/* Read a reg to force flush the post write */
	readl(iommu->reg + DMAR_FECTL_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

void dmar_msi_mask(unsigned int irq)
{
	unsigned long flag;
	struct intel_iommu *iommu = get_irq_data(irq);

	/* mask it */
	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
	/* Read a reg to force flush the post write */
	readl(iommu->reg + DMAR_FECTL_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

void dmar_msi_write(int irq, struct msi_msg *msg)
{
	struct intel_iommu *iommu = get_irq_data(irq);
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
	writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
	writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

void dmar_msi_read(int irq, struct msi_msg *msg)
{
	struct intel_iommu *iommu = get_irq_data(irq);
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
	msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
	msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
		u8 fault_reason, u16 source_id, unsigned long long addr)
{
	const char *reason;

	reason = dmar_get_fault_reason(fault_reason);

	printk(KERN_ERR
		"DMAR:[%s] Request device [%02x:%02x.%d] "
		"fault addr %llx \n"
		"DMAR:[fault reason %02d] %s\n",
		(type ? "DMA Read" : "DMA Write"),
		(source_id >> 8), PCI_SLOT(source_id & 0xFF),
		PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
	return 0;
}

#define PRIMARY_FAULT_REG_LEN (16)
static irqreturn_t iommu_page_fault(int irq, void *dev_id)
{
	struct intel_iommu *iommu = dev_id;
	int reg, fault_index;
	u32 fault_status;
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	fault_status = readl(iommu->reg + DMAR_FSTS_REG);

	/* TBD: ignore advanced fault log currently */
	if (!(fault_status & DMA_FSTS_PPF))
		goto clear_overflow;

	fault_index = dma_fsts_fault_record_index(fault_status);
	reg = cap_fault_reg_offset(iommu->cap);
	while (1) {
		u8 fault_reason;
		u16 source_id;
		u64 guest_addr;
		int type;
		u32 data;

		/* highest 32 bits */
		data = readl(iommu->reg + reg +
				fault_index * PRIMARY_FAULT_REG_LEN + 12);
		if (!(data & DMA_FRCD_F))
			break;

		fault_reason = dma_frcd_fault_reason(data);
		type = dma_frcd_type(data);

		data = readl(iommu->reg + reg +
				fault_index * PRIMARY_FAULT_REG_LEN + 8);
		source_id = dma_frcd_source_id(data);

		guest_addr = dmar_readq(iommu->reg + reg +
				fault_index * PRIMARY_FAULT_REG_LEN);
		guest_addr = dma_frcd_page_addr(guest_addr);
		/* clear the fault */
		writel(DMA_FRCD_F, iommu->reg + reg +
			fault_index * PRIMARY_FAULT_REG_LEN + 12);

		spin_unlock_irqrestore(&iommu->register_lock, flag);

		iommu_page_fault_do_one(iommu, type, fault_reason,
				source_id, guest_addr);

		fault_index++;
		if (fault_index > cap_num_fault_regs(iommu->cap))
			fault_index = 0;
		spin_lock_irqsave(&iommu->register_lock, flag);
	}
clear_overflow:
	/* clear primary fault overflow */
	fault_status = readl(iommu->reg + DMAR_FSTS_REG);
	if (fault_status & DMA_FSTS_PFO)
		writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
	return IRQ_HANDLED;
}

int dmar_set_interrupt(struct intel_iommu *iommu)
{
	int irq, ret;

	irq = create_irq();
	if (!irq) {
		printk(KERN_ERR "IOMMU: no free vectors\n");
		return -EINVAL;
	}

	set_irq_data(irq, iommu);
	iommu->irq = irq;

	ret = arch_setup_dmar_msi(irq);
	if (ret) {
		set_irq_data(irq, NULL);
		iommu->irq = 0;
		destroy_irq(irq);
		return 0;
	}

	/* Force the fault register to be cleared */
	iommu_page_fault(irq, iommu);

	ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
	if (ret)
		printk(KERN_ERR "IOMMU: can't request irq\n");
	return ret;
}

static int iommu_init_domains(struct intel_iommu *iommu)
{
	unsigned long ndomains;
	unsigned long nlongs;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("Number of Domains supported <%ld>\n", ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	/* TBD: there might be 64K domains,
	 * consider other allocation for future chips
	 */
	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids) {
		printk(KERN_ERR "Allocating domain id array failed\n");
		return -ENOMEM;
	}
	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
			GFP_KERNEL);
	if (!iommu->domains) {
		printk(KERN_ERR "Allocating domain array failed\n");
		kfree(iommu->domain_ids);
		return -ENOMEM;
	}

	spin_lock_init(&iommu->lock);

	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain id 0. Hence we need to pre-allocate it.
	 */
	if (cap_caching_mode(iommu->cap))
		set_bit(0, iommu->domain_ids);
	return 0;
}

static void domain_exit(struct dmar_domain *domain);

void free_dmar_iommu(struct intel_iommu *iommu)
{
	struct dmar_domain *domain;
	int i;

	i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
	for (; i < cap_ndoms(iommu->cap); ) {
		domain = iommu->domains[i];
		clear_bit(i, iommu->domain_ids);
		domain_exit(domain);
		i = find_next_bit(iommu->domain_ids,
			cap_ndoms(iommu->cap), i+1);
	}

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	if (iommu->irq) {
		set_irq_data(iommu->irq, NULL);
		/* This will mask the irq */
		free_irq(iommu->irq, iommu);
		destroy_irq(iommu->irq);
	}

	kfree(iommu->domains);
	kfree(iommu->domain_ids);

	/* free context mapping */
	free_context_table(iommu);
}

static struct dmar_domain *iommu_alloc_domain(struct intel_iommu *iommu)
{
	unsigned long num;
	unsigned long ndomains;
	struct dmar_domain *domain;
	unsigned long flags;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	ndomains = cap_ndoms(iommu->cap);

	spin_lock_irqsave(&iommu->lock, flags);
	num = find_first_zero_bit(iommu->domain_ids, ndomains);
	if (num >= ndomains) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		free_domain_mem(domain);
		printk(KERN_ERR "IOMMU: no free domain ids\n");
		return NULL;
	}

	set_bit(num, iommu->domain_ids);
	domain->id = num;
	domain->iommu = iommu;
	iommu->domains[num] = domain;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return domain;
}

static void iommu_free_domain(struct dmar_domain *domain)
{
	unsigned long flags;

	spin_lock_irqsave(&domain->iommu->lock, flags);
	clear_bit(domain->id, domain->iommu->domain_ids);
	spin_unlock_irqrestore(&domain->iommu->lock, flags);
}

static struct iova_domain reserved_iova_list;
static struct lock_class_key reserved_alloc_key;
static struct lock_class_key reserved_rbtree_key;

static void dmar_init_reserved_ranges(void)
{
	struct pci_dev *pdev = NULL;
	struct iova *iova;
	int i;
	u64 addr, size;

	init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);

	lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
		&reserved_alloc_key);
	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
		&reserved_rbtree_key);

	/* IOAPIC ranges shouldn't be accessed by DMA */
	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
		IOVA_PFN(IOAPIC_RANGE_END));
	if (!iova)
		printk(KERN_ERR "Reserve IOAPIC range failed\n");

	/* Reserve all PCI MMIO to avoid peer-to-peer access */
	for_each_pci_dev(pdev) {
		struct resource *r;

		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			r = &pdev->resource[i];
			if (!r->flags || !(r->flags & IORESOURCE_MEM))
				continue;
			addr = r->start;
			addr &= PAGE_MASK;
			size = r->end - addr;
			size = PAGE_ALIGN(size);
			iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
				IOVA_PFN(size + addr) - 1);
			if (!iova)
				printk(KERN_ERR "Reserve iova failed\n");
		}
	}
}

static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}

static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}
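
/*
 * Worked example (illustrative): a guest width of 35 bits is not on a
 * 9-bit level boundary ((35 - 12) % 9 == 5), so it is rounded up to
 * the next supported width, 39 bits; 48 is already aligned and stays 48.
 */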

static int domain_init(struct dmar_domain *domain, int guest_width)
{
	struct intel_iommu *iommu;
	int adjust_width, agaw;
	unsigned long sagaw;

	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	spin_lock_init(&domain->mapping_lock);

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	iommu = domain->iommu;
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
		if (agaw >= 5)
			return -ENODEV;
	}
	domain->agaw = agaw;
	INIT_LIST_HEAD(&domain->devices);

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page();
	if (!domain->pgd)
		return -ENOMEM;
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
	return 0;
}

static void domain_exit(struct dmar_domain *domain)
{
	u64 end;

	/* Domain 0 is reserved, so don't process it */
	if (!domain)
		return;

	domain_remove_dev_info(domain);
	/* destroy iovas */
	put_iova_domain(&domain->iovad);
	end = DOMAIN_MAX_ADDR(domain->gaw);
	end = end & (~PAGE_MASK);

	/* clear ptes */
	dma_pte_clear_range(domain, 0, end);

	/* free page tables */
	dma_pte_free_pagetable(domain, 0, end);

	iommu_free_domain(domain);
	free_domain_mem(domain);
}

static int domain_context_mapping_one(struct dmar_domain *domain,
		u8 bus, u8 devfn)
{
	struct context_entry *context;
	struct intel_iommu *iommu = domain->iommu;
	unsigned long flags;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
	BUG_ON(!domain->pgd);
	context = device_to_context_entry(iommu, bus, devfn);
	if (!context)
		return -ENOMEM;
	spin_lock_irqsave(&iommu->lock, flags);
	if (context_present(context)) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		return 0;
	}

	context_set_domain_id(context, domain->id);
	context_set_address_width(context, domain->agaw);
	context_set_address_root(context, virt_to_phys(domain->pgd));
	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
	context_set_fault_enable(context);
	context_set_present(context);
	__iommu_flush_cache(iommu, context, sizeof(*context));

	/* it's a non-present to present mapping */
	if (iommu->flush.flush_context(iommu, domain->id,
		(((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
		DMA_CCMD_DEVICE_INVL, 1))
		iommu_flush_write_buffer(iommu);
	else
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);

	spin_unlock_irqrestore(&iommu->lock, flags);
	return 0;
}

static int
domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
{
	int ret;
	struct pci_dev *tmp, *parent;

	ret = domain_context_mapping_one(domain, pdev->bus->number,
		pdev->devfn);
	if (ret)
		return ret;

	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return 0;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = domain_context_mapping_one(domain, parent->bus->number,
			parent->devfn);
		if (ret)
			return ret;
		parent = parent->bus->self;
	}
	if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
		return domain_context_mapping_one(domain,
			tmp->subordinate->number, 0);
	else /* this is a legacy PCI bridge */
		return domain_context_mapping_one(domain,
			tmp->bus->number, tmp->devfn);
}

static int domain_context_mapped(struct dmar_domain *domain,
	struct pci_dev *pdev)
{
	int ret;
	struct pci_dev *tmp, *parent;

	ret = device_context_mapped(domain->iommu,
		pdev->bus->number, pdev->devfn);
	if (!ret)
		return ret;
	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return ret;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = device_context_mapped(domain->iommu, parent->bus->number,
			parent->devfn);
		if (!ret)
			return ret;
		parent = parent->bus->self;
	}
	if (tmp->is_pcie)
		return device_context_mapped(domain->iommu,
			tmp->subordinate->number, 0);
	else
		return device_context_mapped(domain->iommu,
			tmp->bus->number, tmp->devfn);
}

static int
domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
			u64 hpa, size_t size, int prot)
{
	u64 start_pfn, end_pfn;
	struct dma_pte *pte;
	int index;
	int addr_width = agaw_to_width(domain->agaw);

	hpa &= (((u64)1) << addr_width) - 1;

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;
	iova &= PAGE_MASK;
	start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
	end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
	index = 0;
	while (start_pfn < end_pfn) {
		pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
		if (!pte)
			return -ENOMEM;
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		BUG_ON(dma_pte_addr(pte));
		dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
		dma_set_pte_prot(pte, prot);
		__iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
		start_pfn++;
		index++;
	}
	return 0;
}

static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
{
	clear_context_table(domain->iommu, bus, devfn);
	domain->iommu->flush.flush_context(domain->iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL, 0);
	domain->iommu->flush.flush_iotlb(domain->iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH, 0);
}

static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
			struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		if (info->dev)
			info->dev->dev.archdata.iommu = NULL;
		spin_unlock_irqrestore(&device_domain_lock, flags);

		detach_domain_for_dev(info->domain, info->bus, info->devfn);
		free_devinfo_mem(info);

		spin_lock_irqsave(&device_domain_lock, flags);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}

/*
 * find_domain
 * Note: struct pci_dev->dev.archdata.iommu stores the domain info
 */
static struct dmar_domain *
find_domain(struct pci_dev *pdev)
{
	struct device_domain_info *info;

	/* No lock here, assumes no domain exit in normal case */
	info = pdev->dev.archdata.iommu;
	if (info)
		return info->domain;
	return NULL;
}

/* the returned domain is initialized */
static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
{
	struct dmar_domain *domain, *found = NULL;
	struct intel_iommu *iommu;
	struct dmar_drhd_unit *drhd;
	struct device_domain_info *info, *tmp;
	struct pci_dev *dev_tmp;
	unsigned long flags;
	int bus = 0, devfn = 0;

	domain = find_domain(pdev);
	if (domain)
		return domain;

	dev_tmp = pci_find_upstream_pcie_bridge(pdev);
	if (dev_tmp) {
		if (dev_tmp->is_pcie) {
			bus = dev_tmp->subordinate->number;
			devfn = 0;
		} else {
			bus = dev_tmp->bus->number;
			devfn = dev_tmp->devfn;
		}
		spin_lock_irqsave(&device_domain_lock, flags);
		list_for_each_entry(info, &device_domain_list, global) {
			if (info->bus == bus && info->devfn == devfn) {
				found = info->domain;
				break;
			}
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
		/* the pcie-pci bridge already has a domain, use it */
		if (found) {
			domain = found;
			goto found_domain;
		}
	}

	/* Allocate a new domain for the device */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd) {
		printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
			pci_name(pdev));
		return NULL;
	}
	iommu = drhd->iommu;

	domain = iommu_alloc_domain(iommu);
	if (!domain)
		goto error;

	if (domain_init(domain, gaw)) {
		domain_exit(domain);
		goto error;
	}

	/* register pcie-to-pci device */
	if (dev_tmp) {
		info = alloc_devinfo_mem();
		if (!info) {
			domain_exit(domain);
			goto error;
		}
		info->bus = bus;
		info->devfn = devfn;
		info->dev = NULL;
		info->domain = domain;
		/* This domain is shared by devices under p2p bridge */
		domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;

		/* the pcie-to-pci bridge already has a domain, use it */
		spin_lock_irqsave(&device_domain_lock, flags);
		found = NULL;
		list_for_each_entry(tmp, &device_domain_list, global) {
			if (tmp->bus == bus && tmp->devfn == devfn) {
				found = tmp->domain;
				break;
			}
		}
		if (found) {
			free_devinfo_mem(info);
			domain_exit(domain);
			domain = found;
		} else {
			list_add(&info->link, &domain->devices);
			list_add(&info->global, &device_domain_list);
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
	}

found_domain:
	info = alloc_devinfo_mem();
	if (!info)
		goto error;
	info->bus = pdev->bus->number;
	info->devfn = pdev->devfn;
	info->dev = pdev;
	info->domain = domain;
	spin_lock_irqsave(&device_domain_lock, flags);
	/* somebody is fast */
	found = find_domain(pdev);
	if (found != NULL) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		if (found != domain) {
			domain_exit(domain);
			domain = found;
		}
		free_devinfo_mem(info);
		return domain;
	}
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	pdev->dev.archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);
	return domain;
error:
	/* recheck it here, maybe others set it */
	return find_domain(pdev);
}

static int iommu_prepare_identity_map(struct pci_dev *pdev,
	unsigned long long start, unsigned long long end)
{
	struct dmar_domain *domain;
	unsigned long size;
	unsigned long long base;
	int ret;

	printk(KERN_INFO
		"IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
		pci_name(pdev), start, end);
	/* page table init */
	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain)
		return -ENOMEM;

	/* The address might not be aligned */
	base = start & PAGE_MASK;
	size = end - base;
	size = PAGE_ALIGN(size);
	if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
			IOVA_PFN(base + size) - 1)) {
		printk(KERN_ERR "IOMMU: reserve iova failed\n");
		ret = -ENOMEM;
		goto error;
	}

	pr_debug("Mapping reserved region %lx@%llx for %s\n",
		size, base, pci_name(pdev));
	/*
	 * The RMRR range might overlap with the physical memory range,
	 * so clear it first
	 */
	dma_pte_clear_range(domain, base, base + size);

	ret = domain_page_mapping(domain, base, base, size,
		DMA_PTE_READ|DMA_PTE_WRITE);
	if (ret)
		goto error;

	/* context entry init */
	ret = domain_context_mapping(domain, pdev);
	if (!ret)
		return 0;
error:
	domain_exit(domain);
	return ret;
}

static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
	struct pci_dev *pdev)
{
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return 0;
	return iommu_prepare_identity_map(pdev, rmrr->base_address,
		rmrr->end_address + 1);
}

#ifdef CONFIG_DMAR_GFX_WA
struct iommu_prepare_data {
	struct pci_dev *pdev;
	int ret;
};

static int __init iommu_prepare_work_fn(unsigned long start_pfn,
					unsigned long end_pfn, void *datax)
{
	struct iommu_prepare_data *data;

	data = (struct iommu_prepare_data *)datax;
	data->ret = iommu_prepare_identity_map(data->pdev,
				start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
	return data->ret;
}

static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
{
	int nid;
	struct iommu_prepare_data data;

	data.pdev = pdev;
	data.ret = 0;

	for_each_online_node(nid) {
		work_with_active_regions(nid, iommu_prepare_work_fn, &data);
		if (data.ret)
			return data.ret;
	}
	return data.ret;
}

static void __init iommu_prepare_gfx_mapping(void)
{
	struct pci_dev *pdev = NULL;
	int ret;

	for_each_pci_dev(pdev) {
		if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
				!IS_GFX_DEVICE(pdev))
			continue;
		printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
			pci_name(pdev));
		ret = iommu_prepare_with_active_regions(pdev);
		if (ret)
			printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
	}
}
#else /* !CONFIG_DMAR_GFX_WA */
static inline void iommu_prepare_gfx_mapping(void)
{
	return;
}
#endif

#ifdef CONFIG_DMAR_FLOPPY_WA
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);

	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
			"floppy might not work\n");
}
#else
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_DMAR_FLOPPY_WA */

static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct dmar_rmrr_unit *rmrr;
	struct pci_dev *pdev;
	struct intel_iommu *iommu;
	int i, ret, unit = 0;

	/*
	 * for each drhd
	 *    allocate root
	 *    initialize and program root entry to not present
	 * endfor
	 */
	for_each_drhd_unit(drhd) {
		g_num_of_iommus++;
		/*
		 * lock not needed as this is only incremented in the
		 * single-threaded kernel __init code path; all other
		 * accesses are read-only
		 */
	}

	deferred_flush = kzalloc(g_num_of_iommus *
		sizeof(struct deferred_flush_tables), GFP_KERNEL);
	if (!deferred_flush) {
		ret = -ENOMEM;
		goto error;
	}

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;

		iommu = drhd->iommu;

		ret = iommu_init_domains(iommu);
		if (ret)
			goto error;

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMUs. Need to split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret) {
			printk(KERN_ERR "IOMMU: allocate root entry failed\n");
			goto error;
		}
	}

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;

		iommu = drhd->iommu;
		if (dmar_enable_qi(iommu)) {
			/*
			 * Queued Invalidate not enabled, use Register Based
			 * Invalidate
			 */
			iommu->flush.flush_context = __iommu_flush_context;
			iommu->flush.flush_iotlb = __iommu_flush_iotlb;
			printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
			       "invalidation\n",
			       (unsigned long long)drhd->reg_base_addr);
		} else {
			iommu->flush.flush_context = qi_flush_context;
			iommu->flush.flush_iotlb = qi_flush_iotlb;
			printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
			       "invalidation\n",
			       (unsigned long long)drhd->reg_base_addr);
		}
	}

	/*
	 * For each rmrr
	 *   for each dev attached to rmrr
	 *   do
	 *     locate drhd for dev, alloc domain for dev
	 *     allocate free domain
	 *     allocate page table entries for rmrr
	 *     if context not allocated for bus
	 *           allocate and init context
	 *           set present in root table for this bus
	 *     init context with domain, translation etc
	 *   endfor
	 * endfor
	 */
	for_each_rmrr_units(rmrr) {
		for (i = 0; i < rmrr->devices_cnt; i++) {
			pdev = rmrr->devices[i];
			/* some BIOSes list non-existent devices in the DMAR table */
			if (!pdev)
				continue;
			ret = iommu_prepare_rmrr_dev(rmrr, pdev);
			if (ret)
				printk(KERN_ERR
				       "IOMMU: mapping reserved region failed\n");
		}
	}

	iommu_prepare_gfx_mapping();

	iommu_prepare_isa();

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;
		sprintf(iommu->name, "dmar%d", unit++);

		iommu_flush_write_buffer(iommu);

		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto error;

		iommu_set_root_entry(iommu);

		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
					   0);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
					 0);
		iommu_disable_protect_mem_regions(iommu);

		ret = iommu_enable_translation(iommu);
		if (ret)
			goto error;
	}

	return 0;
error:
	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;
		free_iommu(iommu);
	}
	return ret;
}

static inline u64 aligned_size(u64 host_addr, size_t size)
{
	u64 addr;
	addr = (host_addr & (~PAGE_MASK)) + size;
	return PAGE_ALIGN(addr);
}
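
/*
 * Worked example (illustrative): host_addr == 0x1003 and size == 0x100
 * give (0x3 + 0x100) == 0x103, which PAGE_ALIGN rounds up to 0x1000:
 * one full page covers the partially-aligned buffer.
 */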

static struct iova *
iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
{
	struct iova *piova;

	/* Make sure it's in range */
	end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
	if (!size || (IOVA_START_ADDR + size > end))
		return NULL;
	piova = alloc_iova(&domain->iovad,
			size >> PAGE_SHIFT, IOVA_PFN(end), 1);
	return piova;
}

static struct iova *
__intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
		   size_t size, u64 dma_mask)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct iova *iova = NULL;

	if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
		iova = iommu_alloc_iova(domain, size, dma_mask);
	else {
		/*
		 * First try to allocate an io virtual address in
		 * DMA_32BIT_MASK and if that fails then try allocating
		 * from the higher range
		 */
		iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
		if (!iova)
			iova = iommu_alloc_iova(domain, size, dma_mask);
	}

	if (!iova) {
		printk(KERN_ERR "Allocating iova for %s failed", pci_name(pdev));
		return NULL;
	}

	return iova;
}

static struct dmar_domain *
get_valid_domain_for_dev(struct pci_dev *pdev)
{
	struct dmar_domain *domain;
	int ret;

	domain = get_domain_for_dev(pdev,
			DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain) {
		printk(KERN_ERR
			"Allocating domain for %s failed", pci_name(pdev));
		return NULL;
	}

	/* make sure the context mapping is ok */
	if (unlikely(!domain_context_mapped(domain, pdev))) {
		ret = domain_context_mapping(domain, pdev);
		if (ret) {
			printk(KERN_ERR
				"Domain context map for %s failed",
				pci_name(pdev));
			return NULL;
		}
	}

	return domain;
}

static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
				     size_t size, int dir, u64 dma_mask)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	phys_addr_t start_paddr;
	struct iova *iova;
	int prot = 0;
	int ret;

	BUG_ON(dir == DMA_NONE);
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return paddr;

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	size = aligned_size((u64)paddr, size);

	iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
	if (!iova)
		goto error;

	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(domain->iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * paddr - (paddr + size) might be a partial page, so we should map
	 * the whole page.  Note: if two parts of one page are separately
	 * mapped, we might have two guest_addrs mapping to the same host
	 * paddr, but this is not a big problem
	 */
	ret = domain_page_mapping(domain, start_paddr,
		((u64)paddr) & PAGE_MASK, size, prot);
	if (ret)
		goto error;

	/* it's a non-present to present mapping */
	ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
			start_paddr, size >> VTD_PAGE_SHIFT, 1);
	if (ret)
		iommu_flush_write_buffer(domain->iommu);

	return start_paddr + ((u64)paddr & (~PAGE_MASK));

error:
	if (iova)
		__free_iova(&domain->iovad, iova);
	printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
		pci_name(pdev), size, (unsigned long long)paddr, dir);
	return 0;
}

dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
			    size_t size, int dir)
{
	return __intel_map_single(hwdev, paddr, size, dir,
				  to_pci_dev(hwdev)->dma_mask);
}

static void flush_unmaps(void)
{
	int i, j;

	timer_on = 0;

	/* just flush them all */
	for (i = 0; i < g_num_of_iommus; i++) {
		if (deferred_flush[i].next) {
			struct intel_iommu *iommu =
				deferred_flush[i].domain[0]->iommu;

			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
						 DMA_TLB_GLOBAL_FLUSH, 0);
			for (j = 0; j < deferred_flush[i].next; j++) {
				__free_iova(&deferred_flush[i].domain[j]->iovad,
						deferred_flush[i].iova[j]);
			}
			deferred_flush[i].next = 0;
		}
	}

	list_size = 0;
}

static void flush_unmaps_timeout(unsigned long data)
{
	unsigned long flags;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	flush_unmaps();
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}

static void add_unmap(struct dmar_domain *dom, struct iova *iova)
{
	unsigned long flags;
	int next, iommu_id;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	if (list_size == HIGH_WATER_MARK)
		flush_unmaps();

	iommu_id = dom->iommu->seq_id;

	next = deferred_flush[iommu_id].next;
	deferred_flush[iommu_id].domain[next] = dom;
	deferred_flush[iommu_id].iova[next] = iova;
	deferred_flush[iommu_id].next++;

	if (!timer_on) {
		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
		timer_on = 1;
	}
	list_size++;
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
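
/*
 * Design note (illustrative summary): unmaps are batched per IOMMU,
 * and the IOTLB is flushed either when a list reaches HIGH_WATER_MARK
 * entries or when the 10ms unmap_timer fires. This trades a short
 * window of stale IOTLB entries for far fewer global flushes than a
 * flush-per-unmap policy.
 */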

void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
			int dir)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct dmar_domain *domain;
	unsigned long start_addr;
	struct iova *iova;

	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return;
	domain = find_domain(pdev);
	BUG_ON(!domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
	if (!iova)
		return;

	start_addr = iova->pfn_lo << PAGE_SHIFT;
	size = aligned_size((u64)dev_addr, size);

	pr_debug("Device %s unmapping: %lx@%llx\n",
		pci_name(pdev), size, (unsigned long long)start_addr);

	/* clear the whole page */
	dma_pte_clear_range(domain, start_addr, start_addr + size);
	/* free page tables */
	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
	if (intel_iommu_strict) {
		if (iommu_flush_iotlb_psi(domain->iommu,
			domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
			iommu_flush_write_buffer(domain->iommu);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
		add_unmap(domain, iova);
		/*
		 * queue up the release of the unmap to save the 1/6th of
		 * the cpu used up by the iotlb flush operation...
		 */
	}
}

void *intel_alloc_coherent(struct device *hwdev, size_t size,
			   dma_addr_t *dma_handle, gfp_t flags)
{
	void *vaddr;
	int order;

	size = PAGE_ALIGN(size);
	order = get_order(size);
	flags &= ~(GFP_DMA | GFP_DMA32);

	vaddr = (void *)__get_free_pages(flags, order);
	if (!vaddr)
		return NULL;
	memset(vaddr, 0, size);

	*dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
					 DMA_BIDIRECTIONAL,
					 hwdev->coherent_dma_mask);
	if (*dma_handle)
		return vaddr;
	free_pages((unsigned long)vaddr, order);
	return NULL;
}

void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
			 dma_addr_t dma_handle)
{
	int order;

	size = PAGE_ALIGN(size);
	order = get_order(size);

	intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
	free_pages((unsigned long)vaddr, order);
}

#define SG_ENT_VIRT_ADDRESS(sg)	(sg_virt((sg)))

void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
		    int nelems, int dir)
{
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	unsigned long start_addr;
	struct iova *iova;
	size_t size = 0;
	void *addr;
	struct scatterlist *sg;

	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return;

	domain = find_domain(pdev);

	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
	if (!iova)
		return;
	for_each_sg(sglist, sg, nelems, i) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		size += aligned_size((u64)addr, sg->length);
	}

	start_addr = iova->pfn_lo << PAGE_SHIFT;

	/* clear the whole page */
	dma_pte_clear_range(domain, start_addr, start_addr + size);
	/* free page tables */
	dma_pte_free_pagetable(domain, start_addr, start_addr + size);

	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
			size >> VTD_PAGE_SHIFT, 0))
		iommu_flush_write_buffer(domain->iommu);

	/* free iova */
	__free_iova(&domain->iovad, iova);
}

static int intel_nontranslate_map_sg(struct device *hddev,
	struct scatterlist *sglist, int nelems, int dir)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sglist, sg, nelems, i) {
		BUG_ON(!sg_page(sg));
		sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
		sg->dma_length = sg->length;
	}
	return nelems;
}

int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
		 int dir)
{
	void *addr;
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	size_t size = 0;
	int prot = 0;
	size_t offset = 0;
	struct iova *iova = NULL;
	int ret;
	struct scatterlist *sg;
	unsigned long start_addr;

	BUG_ON(dir == DMA_NONE);
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	for_each_sg(sglist, sg, nelems, i) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		addr = (void *)virt_to_phys(addr);
		size += aligned_size((u64)addr, sg->length);
	}

	iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
	if (!iova) {
		sglist->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(domain->iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_addr = iova->pfn_lo << PAGE_SHIFT;
	offset = 0;
	for_each_sg(sglist, sg, nelems, i) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		addr = (void *)virt_to_phys(addr);
		size = aligned_size((u64)addr, sg->length);
		ret = domain_page_mapping(domain, start_addr + offset,
			((u64)addr) & PAGE_MASK,
			size, prot);
		if (ret) {
			/* clear the pages */
			dma_pte_clear_range(domain, start_addr,
				  start_addr + offset);
			/* free page tables */
			dma_pte_free_pagetable(domain, start_addr,
				  start_addr + offset);
			/* free iova */
			__free_iova(&domain->iovad, iova);
			return 0;
		}
		sg->dma_address = start_addr + offset +
				((u64)addr & (~PAGE_MASK));
		sg->dma_length = sg->length;
		offset += size;
	}

	/* it's a non-present to present mapping */
	if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
			start_addr, offset >> VTD_PAGE_SHIFT, 1))
		iommu_flush_write_buffer(domain->iommu);
	return nelems;
}

static struct dma_mapping_ops intel_dma_ops = {
	.alloc_coherent = intel_alloc_coherent,
	.free_coherent = intel_free_coherent,
	.map_single = intel_map_single,
	.unmap_single = intel_unmap_single,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
};

static inline int iommu_domain_cache_init(void)
{
	iommu_domain_cache = kmem_cache_create("iommu_domain",
			sizeof(struct dmar_domain), 0,
			SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_domain_cache) {
		printk(KERN_ERR "Couldn't create iommu_domain cache\n");
		return -ENOMEM;
	}
	return 0;
}

static inline int iommu_devinfo_cache_init(void)
{
	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
			sizeof(struct device_domain_info), 0,
			SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_devinfo_cache) {
		printk(KERN_ERR "Couldn't create devinfo cache\n");
		return -ENOMEM;
	}
	return 0;
}

static inline int iommu_iova_cache_init(void)
{
	iommu_iova_cache = kmem_cache_create("iommu_iova",
			sizeof(struct iova), 0,
			SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_iova_cache) {
		printk(KERN_ERR "Couldn't create iova cache\n");
		return -ENOMEM;
	}
	return 0;
}

static int __init iommu_init_mempool(void)
{
	int ret;

	ret = iommu_iova_cache_init();
	if (ret)
		return ret;

	ret = iommu_domain_cache_init();
	if (ret)
		goto domain_error;

	ret = iommu_devinfo_cache_init();
	if (!ret)
		return ret;

	kmem_cache_destroy(iommu_domain_cache);
domain_error:
	kmem_cache_destroy(iommu_iova_cache);

	return -ENOMEM;
}

static void __init iommu_exit_mempool(void)
{
	kmem_cache_destroy(iommu_devinfo_cache);
	kmem_cache_destroy(iommu_domain_cache);
	kmem_cache_destroy(iommu_iova_cache);
}

static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			int i;
			for (i = 0; i < drhd->devices_cnt; i++)
				if (drhd->devices[i] != NULL)
					break;
			/* ignore DMAR unit if no pci devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	if (dmar_map_gfx)
		return;

	for_each_drhd_unit(drhd) {
		int i;
		if (drhd->ignored || drhd->include_all)
			continue;

		for (i = 0; i < drhd->devices_cnt; i++)
			if (drhd->devices[i] &&
				!IS_GFX_DEVICE(drhd->devices[i]))
				break;

		if (i < drhd->devices_cnt)
			continue;

		/* bypass IOMMU if it is just for gfx devices */
		drhd->ignored = 1;
		for (i = 0; i < drhd->devices_cnt; i++) {
			if (!drhd->devices[i])
				continue;
			drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
		}
	}
}

int __init intel_iommu_init(void)
{
	int ret = 0;

	if (dmar_table_init())
		return -ENODEV;

	if (dmar_dev_scope_init())
		return -ENODEV;

	/*
	 * Check the need for DMA-remapping initialization now.
	 * The above initialization will also be used by Interrupt-remapping.
	 */
	if (no_iommu || swiotlb || dmar_disabled)
		return -ENODEV;

	iommu_init_mempool();
	dmar_init_reserved_ranges();

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		printk(KERN_ERR "IOMMU: dmar init failed\n");
		put_iova_domain(&reserved_iova_list);
		iommu_exit_mempool();
		return ret;
	}
	printk(KERN_INFO
		"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");

	init_timer(&unmap_timer);
	force_iommu = 1;
	dma_ops = &intel_dma_ops;
	return 0;
}

void intel_iommu_domain_exit(struct dmar_domain *domain)
{
	u64 end;

	/* Domain 0 is reserved, so don't process it */
	if (!domain)
		return;

	end = DOMAIN_MAX_ADDR(domain->gaw);
	end = end & (~VTD_PAGE_MASK);

	/* clear ptes */
	dma_pte_clear_range(domain, 0, end);

	/* free page tables */
	dma_pte_free_pagetable(domain, 0, end);

	iommu_free_domain(domain);
	free_domain_mem(domain);
}
EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);

struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	struct dmar_domain *domain;
	struct intel_iommu *iommu;

	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd) {
		printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
		return NULL;
	}

	iommu = drhd->iommu;
	if (!iommu) {
		printk(KERN_ERR
			"intel_iommu_domain_alloc: iommu == NULL\n");
		return NULL;
	}
	domain = iommu_alloc_domain(iommu);
	if (!domain) {
		printk(KERN_ERR
			"intel_iommu_domain_alloc: domain == NULL\n");
		return NULL;
	}
	if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		printk(KERN_ERR
			"intel_iommu_domain_alloc: domain_init() failed\n");
		intel_iommu_domain_exit(domain);
		return NULL;
	}
	return domain;
}
EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);

int intel_iommu_context_mapping(
	struct dmar_domain *domain, struct pci_dev *pdev)
{
	int rc;
	rc = domain_context_mapping(domain, pdev);
	return rc;
}
EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);

int intel_iommu_page_mapping(
	struct dmar_domain *domain, dma_addr_t iova,
	u64 hpa, size_t size, int prot)
{
	int rc;
	rc = domain_page_mapping(domain, iova, hpa, size, prot);
	return rc;
}
EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);

void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
{
	detach_domain_for_dev(domain, bus, devfn);
}
EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);

struct dmar_domain *
intel_iommu_find_domain(struct pci_dev *pdev)
{
	return find_domain(pdev);
}
EXPORT_SYMBOL_GPL(intel_iommu_find_domain);

int intel_iommu_found(void)
{
	return g_num_of_iommus;
}
EXPORT_SYMBOL_GPL(intel_iommu_found);

u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
{
	struct dma_pte *pte;
	u64 pfn;

	pfn = 0;
	pte = addr_to_dma_pte(domain, iova);

	if (pte)
		pfn = dma_pte_addr(pte);

	return pfn >> VTD_PAGE_SHIFT;
}
EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);