2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
42 #define ROOT_SIZE VTD_PAGE_SIZE
43 #define CONTEXT_SIZE VTD_PAGE_SIZE
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48 #define IOAPIC_RANGE_START (0xfee00000)
49 #define IOAPIC_RANGE_END (0xfeefffff)
50 #define IOVA_START_ADDR (0x1000)
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
57 #define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK)
58 #define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK)
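/*
 * For example, with 4KiB pages (PAGE_SHIFT == 12) IOVA_PFN(DMA_32BIT_MASK)
 * works out to 0xfffff, the highest page frame reachable through a 32-bit
 * DMA mask; 32-bit-limited allocations are capped at that pfn.
 */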
63 * 12-63: Context Ptr (12 - (haw-1))
70 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
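/*
 * The root table is a single 4KiB page; with 16-byte root entries that
 * gives ROOT_ENTRY_NR == 256 entries, one per PCI bus number.  Each
 * present root entry points at a 4KiB context table holding one context
 * entry per devfn on that bus (see device_to_context_entry() below).
 */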
71 static inline bool root_present(struct root_entry *root)
73 return (root->val & 1);
75 static inline void set_root_present(struct root_entry *root)
79 static inline void set_root_value(struct root_entry *root, unsigned long value)
81 root->val |= value & VTD_PAGE_MASK;
84 static inline struct context_entry *
85 get_context_addr_from_root(struct root_entry *root)
87 return (struct context_entry *)
88 (root_present(root)?phys_to_virt(
89 root->val & VTD_PAGE_MASK) :
96 * 1: fault processing disable
97 * 2-3: translation type
98 * 12-63: address space root
104 struct context_entry {
108 #define context_present(c) ((c).lo & 1)
109 #define context_fault_disable(c) (((c).lo >> 1) & 1)
110 #define context_translation_type(c) (((c).lo >> 2) & 3)
111 #define context_address_root(c) ((c).lo & VTD_PAGE_MASK)
112 #define context_address_width(c) ((c).hi & 7)
113 #define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1))
115 #define context_set_present(c) do {(c).lo |= 1;} while (0)
116 #define context_set_fault_enable(c) \
117 do {(c).lo &= (((u64)-1) << 2) | 1;} while (0)
118 #define context_set_translation_type(c, val) \
120 (c).lo &= (((u64)-1) << 4) | 3; \
121 (c).lo |= ((val) & 3) << 2; \
123 #define CONTEXT_TT_MULTI_LEVEL 0
124 #define context_set_address_root(c, val) \
125 do {(c).lo |= (val) & VTD_PAGE_MASK; } while (0)
126 #define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0)
127 #define context_set_domain_id(c, val) \
128 do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0)
129 #define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while (0)
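/*
 * These accessors are used together when a device is attached to a
 * domain: domain_context_mapping_one() below fills in the domain id,
 * address width (AGAW), page-table root and translation type, marks the
 * entry present, and then flushes it so the hardware sees a consistent
 * entry.
 */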
137  * 12-63: Host physical address
142 #define dma_clear_pte(p) do {(p).val = 0;} while (0)
144 #define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while (0)
145 #define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0)
146 #define dma_set_pte_prot(p, prot) \
147 do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
148 #define dma_pte_addr(p) ((p).val & VTD_PAGE_MASK)
149 #define dma_set_pte_addr(p, addr) do {\
150 (p).val |= ((addr) & VTD_PAGE_MASK); } while (0)
151 #define dma_pte_present(p) (((p).val & 3) != 0)
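/*
 * A dma_pte follows the same pattern: bit 0 is the read permission,
 * bit 1 the write permission, and bits 12-63 hold the page-aligned host
 * physical address of either the next-level table or the final page.
 * dma_pte_present() therefore just checks whether at least one of the
 * R/W bits is set.
 */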
153 static void flush_unmaps_timeout(unsigned long data);
155 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
157 #define HIGH_WATER_MARK 250
158 struct deferred_flush_tables {
160 struct iova *iova[HIGH_WATER_MARK];
161 struct dmar_domain *domain[HIGH_WATER_MARK];
164 static struct deferred_flush_tables *deferred_flush;
166 /* bitmap for indexing intel_iommus */
167 static int g_num_of_iommus;
169 static DEFINE_SPINLOCK(async_umap_flush_lock);
170 static LIST_HEAD(unmaps_to_do);
173 static long list_size;
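/*
 * Lazy IOTLB flushing: instead of flushing on every unmap,
 * intel_unmap_single() queues the freed iova in the per-iommu
 * deferred_flush tables; a short timer (or reaching HIGH_WATER_MARK
 * pending entries) then triggers one global flush for the whole batch in
 * flush_unmaps().  Booting with intel_iommu=strict disables the batching.
 */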
175 static void domain_remove_dev_info(struct dmar_domain *domain);
178 static int __initdata dmar_map_gfx = 1;
179 static int dmar_forcedac;
180 static int intel_iommu_strict;
182 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
183 static DEFINE_SPINLOCK(device_domain_lock);
184 static LIST_HEAD(device_domain_list);
186 static int __init intel_iommu_setup(char *str)
191 if (!strncmp(str, "off", 3)) {
193 printk(KERN_INFO"Intel-IOMMU: disabled\n");
194 } else if (!strncmp(str, "igfx_off", 8)) {
197 "Intel-IOMMU: disable GFX device mapping\n");
198 } else if (!strncmp(str, "forcedac", 8)) {
200 "Intel-IOMMU: Forcing DAC for PCI devices\n");
202 } else if (!strncmp(str, "strict", 6)) {
204 "Intel-IOMMU: disable batched IOTLB flush\n");
205 intel_iommu_strict = 1;
208 str += strcspn(str, ",");
214 __setup("intel_iommu=", intel_iommu_setup);
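/*
 * Example boot-time usage (options can be combined, comma separated):
 *
 *	intel_iommu=off
 *	intel_iommu=igfx_off,forcedac,strict
 */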
216 static struct kmem_cache *iommu_domain_cache;
217 static struct kmem_cache *iommu_devinfo_cache;
218 static struct kmem_cache *iommu_iova_cache;
220 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
225 /* trying to avoid low memory issues */
226 flags = current->flags & PF_MEMALLOC;
227 current->flags |= PF_MEMALLOC;
228 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
229 current->flags &= (~PF_MEMALLOC | flags);
234 static inline void *alloc_pgtable_page(void)
239 /* trying to avoid low memory issues */
240 flags = current->flags & PF_MEMALLOC;
241 current->flags |= PF_MEMALLOC;
242 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
243 current->flags &= (~PF_MEMALLOC | flags);
247 static inline void free_pgtable_page(void *vaddr)
249 free_page((unsigned long)vaddr);
252 static inline void *alloc_domain_mem(void)
254 return iommu_kmem_cache_alloc(iommu_domain_cache);
257 static void free_domain_mem(void *vaddr)
259 kmem_cache_free(iommu_domain_cache, vaddr);
262 static inline void * alloc_devinfo_mem(void)
264 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
267 static inline void free_devinfo_mem(void *vaddr)
269 kmem_cache_free(iommu_devinfo_cache, vaddr);
272 struct iova *alloc_iova_mem(void)
274 return iommu_kmem_cache_alloc(iommu_iova_cache);
277 void free_iova_mem(struct iova *iova)
279 kmem_cache_free(iommu_iova_cache, iova);
282 /* Gets context entry for a given bus and devfn */
283 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
286 struct root_entry *root;
287 struct context_entry *context;
288 unsigned long phy_addr;
291 spin_lock_irqsave(&iommu->lock, flags);
292 root = &iommu->root_entry[bus];
293 context = get_context_addr_from_root(root);
295 context = (struct context_entry *)alloc_pgtable_page();
297 spin_unlock_irqrestore(&iommu->lock, flags);
300 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
301 phy_addr = virt_to_phys((void *)context);
302 set_root_value(root, phy_addr);
303 set_root_present(root);
304 __iommu_flush_cache(iommu, root, sizeof(*root));
306 spin_unlock_irqrestore(&iommu->lock, flags);
307 return &context[devfn];
310 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
312 struct root_entry *root;
313 struct context_entry *context;
317 spin_lock_irqsave(&iommu->lock, flags);
318 root = &iommu->root_entry[bus];
319 context = get_context_addr_from_root(root);
324 ret = context_present(context[devfn]);
326 spin_unlock_irqrestore(&iommu->lock, flags);
330 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
332 struct root_entry *root;
333 struct context_entry *context;
336 spin_lock_irqsave(&iommu->lock, flags);
337 root = &iommu->root_entry[bus];
338 context = get_context_addr_from_root(root);
340 context_clear_entry(context[devfn]);
341 __iommu_flush_cache(iommu, &context[devfn], sizeof(*context));
344 spin_unlock_irqrestore(&iommu->lock, flags);
347 static void free_context_table(struct intel_iommu *iommu)
349 struct root_entry *root;
352 struct context_entry *context;
354 spin_lock_irqsave(&iommu->lock, flags);
355 if (!iommu->root_entry) {
358 for (i = 0; i < ROOT_ENTRY_NR; i++) {
359 root = &iommu->root_entry[i];
360 context = get_context_addr_from_root(root);
362 free_pgtable_page(context);
364 free_pgtable_page(iommu->root_entry);
365 iommu->root_entry = NULL;
367 spin_unlock_irqrestore(&iommu->lock, flags);
370 /* page table handling */
371 #define LEVEL_STRIDE (9)
372 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
374 static inline int agaw_to_level(int agaw)
379 static inline int agaw_to_width(int agaw)
381 return 30 + agaw * LEVEL_STRIDE;
385 static inline int width_to_agaw(int width)
387 return (width - 30) / LEVEL_STRIDE;
390 static inline unsigned int level_to_offset_bits(int level)
392 return (12 + (level - 1) * LEVEL_STRIDE);
395 static inline int address_level_offset(u64 addr, int level)
397 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
400 static inline u64 level_mask(int level)
402 return ((u64)-1 << level_to_offset_bits(level));
405 static inline u64 level_size(int level)
407 return ((u64)1 << level_to_offset_bits(level));
410 static inline u64 align_to_level(u64 addr, int level)
412 return ((addr + level_size(level) - 1) & level_mask(level));
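/*
 * Worked example of the AGAW arithmetic above: a 48-bit adjusted guest
 * address width maps to agaw 2 (width_to_agaw(48) == 2), and each
 * page-table level decodes LEVEL_STRIDE == 9 address bits, so level 1
 * covers bits 12-20, level 2 bits 21-29, level 3 bits 30-38 and level 4
 * bits 39-47.
 */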
415 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
417 int addr_width = agaw_to_width(domain->agaw);
418 struct dma_pte *parent, *pte = NULL;
419 int level = agaw_to_level(domain->agaw);
423 BUG_ON(!domain->pgd);
425 addr &= (((u64)1) << addr_width) - 1;
426 parent = domain->pgd;
428 spin_lock_irqsave(&domain->mapping_lock, flags);
432 offset = address_level_offset(addr, level);
433 pte = &parent[offset];
437 if (!dma_pte_present(*pte)) {
438 tmp_page = alloc_pgtable_page();
441 spin_unlock_irqrestore(&domain->mapping_lock,
445 __iommu_flush_cache(domain->iommu, tmp_page,
447 dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
449 * high level tables always set r/w; the last level page
450 * table controls read/write
452 dma_set_pte_readable(*pte);
453 dma_set_pte_writable(*pte);
454 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
456 parent = phys_to_virt(dma_pte_addr(*pte));
460 spin_unlock_irqrestore(&domain->mapping_lock, flags);
464 /* return address's pte at specific level */
465 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
468 struct dma_pte *parent, *pte = NULL;
469 int total = agaw_to_level(domain->agaw);
472 parent = domain->pgd;
473 while (level <= total) {
474 offset = address_level_offset(addr, total);
475 pte = &parent[offset];
479 if (!dma_pte_present(*pte))
481 parent = phys_to_virt(dma_pte_addr(*pte));
487 /* clear one page's page table */
488 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
490 struct dma_pte *pte = NULL;
492 /* get last level pte */
493 pte = dma_addr_level_pte(domain, addr, 1);
497 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
501 /* clear last level pte, a tlb flush should be followed */
502 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
504 int addr_width = agaw_to_width(domain->agaw);
506 start &= (((u64)1) << addr_width) - 1;
507 end &= (((u64)1) << addr_width) - 1;
508 /* in case it's a partial page */
509 start = PAGE_ALIGN(start);
512 /* we don't need lock here, nobody else touches the iova range */
513 while (start < end) {
514 dma_pte_clear_one(domain, start);
515 start += VTD_PAGE_SIZE;
519 /* free page table pages. last level pte should already be cleared */
520 static void dma_pte_free_pagetable(struct dmar_domain *domain,
523 int addr_width = agaw_to_width(domain->agaw);
525 int total = agaw_to_level(domain->agaw);
529 start &= (((u64)1) << addr_width) - 1;
530 end &= (((u64)1) << addr_width) - 1;
532 /* we don't need lock here, nobody else touches the iova range */
534 while (level <= total) {
535 tmp = align_to_level(start, level);
536 if (tmp >= end || (tmp + level_size(level) > end))
540 pte = dma_addr_level_pte(domain, tmp, level);
543 phys_to_virt(dma_pte_addr(*pte)));
545 __iommu_flush_cache(domain->iommu,
548 tmp += level_size(level);
553 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
554 free_pgtable_page(domain->pgd);
560 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
562 struct root_entry *root;
565 root = (struct root_entry *)alloc_pgtable_page();
569 __iommu_flush_cache(iommu, root, ROOT_SIZE);
571 spin_lock_irqsave(&iommu->lock, flags);
572 iommu->root_entry = root;
573 spin_unlock_irqrestore(&iommu->lock, flags);
578 static void iommu_set_root_entry(struct intel_iommu *iommu)
584 addr = iommu->root_entry;
586 spin_lock_irqsave(&iommu->register_lock, flag);
587 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
589 cmd = iommu->gcmd | DMA_GCMD_SRTP;
590 writel(cmd, iommu->reg + DMAR_GCMD_REG);
592 /* Make sure hardware completes it */
593 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
594 readl, (sts & DMA_GSTS_RTPS), sts);
596 spin_unlock_irqrestore(&iommu->register_lock, flag);
599 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
604 if (!cap_rwbf(iommu->cap))
606 val = iommu->gcmd | DMA_GCMD_WBF;
608 spin_lock_irqsave(&iommu->register_lock, flag);
609 writel(val, iommu->reg + DMAR_GCMD_REG);
611 /* Make sure hardware completes it */
612 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
613 readl, (!(val & DMA_GSTS_WBFS)), val);
615 spin_unlock_irqrestore(&iommu->register_lock, flag);
618 /* the return value determines if we need a write buffer flush */
619 static int __iommu_flush_context(struct intel_iommu *iommu,
620 u16 did, u16 source_id, u8 function_mask, u64 type,
621 int non_present_entry_flush)
627 * In the non-present entry flush case, if hardware doesn't cache
628 * non-present entries we do nothing; if hardware does cache non-present
629 * entries, we flush entries of domain 0 (the domain id is used to cache
630 * any non-present entries)
632 if (non_present_entry_flush) {
633 if (!cap_caching_mode(iommu->cap))
640 case DMA_CCMD_GLOBAL_INVL:
641 val = DMA_CCMD_GLOBAL_INVL;
643 case DMA_CCMD_DOMAIN_INVL:
644 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
646 case DMA_CCMD_DEVICE_INVL:
647 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
648 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
655 spin_lock_irqsave(&iommu->register_lock, flag);
656 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
658 /* Make sure hardware completes it */
659 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
660 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
662 spin_unlock_irqrestore(&iommu->register_lock, flag);
664 /* flush context entry will implicitly flush write buffer */
668 /* the return value determines if we need a write buffer flush */
669 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
670 u64 addr, unsigned int size_order, u64 type,
671 int non_present_entry_flush)
673 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
674 u64 val = 0, val_iva = 0;
678 * In the non-present entry flush case, if hardware doesn't cache
679 * non-present entries we do nothing; if hardware does cache non-present
680 * entries, we flush entries of domain 0 (the domain id is used to cache
681 * any non-present entries)
683 if (non_present_entry_flush) {
684 if (!cap_caching_mode(iommu->cap))
691 case DMA_TLB_GLOBAL_FLUSH:
692 /* global flush doesn't need set IVA_REG */
693 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
695 case DMA_TLB_DSI_FLUSH:
696 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
698 case DMA_TLB_PSI_FLUSH:
699 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
700 /* Note: always flush non-leaf currently */
701 val_iva = size_order | addr;
706 /* Note: set drain read/write */
709 * This is probably just to be extra safe.  Looks like we can
710 * ignore it without any impact.
712 if (cap_read_drain(iommu->cap))
713 val |= DMA_TLB_READ_DRAIN;
715 if (cap_write_drain(iommu->cap))
716 val |= DMA_TLB_WRITE_DRAIN;
718 spin_lock_irqsave(&iommu->register_lock, flag);
719 /* Note: Only uses first TLB reg currently */
721 dmar_writeq(iommu->reg + tlb_offset, val_iva);
722 dmar_writeq(iommu->reg + tlb_offset + 8, val);
724 /* Make sure hardware completes it */
725 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
726 dmar_readq, (!(val & DMA_TLB_IVT)), val);
728 spin_unlock_irqrestore(&iommu->register_lock, flag);
730 /* check IOTLB invalidation granularity */
731 if (DMA_TLB_IAIG(val) == 0)
732 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
733 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
734 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
735 (unsigned long long)DMA_TLB_IIRG(type),
736 (unsigned long long)DMA_TLB_IAIG(val));
737 /* flush iotlb entry will implicitly flush write buffer */
741 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
742 u64 addr, unsigned int pages, int non_present_entry_flush)
746 BUG_ON(addr & (~VTD_PAGE_MASK));
749 /* Fallback to domain selective flush if no PSI support */
750 if (!cap_pgsel_inv(iommu->cap))
751 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
753 non_present_entry_flush);
756 * PSI requires page size to be 2 ^ x, and the base address is naturally
757 * aligned to the size
759 mask = ilog2(__roundup_pow_of_two(pages));
760 /* Fallback to domain selective flush if size is too big */
761 if (mask > cap_max_amask_val(iommu->cap))
762 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
763 DMA_TLB_DSI_FLUSH, non_present_entry_flush);
765 return iommu->flush.flush_iotlb(iommu, did, addr, mask,
767 non_present_entry_flush);
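/*
 * Example of the mask computation above: flushing 3 pages rounds up to
 * 4, so mask == ilog2(4) == 2 and the hardware invalidates a naturally
 * aligned 2^2 == 4 page (16KiB with 4KiB pages) region containing addr.
 */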
770 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
775 spin_lock_irqsave(&iommu->register_lock, flags);
776 pmen = readl(iommu->reg + DMAR_PMEN_REG);
777 pmen &= ~DMA_PMEN_EPM;
778 writel(pmen, iommu->reg + DMAR_PMEN_REG);
780 /* wait for the protected region status bit to clear */
781 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
782 readl, !(pmen & DMA_PMEN_PRS), pmen);
784 spin_unlock_irqrestore(&iommu->register_lock, flags);
787 static int iommu_enable_translation(struct intel_iommu *iommu)
792 spin_lock_irqsave(&iommu->register_lock, flags);
793 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
795 /* Make sure hardware completes it */
796 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
797 readl, (sts & DMA_GSTS_TES), sts);
799 iommu->gcmd |= DMA_GCMD_TE;
800 spin_unlock_irqrestore(&iommu->register_lock, flags);
804 static int iommu_disable_translation(struct intel_iommu *iommu)
809 spin_lock_irqsave(&iommu->register_lock, flag);
810 iommu->gcmd &= ~DMA_GCMD_TE;
811 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
813 /* Make sure hardware completes it */
814 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
815 readl, (!(sts & DMA_GSTS_TES)), sts);
817 spin_unlock_irqrestore(&iommu->register_lock, flag);
821 /* iommu interrupt handling. Most of it is MSI-like. */
823 static const char *fault_reason_strings[] =
826 "Present bit in root entry is clear",
827 "Present bit in context entry is clear",
828 "Invalid context entry",
829 "Access beyond MGAW",
830 "PTE Write access is not set",
831 "PTE Read access is not set",
832 "Next page table ptr is invalid",
833 "Root table address invalid",
834 "Context table ptr is invalid",
835 "non-zero reserved fields in RTP",
836 "non-zero reserved fields in CTP",
837 "non-zero reserved fields in PTE",
839 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
841 const char *dmar_get_fault_reason(u8 fault_reason)
843 if (fault_reason > MAX_FAULT_REASON_IDX)
846 return fault_reason_strings[fault_reason];
849 void dmar_msi_unmask(unsigned int irq)
851 struct intel_iommu *iommu = get_irq_data(irq);
855 spin_lock_irqsave(&iommu->register_lock, flag);
856 writel(0, iommu->reg + DMAR_FECTL_REG);
857 /* Read a reg to force-flush the posted write */
858 readl(iommu->reg + DMAR_FECTL_REG);
859 spin_unlock_irqrestore(&iommu->register_lock, flag);
862 void dmar_msi_mask(unsigned int irq)
865 struct intel_iommu *iommu = get_irq_data(irq);
868 spin_lock_irqsave(&iommu->register_lock, flag);
869 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
870 /* Read a reg to force-flush the posted write */
871 readl(iommu->reg + DMAR_FECTL_REG);
872 spin_unlock_irqrestore(&iommu->register_lock, flag);
875 void dmar_msi_write(int irq, struct msi_msg *msg)
877 struct intel_iommu *iommu = get_irq_data(irq);
880 spin_lock_irqsave(&iommu->register_lock, flag);
881 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
882 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
883 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
884 spin_unlock_irqrestore(&iommu->register_lock, flag);
887 void dmar_msi_read(int irq, struct msi_msg *msg)
889 struct intel_iommu *iommu = get_irq_data(irq);
892 spin_lock_irqsave(&iommu->register_lock, flag);
893 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
894 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
895 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
896 spin_unlock_irqrestore(&iommu->register_lock, flag);
899 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
900 u8 fault_reason, u16 source_id, unsigned long long addr)
904 reason = dmar_get_fault_reason(fault_reason);
907 "DMAR:[%s] Request device [%02x:%02x.%d] "
909 "DMAR:[fault reason %02d] %s\n",
910 (type ? "DMA Read" : "DMA Write"),
911 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
912 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
916 #define PRIMARY_FAULT_REG_LEN (16)
917 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
919 struct intel_iommu *iommu = dev_id;
920 int reg, fault_index;
924 spin_lock_irqsave(&iommu->register_lock, flag);
925 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
927 /* TBD: ignore advanced fault log currently */
928 if (!(fault_status & DMA_FSTS_PPF))
931 fault_index = dma_fsts_fault_record_index(fault_status);
932 reg = cap_fault_reg_offset(iommu->cap);
940 /* highest 32 bits */
941 data = readl(iommu->reg + reg +
942 fault_index * PRIMARY_FAULT_REG_LEN + 12);
943 if (!(data & DMA_FRCD_F))
946 fault_reason = dma_frcd_fault_reason(data);
947 type = dma_frcd_type(data);
949 data = readl(iommu->reg + reg +
950 fault_index * PRIMARY_FAULT_REG_LEN + 8);
951 source_id = dma_frcd_source_id(data);
953 guest_addr = dmar_readq(iommu->reg + reg +
954 fault_index * PRIMARY_FAULT_REG_LEN);
955 guest_addr = dma_frcd_page_addr(guest_addr);
956 /* clear the fault */
957 writel(DMA_FRCD_F, iommu->reg + reg +
958 fault_index * PRIMARY_FAULT_REG_LEN + 12);
960 spin_unlock_irqrestore(&iommu->register_lock, flag);
962 iommu_page_fault_do_one(iommu, type, fault_reason,
963 source_id, guest_addr);
966 if (fault_index > cap_num_fault_regs(iommu->cap))
968 spin_lock_irqsave(&iommu->register_lock, flag);
971 /* clear primary fault overflow */
972 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
973 if (fault_status & DMA_FSTS_PFO)
974 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
976 spin_unlock_irqrestore(&iommu->register_lock, flag);
980 int dmar_set_interrupt(struct intel_iommu *iommu)
986 printk(KERN_ERR "IOMMU: no free vectors\n");
990 set_irq_data(irq, iommu);
993 ret = arch_setup_dmar_msi(irq);
995 set_irq_data(irq, NULL);
1001 /* Force any pending fault records to be cleared */
1002 iommu_page_fault(irq, iommu);
1004 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1006 printk(KERN_ERR "IOMMU: can't request irq\n");
1010 static int iommu_init_domains(struct intel_iommu *iommu)
1012 unsigned long ndomains;
1013 unsigned long nlongs;
1015 ndomains = cap_ndoms(iommu->cap);
1016 pr_debug("Number of Domains supported <%ld>\n", ndomains);
1017 nlongs = BITS_TO_LONGS(ndomains);
1019 /* TBD: there might be 64K domains,
1020 * consider other allocation for future chips
1022 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1023 if (!iommu->domain_ids) {
1024 printk(KERN_ERR "Allocating domain id array failed\n");
1027 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1029 if (!iommu->domains) {
1030 printk(KERN_ERR "Allocating domain array failed\n");
1031 kfree(iommu->domain_ids);
1035 spin_lock_init(&iommu->lock);
1038 * if Caching mode is set, then invalid translations are tagged
1039 * with domainid 0. Hence we need to pre-allocate it.
1041 if (cap_caching_mode(iommu->cap))
1042 set_bit(0, iommu->domain_ids);
1047 static void domain_exit(struct dmar_domain *domain);
1049 void free_dmar_iommu(struct intel_iommu *iommu)
1051 struct dmar_domain *domain;
1054 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1055 for (; i < cap_ndoms(iommu->cap); ) {
1056 domain = iommu->domains[i];
1057 clear_bit(i, iommu->domain_ids);
1058 domain_exit(domain);
1059 i = find_next_bit(iommu->domain_ids,
1060 cap_ndoms(iommu->cap), i+1);
1063 if (iommu->gcmd & DMA_GCMD_TE)
1064 iommu_disable_translation(iommu);
1067 set_irq_data(iommu->irq, NULL);
1068 /* This will mask the irq */
1069 free_irq(iommu->irq, iommu);
1070 destroy_irq(iommu->irq);
1073 kfree(iommu->domains);
1074 kfree(iommu->domain_ids);
1076 /* free context mapping */
1077 free_context_table(iommu);
1080 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1083 unsigned long ndomains;
1084 struct dmar_domain *domain;
1085 unsigned long flags;
1087 domain = alloc_domain_mem();
1091 ndomains = cap_ndoms(iommu->cap);
1093 spin_lock_irqsave(&iommu->lock, flags);
1094 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1095 if (num >= ndomains) {
1096 spin_unlock_irqrestore(&iommu->lock, flags);
1097 free_domain_mem(domain);
1098 printk(KERN_ERR "IOMMU: no free domain ids\n");
1102 set_bit(num, iommu->domain_ids);
1104 domain->iommu = iommu;
1105 iommu->domains[num] = domain;
1106 spin_unlock_irqrestore(&iommu->lock, flags);
1111 static void iommu_free_domain(struct dmar_domain *domain)
1113 unsigned long flags;
1115 spin_lock_irqsave(&domain->iommu->lock, flags);
1116 clear_bit(domain->id, domain->iommu->domain_ids);
1117 spin_unlock_irqrestore(&domain->iommu->lock, flags);
1120 static struct iova_domain reserved_iova_list;
1121 static struct lock_class_key reserved_alloc_key;
1122 static struct lock_class_key reserved_rbtree_key;
1124 static void dmar_init_reserved_ranges(void)
1126 struct pci_dev *pdev = NULL;
1131 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1133 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1134 &reserved_alloc_key);
1135 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1136 &reserved_rbtree_key);
1138 /* IOAPIC ranges shouldn't be accessed by DMA */
1139 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1140 IOVA_PFN(IOAPIC_RANGE_END));
1142 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1144 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1145 for_each_pci_dev(pdev) {
1148 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1149 r = &pdev->resource[i];
1150 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1154 size = r->end - addr;
1155 size = PAGE_ALIGN(size);
1156 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1157 IOVA_PFN(size + addr) - 1);
1159 printk(KERN_ERR "Reserve iova failed\n");
1165 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1167 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1170 static inline int guestwidth_to_adjustwidth(int gaw)
1173 int r = (gaw - 12) % 9;
1184 static int domain_init(struct dmar_domain *domain, int guest_width)
1186 struct intel_iommu *iommu;
1187 int adjust_width, agaw;
1188 unsigned long sagaw;
1190 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1191 spin_lock_init(&domain->mapping_lock);
1193 domain_reserve_special_ranges(domain);
1195 /* calculate AGAW */
1196 iommu = domain->iommu;
1197 if (guest_width > cap_mgaw(iommu->cap))
1198 guest_width = cap_mgaw(iommu->cap);
1199 domain->gaw = guest_width;
1200 adjust_width = guestwidth_to_adjustwidth(guest_width);
1201 agaw = width_to_agaw(adjust_width);
1202 sagaw = cap_sagaw(iommu->cap);
1203 if (!test_bit(agaw, &sagaw)) {
1204 /* hardware doesn't support it, choose a bigger one */
1205 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1206 agaw = find_next_bit(&sagaw, 5, agaw);
1210 domain->agaw = agaw;
1211 INIT_LIST_HEAD(&domain->devices);
1213 /* always allocate the top pgd */
1214 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1217 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1221 static void domain_exit(struct dmar_domain *domain)
1225 /* Domain 0 is reserved, so don't process it */
1229 domain_remove_dev_info(domain);
1231 put_iova_domain(&domain->iovad);
1232 end = DOMAIN_MAX_ADDR(domain->gaw);
1233 end = end & (~PAGE_MASK);
1236 dma_pte_clear_range(domain, 0, end);
1238 /* free page tables */
1239 dma_pte_free_pagetable(domain, 0, end);
1241 iommu_free_domain(domain);
1242 free_domain_mem(domain);
1245 static int domain_context_mapping_one(struct dmar_domain *domain,
1248 struct context_entry *context;
1249 struct intel_iommu *iommu = domain->iommu;
1250 unsigned long flags;
1252 pr_debug("Set context mapping for %02x:%02x.%d\n",
1253 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1254 BUG_ON(!domain->pgd);
1255 context = device_to_context_entry(iommu, bus, devfn);
1258 spin_lock_irqsave(&iommu->lock, flags);
1259 if (context_present(*context)) {
1260 spin_unlock_irqrestore(&iommu->lock, flags);
1264 context_set_domain_id(*context, domain->id);
1265 context_set_address_width(*context, domain->agaw);
1266 context_set_address_root(*context, virt_to_phys(domain->pgd));
1267 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1268 context_set_fault_enable(*context);
1269 context_set_present(*context);
1270 __iommu_flush_cache(iommu, context, sizeof(*context));
1272 /* it's a non-present to present mapping */
1273 if (iommu->flush.flush_context(iommu, domain->id,
1274 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1275 DMA_CCMD_DEVICE_INVL, 1))
1276 iommu_flush_write_buffer(iommu);
1278 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1280 spin_unlock_irqrestore(&iommu->lock, flags);
1285 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1288 struct pci_dev *tmp, *parent;
1290 ret = domain_context_mapping_one(domain, pdev->bus->number,
1295 /* dependent device mapping */
1296 tmp = pci_find_upstream_pcie_bridge(pdev);
1299 /* Secondary interface's bus number and devfn 0 */
1300 parent = pdev->bus->self;
1301 while (parent != tmp) {
1302 ret = domain_context_mapping_one(domain, parent->bus->number,
1306 parent = parent->bus->self;
1308 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1309 return domain_context_mapping_one(domain,
1310 tmp->subordinate->number, 0);
1311 else /* this is a legacy PCI bridge */
1312 return domain_context_mapping_one(domain,
1313 tmp->bus->number, tmp->devfn);
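/*
 * The walk over upstream bridges matters because DMA issued by a device
 * behind a PCIe-to-PCI(-X) bridge reaches the IOMMU with the bridge's
 * (secondary bus, devfn 0) source-id, and DMA from behind a legacy PCI
 * bridge with the bridge's own source-id, so those context entries must
 * point at the same domain as the device itself.
 */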
1316 static int domain_context_mapped(struct dmar_domain *domain,
1317 struct pci_dev *pdev)
1320 struct pci_dev *tmp, *parent;
1322 ret = device_context_mapped(domain->iommu,
1323 pdev->bus->number, pdev->devfn);
1326 /* dependent device mapping */
1327 tmp = pci_find_upstream_pcie_bridge(pdev);
1330 /* Secondary interface's bus number and devfn 0 */
1331 parent = pdev->bus->self;
1332 while (parent != tmp) {
1333 ret = device_context_mapped(domain->iommu, parent->bus->number,
1337 parent = parent->bus->self;
1340 return device_context_mapped(domain->iommu,
1341 tmp->subordinate->number, 0);
1343 return device_context_mapped(domain->iommu,
1344 tmp->bus->number, tmp->devfn);
1348 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1349 u64 hpa, size_t size, int prot)
1351 u64 start_pfn, end_pfn;
1352 struct dma_pte *pte;
1354 int addr_width = agaw_to_width(domain->agaw);
1356 hpa &= (((u64)1) << addr_width) - 1;
1358 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1361 start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1362 end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1364 while (start_pfn < end_pfn) {
1365 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1368 /* We don't need lock here, nobody else
1369 * touches the iova range
1371 BUG_ON(dma_pte_addr(*pte));
1372 dma_set_pte_addr(*pte, start_pfn << VTD_PAGE_SHIFT);
1373 dma_set_pte_prot(*pte, prot);
1374 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1381 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1383 clear_context_table(domain->iommu, bus, devfn);
1384 domain->iommu->flush.flush_context(domain->iommu, 0, 0, 0,
1385 DMA_CCMD_GLOBAL_INVL, 0);
1386 domain->iommu->flush.flush_iotlb(domain->iommu, 0, 0, 0,
1387 DMA_TLB_GLOBAL_FLUSH, 0);
1390 static void domain_remove_dev_info(struct dmar_domain *domain)
1392 struct device_domain_info *info;
1393 unsigned long flags;
1395 spin_lock_irqsave(&device_domain_lock, flags);
1396 while (!list_empty(&domain->devices)) {
1397 info = list_entry(domain->devices.next,
1398 struct device_domain_info, link);
1399 list_del(&info->link);
1400 list_del(&info->global);
1402 info->dev->dev.archdata.iommu = NULL;
1403 spin_unlock_irqrestore(&device_domain_lock, flags);
1405 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1406 free_devinfo_mem(info);
1408 spin_lock_irqsave(&device_domain_lock, flags);
1410 spin_unlock_irqrestore(&device_domain_lock, flags);
1415 * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1417 static struct dmar_domain *
1418 find_domain(struct pci_dev *pdev)
1420 struct device_domain_info *info;
1422 /* No lock here, assumes no domain exit in normal case */
1423 info = pdev->dev.archdata.iommu;
1425 return info->domain;
1429 /* domain is initialized */
1430 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1432 struct dmar_domain *domain, *found = NULL;
1433 struct intel_iommu *iommu;
1434 struct dmar_drhd_unit *drhd;
1435 struct device_domain_info *info, *tmp;
1436 struct pci_dev *dev_tmp;
1437 unsigned long flags;
1438 int bus = 0, devfn = 0;
1440 domain = find_domain(pdev);
1444 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1446 if (dev_tmp->is_pcie) {
1447 bus = dev_tmp->subordinate->number;
1450 bus = dev_tmp->bus->number;
1451 devfn = dev_tmp->devfn;
1453 spin_lock_irqsave(&device_domain_lock, flags);
1454 list_for_each_entry(info, &device_domain_list, global) {
1455 if (info->bus == bus && info->devfn == devfn) {
1456 found = info->domain;
1460 spin_unlock_irqrestore(&device_domain_lock, flags);
1461 /* the pcie-pci bridge already has a domain, use it */
1468 /* Allocate new domain for the device */
1469 drhd = dmar_find_matched_drhd_unit(pdev);
1471 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1475 iommu = drhd->iommu;
1477 domain = iommu_alloc_domain(iommu);
1481 if (domain_init(domain, gaw)) {
1482 domain_exit(domain);
1486 /* register pcie-to-pci device */
1488 info = alloc_devinfo_mem();
1490 domain_exit(domain);
1494 info->devfn = devfn;
1496 info->domain = domain;
1497 /* This domain is shared by devices under p2p bridge */
1498 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1500 /* the pcie-to-pci bridge already has a domain, use it */
1502 spin_lock_irqsave(&device_domain_lock, flags);
1503 list_for_each_entry(tmp, &device_domain_list, global) {
1504 if (tmp->bus == bus && tmp->devfn == devfn) {
1505 found = tmp->domain;
1510 free_devinfo_mem(info);
1511 domain_exit(domain);
1514 list_add(&info->link, &domain->devices);
1515 list_add(&info->global, &device_domain_list);
1517 spin_unlock_irqrestore(&device_domain_lock, flags);
1521 info = alloc_devinfo_mem();
1524 info->bus = pdev->bus->number;
1525 info->devfn = pdev->devfn;
1527 info->domain = domain;
1528 spin_lock_irqsave(&device_domain_lock, flags);
1529 /* somebody is fast */
1530 found = find_domain(pdev);
1531 if (found != NULL) {
1532 spin_unlock_irqrestore(&device_domain_lock, flags);
1533 if (found != domain) {
1534 domain_exit(domain);
1537 free_devinfo_mem(info);
1540 list_add(&info->link, &domain->devices);
1541 list_add(&info->global, &device_domain_list);
1542 pdev->dev.archdata.iommu = info;
1543 spin_unlock_irqrestore(&device_domain_lock, flags);
1546 /* recheck it here, maybe others set it */
1547 return find_domain(pdev);
1550 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1551 unsigned long long start,
1552 unsigned long long end)
1554 struct dmar_domain *domain;
1556 unsigned long long base;
1560 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1561 pci_name(pdev), start, end);
1562 /* page table init */
1563 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1567 /* The address might not be aligned */
1568 base = start & PAGE_MASK;
1570 size = PAGE_ALIGN(size);
1571 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1572 IOVA_PFN(base + size) - 1)) {
1573 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1578 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1579 size, base, pci_name(pdev));
1581 * RMRR range might have overlap with physical memory range,
1584 dma_pte_clear_range(domain, base, base + size);
1586 ret = domain_page_mapping(domain, base, base, size,
1587 DMA_PTE_READ|DMA_PTE_WRITE);
1591 /* context entry init */
1592 ret = domain_context_mapping(domain, pdev);
1596 domain_exit(domain);
1601 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1602 struct pci_dev *pdev)
1604 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1606 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1607 rmrr->end_address + 1);
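/*
 * RMRRs (Reserved Memory Region Reporting structures) describe ranges
 * the BIOS reports as already being used for DMA when the OS takes over
 * (USB legacy keyboard/mouse buffers are the classic case), so each such
 * range gets a 1:1 mapping before translation is enabled in init_dmars().
 */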
1610 #ifdef CONFIG_DMAR_GFX_WA
1611 struct iommu_prepare_data {
1612 struct pci_dev *pdev;
1616 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1617 unsigned long end_pfn, void *datax)
1619 struct iommu_prepare_data *data;
1621 data = (struct iommu_prepare_data *)datax;
1623 data->ret = iommu_prepare_identity_map(data->pdev,
1624 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1629 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1632 struct iommu_prepare_data data;
1637 for_each_online_node(nid) {
1638 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1645 static void __init iommu_prepare_gfx_mapping(void)
1647 struct pci_dev *pdev = NULL;
1650 for_each_pci_dev(pdev) {
1651 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1652 !IS_GFX_DEVICE(pdev))
1654 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1656 ret = iommu_prepare_with_active_regions(pdev);
1658 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1663 #ifdef CONFIG_DMAR_FLOPPY_WA
1664 static inline void iommu_prepare_isa(void)
1666 struct pci_dev *pdev;
1669 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1673 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1674 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1677 printk("IOMMU: Failed to create 0-16M identity map, "
1678 "floppy might not work\n");
1682 static inline void iommu_prepare_isa(void)
1686 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1688 static int __init init_dmars(void)
1690 struct dmar_drhd_unit *drhd;
1691 struct dmar_rmrr_unit *rmrr;
1692 struct pci_dev *pdev;
1693 struct intel_iommu *iommu;
1694 int i, ret, unit = 0;
1699 * initialize and program root entry to not present
1702 for_each_drhd_unit(drhd) {
1705 * lock not needed as this is only incremented in the single
1706 * threaded kernel __init code path; all other accesses are reads
1711 deferred_flush = kzalloc(g_num_of_iommus *
1712 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1713 if (!deferred_flush) {
1718 for_each_drhd_unit(drhd) {
1722 iommu = drhd->iommu;
1724 ret = iommu_init_domains(iommu);
1730 * we could share the same root & context tables
1731 * among all IOMMUs. Need to split it later.
1733 ret = iommu_alloc_root_entry(iommu);
1735 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1740 for_each_drhd_unit(drhd) {
1744 iommu = drhd->iommu;
1745 if (dmar_enable_qi(iommu)) {
1747 * Queued Invalidate not enabled, use Register Based
1750 iommu->flush.flush_context = __iommu_flush_context;
1751 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1752 printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1754 (unsigned long long)drhd->reg_base_addr);
1756 iommu->flush.flush_context = qi_flush_context;
1757 iommu->flush.flush_iotlb = qi_flush_iotlb;
1758 printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1760 (unsigned long long)drhd->reg_base_addr);
1766 * for each dev attached to rmrr
1768 * locate drhd for dev, alloc domain for dev
1769 * allocate free domain
1770 * allocate page table entries for rmrr
1771 * if context not allocated for bus
1772 * allocate and init context
1773 * set present in root table for this bus
1774 * init context with domain, translation etc
1778 for_each_rmrr_units(rmrr) {
1779 for (i = 0; i < rmrr->devices_cnt; i++) {
1780 pdev = rmrr->devices[i];
1781 /* some BIOSes list non-existent devices in the DMAR table */
1784 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1787 "IOMMU: mapping reserved region failed\n");
1791 iommu_prepare_gfx_mapping();
1793 iommu_prepare_isa();
1798 * global invalidate context cache
1799 * global invalidate iotlb
1800 * enable translation
1802 for_each_drhd_unit(drhd) {
1805 iommu = drhd->iommu;
1806 sprintf (iommu->name, "dmar%d", unit++);
1808 iommu_flush_write_buffer(iommu);
1810 ret = dmar_set_interrupt(iommu);
1814 iommu_set_root_entry(iommu);
1816 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
1818 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
1820 iommu_disable_protect_mem_regions(iommu);
1822 ret = iommu_enable_translation(iommu);
1829 for_each_drhd_unit(drhd) {
1832 iommu = drhd->iommu;
1838 static inline u64 aligned_size(u64 host_addr, size_t size)
1841 addr = (host_addr & (~PAGE_MASK)) + size;
1842 return PAGE_ALIGN(addr);
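/*
 * aligned_size() returns the mapping size rounded out to whole pages
 * covering the buffer: e.g. host_addr == 0x1234, size == 0x2000 gives
 * PAGE_ALIGN(0x234 + 0x2000) == 0x3000, i.e. three pages are needed even
 * though the buffer itself is only two pages long.
 */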
1846 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1850 /* Make sure it's in range */
1851 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1852 if (!size || (IOVA_START_ADDR + size > end))
1855 piova = alloc_iova(&domain->iovad,
1856 size >> PAGE_SHIFT, IOVA_PFN(end), 1);
1860 static struct iova *
1861 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1862 size_t size, u64 dma_mask)
1864 struct pci_dev *pdev = to_pci_dev(dev);
1865 struct iova *iova = NULL;
1867 if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
1868 iova = iommu_alloc_iova(domain, size, dma_mask);
1871 * First try to allocate an io virtual address in
1872 * DMA_32BIT_MASK and if that fails then try allocating
1875 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1877 iova = iommu_alloc_iova(domain, size, dma_mask);
1881 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
1888 static struct dmar_domain *
1889 get_valid_domain_for_dev(struct pci_dev *pdev)
1891 struct dmar_domain *domain;
1894 domain = get_domain_for_dev(pdev,
1895 DEFAULT_DOMAIN_ADDRESS_WIDTH);
1898 "Allocating domain for %s failed", pci_name(pdev));
1902 /* make sure context mapping is ok */
1903 if (unlikely(!domain_context_mapped(domain, pdev))) {
1904 ret = domain_context_mapping(domain, pdev);
1907 "Domain context map for %s failed",
1916 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
1917 size_t size, int dir, u64 dma_mask)
1919 struct pci_dev *pdev = to_pci_dev(hwdev);
1920 struct dmar_domain *domain;
1921 phys_addr_t start_paddr;
1926 BUG_ON(dir == DMA_NONE);
1927 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1930 domain = get_valid_domain_for_dev(pdev);
1934 size = aligned_size((u64)paddr, size);
1936 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
1940 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
1943 * Check if DMAR supports zero-length reads on write only
1946 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
1947 !cap_zlr(domain->iommu->cap))
1948 prot |= DMA_PTE_READ;
1949 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1950 prot |= DMA_PTE_WRITE;
1952 * paddr - (paddr + size) might span a partial page, so we map the whole
1953 * page.  Note: if two parts of one page are separately mapped, we
1954 * might have two guest_addrs mapping to the same host paddr, but this
1955 * is not a big problem
1957 ret = domain_page_mapping(domain, start_paddr,
1958 ((u64)paddr) & PAGE_MASK, size, prot);
1962 /* it's a non-present to present mapping */
1963 ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1964 start_paddr, size >> VTD_PAGE_SHIFT, 1);
1966 iommu_flush_write_buffer(domain->iommu);
1968 return start_paddr + ((u64)paddr & (~PAGE_MASK));
1972 __free_iova(&domain->iovad, iova);
1973 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
1974 pci_name(pdev), size, (unsigned long long)paddr, dir);
1978 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
1979 size_t size, int dir)
1981 return __intel_map_single(hwdev, paddr, size, dir,
1982 to_pci_dev(hwdev)->dma_mask);
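/*
 * Drivers normally reach this through the generic DMA API rather than
 * calling it directly; dma_ops is pointed at intel_dma_ops in
 * intel_iommu_init() below.  A typical caller (sketch) looks like:
 *
 *	dma_addr_t handle = dma_map_single(&pdev->dev, buf, len,
 *					   DMA_TO_DEVICE);
 *
 * and the returned handle is an IOVA from the device's domain, not the
 * physical address of buf.
 */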
1985 static void flush_unmaps(void)
1991 /* just flush them all */
1992 for (i = 0; i < g_num_of_iommus; i++) {
1993 if (deferred_flush[i].next) {
1994 struct intel_iommu *iommu =
1995 deferred_flush[i].domain[0]->iommu;
1997 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1998 DMA_TLB_GLOBAL_FLUSH, 0);
1999 for (j = 0; j < deferred_flush[i].next; j++) {
2000 __free_iova(&deferred_flush[i].domain[j]->iovad,
2001 deferred_flush[i].iova[j]);
2003 deferred_flush[i].next = 0;
2010 static void flush_unmaps_timeout(unsigned long data)
2012 unsigned long flags;
2014 spin_lock_irqsave(&async_umap_flush_lock, flags);
2016 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2019 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2021 unsigned long flags;
2024 spin_lock_irqsave(&async_umap_flush_lock, flags);
2025 if (list_size == HIGH_WATER_MARK)
2028 iommu_id = dom->iommu->seq_id;
2030 next = deferred_flush[iommu_id].next;
2031 deferred_flush[iommu_id].domain[next] = dom;
2032 deferred_flush[iommu_id].iova[next] = iova;
2033 deferred_flush[iommu_id].next++;
2036 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2040 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2043 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2046 struct pci_dev *pdev = to_pci_dev(dev);
2047 struct dmar_domain *domain;
2048 unsigned long start_addr;
2051 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2053 domain = find_domain(pdev);
2056 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2060 start_addr = iova->pfn_lo << PAGE_SHIFT;
2061 size = aligned_size((u64)dev_addr, size);
2063 pr_debug("Device %s unmapping: %lx@%llx\n",
2064 pci_name(pdev), size, (unsigned long long)start_addr);
2066 /* clear the whole page */
2067 dma_pte_clear_range(domain, start_addr, start_addr + size);
2068 /* free page tables */
2069 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2070 if (intel_iommu_strict) {
2071 if (iommu_flush_iotlb_psi(domain->iommu,
2072 domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2073 iommu_flush_write_buffer(domain->iommu);
2075 __free_iova(&domain->iovad, iova);
2077 add_unmap(domain, iova);
2079 * queue up the release of the unmap to save roughly 1/6th of the
2080 * CPU time otherwise used up by the iotlb flush operation...
2085 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2086 dma_addr_t *dma_handle, gfp_t flags)
2091 size = PAGE_ALIGN(size);
2092 order = get_order(size);
2093 flags &= ~(GFP_DMA | GFP_DMA32);
2095 vaddr = (void *)__get_free_pages(flags, order);
2098 memset(vaddr, 0, size);
2100 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2102 hwdev->coherent_dma_mask);
2105 free_pages((unsigned long)vaddr, order);
2109 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2110 dma_addr_t dma_handle)
2114 size = PAGE_ALIGN(size);
2115 order = get_order(size);
2117 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2118 free_pages((unsigned long)vaddr, order);
2121 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2123 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2124 int nelems, int dir)
2127 struct pci_dev *pdev = to_pci_dev(hwdev);
2128 struct dmar_domain *domain;
2129 unsigned long start_addr;
2133 struct scatterlist *sg;
2135 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2138 domain = find_domain(pdev);
2140 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2143 for_each_sg(sglist, sg, nelems, i) {
2144 addr = SG_ENT_VIRT_ADDRESS(sg);
2145 size += aligned_size((u64)addr, sg->length);
2148 start_addr = iova->pfn_lo << PAGE_SHIFT;
2150 /* clear the whole page */
2151 dma_pte_clear_range(domain, start_addr, start_addr + size);
2152 /* free page tables */
2153 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2155 if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2156 size >> VTD_PAGE_SHIFT, 0))
2157 iommu_flush_write_buffer(domain->iommu);
2160 __free_iova(&domain->iovad, iova);
2163 static int intel_nontranslate_map_sg(struct device *hddev,
2164 struct scatterlist *sglist, int nelems, int dir)
2167 struct scatterlist *sg;
2169 for_each_sg(sglist, sg, nelems, i) {
2170 BUG_ON(!sg_page(sg));
2171 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2172 sg->dma_length = sg->length;
2177 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2182 struct pci_dev *pdev = to_pci_dev(hwdev);
2183 struct dmar_domain *domain;
2187 struct iova *iova = NULL;
2189 struct scatterlist *sg;
2190 unsigned long start_addr;
2192 BUG_ON(dir == DMA_NONE);
2193 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2194 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2196 domain = get_valid_domain_for_dev(pdev);
2200 for_each_sg(sglist, sg, nelems, i) {
2201 addr = SG_ENT_VIRT_ADDRESS(sg);
2202 addr = (void *)virt_to_phys(addr);
2203 size += aligned_size((u64)addr, sg->length);
2206 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2208 sglist->dma_length = 0;
2213 * Check if DMAR supports zero-length reads on write only
2216 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2217 !cap_zlr(domain->iommu->cap))
2218 prot |= DMA_PTE_READ;
2219 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2220 prot |= DMA_PTE_WRITE;
2222 start_addr = iova->pfn_lo << PAGE_SHIFT;
2224 for_each_sg(sglist, sg, nelems, i) {
2225 addr = SG_ENT_VIRT_ADDRESS(sg);
2226 addr = (void *)virt_to_phys(addr);
2227 size = aligned_size((u64)addr, sg->length);
2228 ret = domain_page_mapping(domain, start_addr + offset,
2229 ((u64)addr) & PAGE_MASK,
2232 /* clear the page */
2233 dma_pte_clear_range(domain, start_addr,
2234 start_addr + offset);
2235 /* free page tables */
2236 dma_pte_free_pagetable(domain, start_addr,
2237 start_addr + offset);
2239 __free_iova(&domain->iovad, iova);
2242 sg->dma_address = start_addr + offset +
2243 ((u64)addr & (~PAGE_MASK));
2244 sg->dma_length = sg->length;
2248 /* it's a non-present to present mapping */
2249 if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2250 start_addr, offset >> VTD_PAGE_SHIFT, 1))
2251 iommu_flush_write_buffer(domain->iommu);
2255 static struct dma_mapping_ops intel_dma_ops = {
2256 .alloc_coherent = intel_alloc_coherent,
2257 .free_coherent = intel_free_coherent,
2258 .map_single = intel_map_single,
2259 .unmap_single = intel_unmap_single,
2260 .map_sg = intel_map_sg,
2261 .unmap_sg = intel_unmap_sg,
2264 static inline int iommu_domain_cache_init(void)
2268 iommu_domain_cache = kmem_cache_create("iommu_domain",
2269 sizeof(struct dmar_domain),
2274 if (!iommu_domain_cache) {
2275 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2282 static inline int iommu_devinfo_cache_init(void)
2286 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2287 sizeof(struct device_domain_info),
2291 if (!iommu_devinfo_cache) {
2292 printk(KERN_ERR "Couldn't create devinfo cache\n");
2299 static inline int iommu_iova_cache_init(void)
2303 iommu_iova_cache = kmem_cache_create("iommu_iova",
2304 sizeof(struct iova),
2308 if (!iommu_iova_cache) {
2309 printk(KERN_ERR "Couldn't create iova cache\n");
2316 static int __init iommu_init_mempool(void)
2319 ret = iommu_iova_cache_init();
2323 ret = iommu_domain_cache_init();
2327 ret = iommu_devinfo_cache_init();
2331 kmem_cache_destroy(iommu_domain_cache);
2333 kmem_cache_destroy(iommu_iova_cache);
2338 static void __init iommu_exit_mempool(void)
2340 kmem_cache_destroy(iommu_devinfo_cache);
2341 kmem_cache_destroy(iommu_domain_cache);
2342 kmem_cache_destroy(iommu_iova_cache);
2346 static void __init init_no_remapping_devices(void)
2348 struct dmar_drhd_unit *drhd;
2350 for_each_drhd_unit(drhd) {
2351 if (!drhd->include_all) {
2353 for (i = 0; i < drhd->devices_cnt; i++)
2354 if (drhd->devices[i] != NULL)
2356 /* ignore DMAR unit if no pci devices exist */
2357 if (i == drhd->devices_cnt)
2365 for_each_drhd_unit(drhd) {
2367 if (drhd->ignored || drhd->include_all)
2370 for (i = 0; i < drhd->devices_cnt; i++)
2371 if (drhd->devices[i] &&
2372 !IS_GFX_DEVICE(drhd->devices[i]))
2375 if (i < drhd->devices_cnt)
2378 /* bypass IOMMU if it is just for gfx devices */
2380 for (i = 0; i < drhd->devices_cnt; i++) {
2381 if (!drhd->devices[i])
2383 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2388 int __init intel_iommu_init(void)
2392 if (dmar_table_init())
2395 if (dmar_dev_scope_init())
2399 * Check the need for DMA-remapping initialization now.
2400 * Above initialization will also be used by Interrupt-remapping.
2402 if (no_iommu || swiotlb || dmar_disabled)
2405 iommu_init_mempool();
2406 dmar_init_reserved_ranges();
2408 init_no_remapping_devices();
2412 printk(KERN_ERR "IOMMU: dmar init failed\n");
2413 put_iova_domain(&reserved_iova_list);
2414 iommu_exit_mempool();
2418 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2420 init_timer(&unmap_timer);
2422 dma_ops = &intel_dma_ops;
2426 void intel_iommu_domain_exit(struct dmar_domain *domain)
2430 /* Domain 0 is reserved, so don't process it */
2434 end = DOMAIN_MAX_ADDR(domain->gaw);
2435 end = end & (~VTD_PAGE_MASK);
2438 dma_pte_clear_range(domain, 0, end);
2440 /* free page tables */
2441 dma_pte_free_pagetable(domain, 0, end);
2443 iommu_free_domain(domain);
2444 free_domain_mem(domain);
2446 EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2448 struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2450 struct dmar_drhd_unit *drhd;
2451 struct dmar_domain *domain;
2452 struct intel_iommu *iommu;
2454 drhd = dmar_find_matched_drhd_unit(pdev);
2456 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2460 iommu = drhd->iommu;
2463 "intel_iommu_domain_alloc: iommu == NULL\n");
2466 domain = iommu_alloc_domain(iommu);
2469 "intel_iommu_domain_alloc: domain == NULL\n");
2472 if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2474 "intel_iommu_domain_alloc: domain_init() failed\n");
2475 intel_iommu_domain_exit(domain);
2480 EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2482 int intel_iommu_context_mapping(
2483 struct dmar_domain *domain, struct pci_dev *pdev)
2486 rc = domain_context_mapping(domain, pdev);
2489 EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2491 int intel_iommu_page_mapping(
2492 struct dmar_domain *domain, dma_addr_t iova,
2493 u64 hpa, size_t size, int prot)
2496 rc = domain_page_mapping(domain, iova, hpa, size, prot);
2499 EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
2501 void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2503 detach_domain_for_dev(domain, bus, devfn);
2505 EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2507 struct dmar_domain *
2508 intel_iommu_find_domain(struct pci_dev *pdev)
2510 return find_domain(pdev);
2512 EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2514 int intel_iommu_found(void)
2516 return g_num_of_iommus;
2518 EXPORT_SYMBOL_GPL(intel_iommu_found);
2520 u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2522 struct dma_pte *pte;
2526 pte = addr_to_dma_pte(domain, iova);
2529 pfn = dma_pte_addr(*pte);
2531 return pfn >> VTD_PAGE_SHIFT;
2533 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);