intel-iommu: Add device info into list before doing context mapping
[firefly-linux-kernel-4.4.55.git] drivers/iommu/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48
49 #define ROOT_SIZE               VTD_PAGE_SIZE
50 #define CONTEXT_SIZE            VTD_PAGE_SIZE
51
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63
64 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
65 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66
67 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
68    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
69 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
70                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
71 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
72
73 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
74 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
75 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
76
77 /* page table handling */
78 #define LEVEL_STRIDE            (9)
79 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
80
81 /*
82  * This bitmap is used to advertise the page sizes our hardware supports
83  * to the IOMMU core, which will then use this information to split
84  * physically contiguous memory regions it is mapping into page sizes
85  * that we support.
86  *
87  * Traditionally the IOMMU core just handed us the mappings directly,
88  * after making sure the size is an order of a 4KiB page and that the
89  * mapping has natural alignment.
90  *
91  * To retain this behavior, we currently advertise that we support
92  * all page sizes that are an order of 4KiB.
93  *
94  * If at some point we'd like to utilize the IOMMU core's new behavior,
95  * we could change this to advertise the real page sizes we support.
96  */
97 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
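/*
 * Worked example (editorial illustration, not part of the original file):
 * ~0xFFFUL clears bits 0-11 and sets every bit from 12 upwards, so the
 * bitmap advertises every power-of-two size that is a multiple of 4KiB:
 * bit 12 -> 4KiB, bit 13 -> 8KiB, ..., bit 21 -> 2MiB, bit 30 -> 1GiB.
 * Advertising only the sizes the hardware can really map would look
 * roughly like BIT(12) | BIT(21) | BIT(30) instead.
 */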
98
99 static inline int agaw_to_level(int agaw)
100 {
101         return agaw + 2;
102 }
103
104 static inline int agaw_to_width(int agaw)
105 {
106         return 30 + agaw * LEVEL_STRIDE;
107 }
108
109 static inline int width_to_agaw(int width)
110 {
111         return (width - 30) / LEVEL_STRIDE;
112 }
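/*
 * Worked example (editorial illustration): with LEVEL_STRIDE == 9,
 * agaw 1 covers 30 + 1*9 = 39 bits of guest address space and uses
 * agaw + 2 = 3 page-table levels; agaw 2 gives the default 48-bit
 * width (DEFAULT_DOMAIN_ADDRESS_WIDTH) with 4 levels.
 * width_to_agaw() is the inverse: width_to_agaw(48) == 2.
 */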
113
114 static inline unsigned int level_to_offset_bits(int level)
115 {
116         return (level - 1) * LEVEL_STRIDE;
117 }
118
119 static inline int pfn_level_offset(unsigned long pfn, int level)
120 {
121         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
122 }
123
124 static inline unsigned long level_mask(int level)
125 {
126         return -1UL << level_to_offset_bits(level);
127 }
128
129 static inline unsigned long level_size(int level)
130 {
131         return 1UL << level_to_offset_bits(level);
132 }
133
134 static inline unsigned long align_to_level(unsigned long pfn, int level)
135 {
136         return (pfn + level_size(level) - 1) & level_mask(level);
137 }
138
139 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
140 {
141         return  1 << ((lvl - 1) * LEVEL_STRIDE);
142 }
143
144 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
145    are never going to work. */
146 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
147 {
148         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
149 }
150
151 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
152 {
153         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
154 }
155 static inline unsigned long page_to_dma_pfn(struct page *pg)
156 {
157         return mm_to_dma_pfn(page_to_pfn(pg));
158 }
159 static inline unsigned long virt_to_dma_pfn(void *p)
160 {
161         return page_to_dma_pfn(virt_to_page(p));
162 }
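/*
 * Editorial note: VTD_PAGE_SHIFT is 12, so on x86 with 4KiB MM pages
 * (PAGE_SHIFT == 12) the conversions above are identities.  Were MM
 * pages hypothetically 64KiB, the shift would be 4 and one MM page
 * would span sixteen 4KiB VT-d pages.
 */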
163
164 /* global iommu list, set NULL for ignored DMAR units */
165 static struct intel_iommu **g_iommus;
166
167 static void __init check_tylersburg_isoch(void);
168 static int rwbf_quirk;
169
170 /*
171  * set to 1 to panic the kernel if VT-d can't be successfully enabled
172  * (used when kernel is launched w/ TXT)
173  */
174 static int force_on = 0;
175
176 /*
177  * 0: Present
178  * 1-11: Reserved
179  * 12-63: Context Ptr (12 - (haw-1))
180  * 64-127: Reserved
181  */
182 struct root_entry {
183         u64     val;
184         u64     rsvd1;
185 };
186 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
187 static inline bool root_present(struct root_entry *root)
188 {
189         return (root->val & 1);
190 }
191 static inline void set_root_present(struct root_entry *root)
192 {
193         root->val |= 1;
194 }
195 static inline void set_root_value(struct root_entry *root, unsigned long value)
196 {
197         root->val |= value & VTD_PAGE_MASK;
198 }
199
200 static inline struct context_entry *
201 get_context_addr_from_root(struct root_entry *root)
202 {
203         return (struct context_entry *)
204                 (root_present(root)?phys_to_virt(
205                 root->val & VTD_PAGE_MASK) :
206                 NULL);
207 }
208
209 /*
210  * low 64 bits:
211  * 0: present
212  * 1: fault processing disable
213  * 2-3: translation type
214  * 12-63: address space root
215  * high 64 bits:
216  * 0-2: address width
217  * 3-6: avail
218  * 8-23: domain id
219  */
220 struct context_entry {
221         u64 lo;
222         u64 hi;
223 };
224
225 static inline bool context_present(struct context_entry *context)
226 {
227         return (context->lo & 1);
228 }
229 static inline void context_set_present(struct context_entry *context)
230 {
231         context->lo |= 1;
232 }
233
234 static inline void context_set_fault_enable(struct context_entry *context)
235 {
236         context->lo &= (((u64)-1) << 2) | 1;
237 }
238
239 static inline void context_set_translation_type(struct context_entry *context,
240                                                 unsigned long value)
241 {
242         context->lo &= (((u64)-1) << 4) | 3;
243         context->lo |= (value & 3) << 2;
244 }
245
246 static inline void context_set_address_root(struct context_entry *context,
247                                             unsigned long value)
248 {
249         context->lo |= value & VTD_PAGE_MASK;
250 }
251
252 static inline void context_set_address_width(struct context_entry *context,
253                                              unsigned long value)
254 {
255         context->hi |= value & 7;
256 }
257
258 static inline void context_set_domain_id(struct context_entry *context,
259                                          unsigned long value)
260 {
261         context->hi |= (value & ((1 << 16) - 1)) << 8;
262 }
263
264 static inline void context_clear_entry(struct context_entry *context)
265 {
266         context->lo = 0;
267         context->hi = 0;
268 }
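/*
 * Usage sketch (editorial illustration, mirroring what
 * domain_context_mapping_one() does further down in this file):
 *
 *	context_set_domain_id(context, domain->id);
 *	context_set_address_root(context, virt_to_phys(domain->pgd));
 *	context_set_address_width(context, iommu->agaw);
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 *
 * followed by a cache flush of the entry so the IOMMU observes it.
 */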
269
270 /*
271  * 0: readable
272  * 1: writable
273  * 2-6: reserved
274  * 7: super page
275  * 8-10: available
276  * 11: snoop behavior
277  * 12-63: Host physical address
278  */
279 struct dma_pte {
280         u64 val;
281 };
282
283 static inline void dma_clear_pte(struct dma_pte *pte)
284 {
285         pte->val = 0;
286 }
287
288 static inline void dma_set_pte_readable(struct dma_pte *pte)
289 {
290         pte->val |= DMA_PTE_READ;
291 }
292
293 static inline void dma_set_pte_writable(struct dma_pte *pte)
294 {
295         pte->val |= DMA_PTE_WRITE;
296 }
297
298 static inline void dma_set_pte_snp(struct dma_pte *pte)
299 {
300         pte->val |= DMA_PTE_SNP;
301 }
302
303 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
304 {
305         pte->val = (pte->val & ~3) | (prot & 3);
306 }
307
308 static inline u64 dma_pte_addr(struct dma_pte *pte)
309 {
310 #ifdef CONFIG_64BIT
311         return pte->val & VTD_PAGE_MASK;
312 #else
313         /* Must have a full atomic 64-bit read */
314         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
315 #endif
316 }
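/*
 * Editorial note: on 32-bit kernels a plain 64-bit load can tear, so
 * cmpxchg64(&pte->val, 0ULL, 0ULL) is used purely for its atomic
 * read-back -- it only stores when the value is already 0, i.e. it
 * never modifies a populated PTE.
 */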
317
318 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
319 {
320         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
321 }
322
323 static inline bool dma_pte_present(struct dma_pte *pte)
324 {
325         return (pte->val & 3) != 0;
326 }
327
328 static inline bool dma_pte_superpage(struct dma_pte *pte)
329 {
330         return (pte->val & (1 << 7));
331 }
332
333 static inline int first_pte_in_page(struct dma_pte *pte)
334 {
335         return !((unsigned long)pte & ~VTD_PAGE_MASK);
336 }
337
338 /*
339  * This domain is a statically identity mapping domain.
340  *      1. This domain creates a static 1:1 mapping to all usable memory.
341  *      2. It maps to each iommu if successful.
342  *      3. Each iommu maps to this domain if successful.
343  */
344 static struct dmar_domain *si_domain;
345 static int hw_pass_through = 1;
346
347 /* devices under the same p2p bridge are owned in one domain */
348 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
349
350 /* domain represents a virtual machine, more than one device
351  * across iommus may be owned in one domain, e.g. kvm guest.
352  */
353 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
354
355 /* si_domain contains multiple devices */
356 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
357
358 /* define the limit of IOMMUs supported in each domain */
359 #ifdef  CONFIG_X86
360 # define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
361 #else
362 # define        IOMMU_UNITS_SUPPORTED   64
363 #endif
364
365 struct dmar_domain {
366         int     id;                     /* domain id */
367         int     nid;                    /* node id */
368         DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
369                                         /* bitmap of iommus this domain uses*/
370
371         struct list_head devices;       /* all devices' list */
372         struct iova_domain iovad;       /* iova's that belong to this domain */
373
374         struct dma_pte  *pgd;           /* virtual address */
375         int             gaw;            /* max guest address width */
376
377         /* adjusted guest address width, 0 is level 2 30-bit */
378         int             agaw;
379
380         int             flags;          /* flags to find out type of domain */
381
382         int             iommu_coherency;/* indicate coherency of iommu access */
383         int             iommu_snooping; /* indicate snooping control feature*/
384         int             iommu_count;    /* reference count of iommu */
385         int             iommu_superpage;/* Level of superpages supported:
386                                            0 == 4KiB (no superpages), 1 == 2MiB,
387                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
388         spinlock_t      iommu_lock;     /* protect iommu set in domain */
389         u64             max_addr;       /* maximum mapped address */
390 };
391
392 /* PCI domain-device relationship */
393 struct device_domain_info {
394         struct list_head link;  /* link to domain siblings */
395         struct list_head global; /* link to global list */
396         int segment;            /* PCI domain */
397         u8 bus;                 /* PCI bus number */
398         u8 devfn;               /* PCI devfn number */
399         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
400         struct intel_iommu *iommu; /* IOMMU used by this device */
401         struct dmar_domain *domain; /* pointer to domain */
402 };
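/*
 * Editorial note: 'link' chains this info into the owning
 * dmar_domain's ->devices list, while 'global' chains it into the
 * file-wide device_domain_list; both lists are walked under
 * device_domain_lock.
 */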
403
404 static void flush_unmaps_timeout(unsigned long data);
405
406 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
407
408 #define HIGH_WATER_MARK 250
409 struct deferred_flush_tables {
410         int next;
411         struct iova *iova[HIGH_WATER_MARK];
412         struct dmar_domain *domain[HIGH_WATER_MARK];
413 };
414
415 static struct deferred_flush_tables *deferred_flush;
416
417 /* bitmap for indexing intel_iommus */
418 static int g_num_of_iommus;
419
420 static DEFINE_SPINLOCK(async_umap_flush_lock);
421 static LIST_HEAD(unmaps_to_do);
422
423 static int timer_on;
424 static long list_size;
425
426 static void domain_remove_dev_info(struct dmar_domain *domain);
427
428 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
429 int dmar_disabled = 0;
430 #else
431 int dmar_disabled = 1;
432 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
433
434 int intel_iommu_enabled = 0;
435 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
436
437 static int dmar_map_gfx = 1;
438 static int dmar_forcedac;
439 static int intel_iommu_strict;
440 static int intel_iommu_superpage = 1;
441
442 int intel_iommu_gfx_mapped;
443 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
444
445 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
446 static DEFINE_SPINLOCK(device_domain_lock);
447 static LIST_HEAD(device_domain_list);
448
449 static struct iommu_ops intel_iommu_ops;
450
451 static int __init intel_iommu_setup(char *str)
452 {
453         if (!str)
454                 return -EINVAL;
455         while (*str) {
456                 if (!strncmp(str, "on", 2)) {
457                         dmar_disabled = 0;
458                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
459                 } else if (!strncmp(str, "off", 3)) {
460                         dmar_disabled = 1;
461                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
462                 } else if (!strncmp(str, "igfx_off", 8)) {
463                         dmar_map_gfx = 0;
464                         printk(KERN_INFO
465                                 "Intel-IOMMU: disable GFX device mapping\n");
466                 } else if (!strncmp(str, "forcedac", 8)) {
467                         printk(KERN_INFO
468                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
469                         dmar_forcedac = 1;
470                 } else if (!strncmp(str, "strict", 6)) {
471                         printk(KERN_INFO
472                                 "Intel-IOMMU: disable batched IOTLB flush\n");
473                         intel_iommu_strict = 1;
474                 } else if (!strncmp(str, "sp_off", 6)) {
475                         printk(KERN_INFO
476                                 "Intel-IOMMU: disable supported super page\n");
477                         intel_iommu_superpage = 0;
478                 }
479
480                 str += strcspn(str, ",");
481                 while (*str == ',')
482                         str++;
483         }
484         return 0;
485 }
486 __setup("intel_iommu=", intel_iommu_setup);
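/*
 * Example (editorial illustration): options are comma separated, so
 * booting with "intel_iommu=on,strict,sp_off" enables the IOMMU,
 * disables batched IOTLB flushing and disables super-page support in
 * one go.
 */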
487
488 static struct kmem_cache *iommu_domain_cache;
489 static struct kmem_cache *iommu_devinfo_cache;
490 static struct kmem_cache *iommu_iova_cache;
491
492 static inline void *alloc_pgtable_page(int node)
493 {
494         struct page *page;
495         void *vaddr = NULL;
496
497         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
498         if (page)
499                 vaddr = page_address(page);
500         return vaddr;
501 }
502
503 static inline void free_pgtable_page(void *vaddr)
504 {
505         free_page((unsigned long)vaddr);
506 }
507
508 static inline void *alloc_domain_mem(void)
509 {
510         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
511 }
512
513 static void free_domain_mem(void *vaddr)
514 {
515         kmem_cache_free(iommu_domain_cache, vaddr);
516 }
517
518 static inline void * alloc_devinfo_mem(void)
519 {
520         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
521 }
522
523 static inline void free_devinfo_mem(void *vaddr)
524 {
525         kmem_cache_free(iommu_devinfo_cache, vaddr);
526 }
527
528 struct iova *alloc_iova_mem(void)
529 {
530         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
531 }
532
533 void free_iova_mem(struct iova *iova)
534 {
535         kmem_cache_free(iommu_iova_cache, iova);
536 }
537
538
539 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
540 {
541         unsigned long sagaw;
542         int agaw = -1;
543
544         sagaw = cap_sagaw(iommu->cap);
545         for (agaw = width_to_agaw(max_gaw);
546              agaw >= 0; agaw--) {
547                 if (test_bit(agaw, &sagaw))
548                         break;
549         }
550
551         return agaw;
552 }
553
554 /*
555  * Calculate max SAGAW for each iommu.
556  */
557 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
558 {
559         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
560 }
561
562 /*
563  * calculate agaw for each iommu.
564  * "SAGAW" may be different across iommus, use a default agaw, and
565  * get a supported less agaw for iommus that don't support the default agaw.
566  */
567 int iommu_calculate_agaw(struct intel_iommu *iommu)
568 {
569         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
570 }
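/*
 * Worked example (editorial illustration): with the default 48-bit
 * domain width, __iommu_calculate_agaw() starts at width_to_agaw(48)
 * == 2 (4-level tables) and walks downwards until it finds a bit set
 * in the hardware SAGAW field, so an IOMMU that only supports 3-level
 * tables ends up with agaw 1, i.e. a 39-bit address width.
 */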
571
572 /* This function only returns a single iommu in a domain */
573 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
574 {
575         int iommu_id;
576
577         /* si_domain and vm domain should not get here. */
578         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
579         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
580
581         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
582         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
583                 return NULL;
584
585         return g_iommus[iommu_id];
586 }
587
588 static void domain_update_iommu_coherency(struct dmar_domain *domain)
589 {
590         int i;
591
592         domain->iommu_coherency = 1;
593
594         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
595                 if (!ecap_coherent(g_iommus[i]->ecap)) {
596                         domain->iommu_coherency = 0;
597                         break;
598                 }
599         }
600 }
601
602 static void domain_update_iommu_snooping(struct dmar_domain *domain)
603 {
604         int i;
605
606         domain->iommu_snooping = 1;
607
608         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
609                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
610                         domain->iommu_snooping = 0;
611                         break;
612                 }
613         }
614 }
615
616 static void domain_update_iommu_superpage(struct dmar_domain *domain)
617 {
618         struct dmar_drhd_unit *drhd;
619         struct intel_iommu *iommu = NULL;
620         int mask = 0xf;
621
622         if (!intel_iommu_superpage) {
623                 domain->iommu_superpage = 0;
624                 return;
625         }
626
627         /* set iommu_superpage to the smallest common denominator */
628         for_each_active_iommu(iommu, drhd) {
629                 mask &= cap_super_page_val(iommu->cap);
630                 if (!mask) {
631                         break;
632                 }
633         }
634         domain->iommu_superpage = fls(mask);
635 }
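/*
 * Worked example (editorial illustration): cap_super_page_val() is a
 * per-IOMMU bitmask (bit 0 == 2MiB, bit 1 == 1GiB).  If every active
 * IOMMU reports 0x3, fls(0x3) == 2 and the domain may use up to 1GiB
 * superpages; if any IOMMU reports 0, the AND collapses the mask and
 * the domain falls back to 4KiB pages only.
 */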
636
637 /* Some capabilities may be different across iommus */
638 static void domain_update_iommu_cap(struct dmar_domain *domain)
639 {
640         domain_update_iommu_coherency(domain);
641         domain_update_iommu_snooping(domain);
642         domain_update_iommu_superpage(domain);
643 }
644
645 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
646 {
647         struct dmar_drhd_unit *drhd = NULL;
648         int i;
649
650         for_each_drhd_unit(drhd) {
651                 if (drhd->ignored)
652                         continue;
653                 if (segment != drhd->segment)
654                         continue;
655
656                 for (i = 0; i < drhd->devices_cnt; i++) {
657                         if (drhd->devices[i] &&
658                             drhd->devices[i]->bus->number == bus &&
659                             drhd->devices[i]->devfn == devfn)
660                                 return drhd->iommu;
661                         if (drhd->devices[i] &&
662                             drhd->devices[i]->subordinate &&
663                             drhd->devices[i]->subordinate->number <= bus &&
664                             drhd->devices[i]->subordinate->subordinate >= bus)
665                                 return drhd->iommu;
666                 }
667
668                 if (drhd->include_all)
669                         return drhd->iommu;
670         }
671
672         return NULL;
673 }
674
675 static void domain_flush_cache(struct dmar_domain *domain,
676                                void *addr, int size)
677 {
678         if (!domain->iommu_coherency)
679                 clflush_cache_range(addr, size);
680 }
681
682 /* Gets context entry for a given bus and devfn */
683 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
684                 u8 bus, u8 devfn)
685 {
686         struct root_entry *root;
687         struct context_entry *context;
688         unsigned long phy_addr;
689         unsigned long flags;
690
691         spin_lock_irqsave(&iommu->lock, flags);
692         root = &iommu->root_entry[bus];
693         context = get_context_addr_from_root(root);
694         if (!context) {
695                 context = (struct context_entry *)
696                                 alloc_pgtable_page(iommu->node);
697                 if (!context) {
698                         spin_unlock_irqrestore(&iommu->lock, flags);
699                         return NULL;
700                 }
701                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
702                 phy_addr = virt_to_phys((void *)context);
703                 set_root_value(root, phy_addr);
704                 set_root_present(root);
705                 __iommu_flush_cache(iommu, root, sizeof(*root));
706         }
707         spin_unlock_irqrestore(&iommu->lock, flags);
708         return &context[devfn];
709 }
710
711 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
712 {
713         struct root_entry *root;
714         struct context_entry *context;
715         int ret;
716         unsigned long flags;
717
718         spin_lock_irqsave(&iommu->lock, flags);
719         root = &iommu->root_entry[bus];
720         context = get_context_addr_from_root(root);
721         if (!context) {
722                 ret = 0;
723                 goto out;
724         }
725         ret = context_present(&context[devfn]);
726 out:
727         spin_unlock_irqrestore(&iommu->lock, flags);
728         return ret;
729 }
730
731 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
732 {
733         struct root_entry *root;
734         struct context_entry *context;
735         unsigned long flags;
736
737         spin_lock_irqsave(&iommu->lock, flags);
738         root = &iommu->root_entry[bus];
739         context = get_context_addr_from_root(root);
740         if (context) {
741                 context_clear_entry(&context[devfn]);
742                 __iommu_flush_cache(iommu, &context[devfn], \
743                         sizeof(*context));
744         }
745         spin_unlock_irqrestore(&iommu->lock, flags);
746 }
747
748 static void free_context_table(struct intel_iommu *iommu)
749 {
750         struct root_entry *root;
751         int i;
752         unsigned long flags;
753         struct context_entry *context;
754
755         spin_lock_irqsave(&iommu->lock, flags);
756         if (!iommu->root_entry) {
757                 goto out;
758         }
759         for (i = 0; i < ROOT_ENTRY_NR; i++) {
760                 root = &iommu->root_entry[i];
761                 context = get_context_addr_from_root(root);
762                 if (context)
763                         free_pgtable_page(context);
764         }
765         free_pgtable_page(iommu->root_entry);
766         iommu->root_entry = NULL;
767 out:
768         spin_unlock_irqrestore(&iommu->lock, flags);
769 }
770
771 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
772                                       unsigned long pfn, int target_level)
773 {
774         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
775         struct dma_pte *parent, *pte = NULL;
776         int level = agaw_to_level(domain->agaw);
777         int offset;
778
779         BUG_ON(!domain->pgd);
780         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
781         parent = domain->pgd;
782
783         while (level > 0) {
784                 void *tmp_page;
785
786                 offset = pfn_level_offset(pfn, level);
787                 pte = &parent[offset];
788                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
789                         break;
790                 if (level == target_level)
791                         break;
792
793                 if (!dma_pte_present(pte)) {
794                         uint64_t pteval;
795
796                         tmp_page = alloc_pgtable_page(domain->nid);
797
798                         if (!tmp_page)
799                                 return NULL;
800
801                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
802                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
803                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
804                                 /* Someone else set it while we were thinking; use theirs. */
805                                 free_pgtable_page(tmp_page);
806                         } else {
807                                 dma_pte_addr(pte);
808                                 domain_flush_cache(domain, pte, sizeof(*pte));
809                         }
810                 }
811                 parent = phys_to_virt(dma_pte_addr(pte));
812                 level--;
813         }
814
815         return pte;
816 }
817
818
819 /* return address's pte at specific level */
820 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
821                                          unsigned long pfn,
822                                          int level, int *large_page)
823 {
824         struct dma_pte *parent, *pte = NULL;
825         int total = agaw_to_level(domain->agaw);
826         int offset;
827
828         parent = domain->pgd;
829         while (level <= total) {
830                 offset = pfn_level_offset(pfn, total);
831                 pte = &parent[offset];
832                 if (level == total)
833                         return pte;
834
835                 if (!dma_pte_present(pte)) {
836                         *large_page = total;
837                         break;
838                 }
839
840                 if (pte->val & DMA_PTE_LARGE_PAGE) {
841                         *large_page = total;
842                         return pte;
843                 }
844
845                 parent = phys_to_virt(dma_pte_addr(pte));
846                 total--;
847         }
848         return NULL;
849 }
850
851 /* clear last level pte, a tlb flush should be followed */
852 static int dma_pte_clear_range(struct dmar_domain *domain,
853                                 unsigned long start_pfn,
854                                 unsigned long last_pfn)
855 {
856         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
857         unsigned int large_page = 1;
858         struct dma_pte *first_pte, *pte;
859         int order;
860
861         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
862         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
863         BUG_ON(start_pfn > last_pfn);
864
865         /* we don't need lock here; nobody else touches the iova range */
866         do {
867                 large_page = 1;
868                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
869                 if (!pte) {
870                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
871                         continue;
872                 }
873                 do {
874                         dma_clear_pte(pte);
875                         start_pfn += lvl_to_nr_pages(large_page);
876                         pte++;
877                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
878
879                 domain_flush_cache(domain, first_pte,
880                                    (void *)pte - (void *)first_pte);
881
882         } while (start_pfn && start_pfn <= last_pfn);
883
884         order = (large_page - 1) * 9;
885         return order;
886 }
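/*
 * Editorial note: the returned 'order' describes the size of the last
 * PTE cleared in units of 4KiB pages: a 4KiB leaf (large_page == 1)
 * yields order 0, a 2MiB superpage (large_page == 2) yields order 9.
 */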
887
888 /* free page table pages. last level pte should already be cleared */
889 static void dma_pte_free_pagetable(struct dmar_domain *domain,
890                                    unsigned long start_pfn,
891                                    unsigned long last_pfn)
892 {
893         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
894         struct dma_pte *first_pte, *pte;
895         int total = agaw_to_level(domain->agaw);
896         int level;
897         unsigned long tmp;
898         int large_page = 2;
899
900         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
901         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
902         BUG_ON(start_pfn > last_pfn);
903
904         /* We don't need lock here; nobody else touches the iova range */
905         level = 2;
906         while (level <= total) {
907                 tmp = align_to_level(start_pfn, level);
908
909                 /* If we can't even clear one PTE at this level, we're done */
910                 if (tmp + level_size(level) - 1 > last_pfn)
911                         return;
912
913                 do {
914                         large_page = level;
915                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
916                         if (large_page > level)
917                                 level = large_page + 1;
918                         if (!pte) {
919                                 tmp = align_to_level(tmp + 1, level + 1);
920                                 continue;
921                         }
922                         do {
923                                 if (dma_pte_present(pte)) {
924                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
925                                         dma_clear_pte(pte);
926                                 }
927                                 pte++;
928                                 tmp += level_size(level);
929                         } while (!first_pte_in_page(pte) &&
930                                  tmp + level_size(level) - 1 <= last_pfn);
931
932                         domain_flush_cache(domain, first_pte,
933                                            (void *)pte - (void *)first_pte);
934                         
935                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
936                 level++;
937         }
938         /* free pgd */
939         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
940                 free_pgtable_page(domain->pgd);
941                 domain->pgd = NULL;
942         }
943 }
944
945 /* iommu handling */
946 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
947 {
948         struct root_entry *root;
949         unsigned long flags;
950
951         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
952         if (!root)
953                 return -ENOMEM;
954
955         __iommu_flush_cache(iommu, root, ROOT_SIZE);
956
957         spin_lock_irqsave(&iommu->lock, flags);
958         iommu->root_entry = root;
959         spin_unlock_irqrestore(&iommu->lock, flags);
960
961         return 0;
962 }
963
964 static void iommu_set_root_entry(struct intel_iommu *iommu)
965 {
966         void *addr;
967         u32 sts;
968         unsigned long flag;
969
970         addr = iommu->root_entry;
971
972         raw_spin_lock_irqsave(&iommu->register_lock, flag);
973         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
974
975         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
976
977         /* Make sure hardware complete it */
978         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
979                       readl, (sts & DMA_GSTS_RTPS), sts);
980
981         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
982 }
983
984 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
985 {
986         u32 val;
987         unsigned long flag;
988
989         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
990                 return;
991
992         raw_spin_lock_irqsave(&iommu->register_lock, flag);
993         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
994
995         /* Make sure hardware complete it */
996         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
997                       readl, (!(val & DMA_GSTS_WBFS)), val);
998
999         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1000 }
1001
1002 /* return value determines if we need a write buffer flush */
1003 static void __iommu_flush_context(struct intel_iommu *iommu,
1004                                   u16 did, u16 source_id, u8 function_mask,
1005                                   u64 type)
1006 {
1007         u64 val = 0;
1008         unsigned long flag;
1009
1010         switch (type) {
1011         case DMA_CCMD_GLOBAL_INVL:
1012                 val = DMA_CCMD_GLOBAL_INVL;
1013                 break;
1014         case DMA_CCMD_DOMAIN_INVL:
1015                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1016                 break;
1017         case DMA_CCMD_DEVICE_INVL:
1018                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1019                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1020                 break;
1021         default:
1022                 BUG();
1023         }
1024         val |= DMA_CCMD_ICC;
1025
1026         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1027         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1028
1029         /* Make sure hardware complete it */
1030         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1031                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1032
1033         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1034 }
1035
1036 /* return value determines if we need a write buffer flush */
1037 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1038                                 u64 addr, unsigned int size_order, u64 type)
1039 {
1040         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1041         u64 val = 0, val_iva = 0;
1042         unsigned long flag;
1043
1044         switch (type) {
1045         case DMA_TLB_GLOBAL_FLUSH:
1046                 /* global flush doesn't need to set IVA_REG */
1047                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1048                 break;
1049         case DMA_TLB_DSI_FLUSH:
1050                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1051                 break;
1052         case DMA_TLB_PSI_FLUSH:
1053                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1054                 /* Note: always flush non-leaf currently */
1055                 val_iva = size_order | addr;
1056                 break;
1057         default:
1058                 BUG();
1059         }
1060         /* Note: set drain read/write */
1061 #if 0
1062         /*
1063          * This is probably meant to be extra safe.  Looks like we can
1064          * ignore it without any impact.
1065          */
1066         if (cap_read_drain(iommu->cap))
1067                 val |= DMA_TLB_READ_DRAIN;
1068 #endif
1069         if (cap_write_drain(iommu->cap))
1070                 val |= DMA_TLB_WRITE_DRAIN;
1071
1072         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1073         /* Note: Only uses first TLB reg currently */
1074         if (val_iva)
1075                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1076         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1077
1078         /* Make sure hardware complete it */
1079         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1080                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1081
1082         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1083
1084         /* check IOTLB invalidation granularity */
1085         if (DMA_TLB_IAIG(val) == 0)
1086                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1087         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1088                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1089                         (unsigned long long)DMA_TLB_IIRG(type),
1090                         (unsigned long long)DMA_TLB_IAIG(val));
1091 }
1092
1093 static struct device_domain_info *iommu_support_dev_iotlb(
1094         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1095 {
1096         int found = 0;
1097         unsigned long flags;
1098         struct device_domain_info *info;
1099         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1100
1101         if (!ecap_dev_iotlb_support(iommu->ecap))
1102                 return NULL;
1103
1104         if (!iommu->qi)
1105                 return NULL;
1106
1107         spin_lock_irqsave(&device_domain_lock, flags);
1108         list_for_each_entry(info, &domain->devices, link)
1109                 if (info->bus == bus && info->devfn == devfn) {
1110                         found = 1;
1111                         break;
1112                 }
1113         spin_unlock_irqrestore(&device_domain_lock, flags);
1114
1115         if (!found || !info->dev)
1116                 return NULL;
1117
1118         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1119                 return NULL;
1120
1121         if (!dmar_find_matched_atsr_unit(info->dev))
1122                 return NULL;
1123
1124         info->iommu = iommu;
1125
1126         return info;
1127 }
1128
1129 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1130 {
1131         if (!info)
1132                 return;
1133
1134         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1135 }
1136
1137 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1138 {
1139         if (!info->dev || !pci_ats_enabled(info->dev))
1140                 return;
1141
1142         pci_disable_ats(info->dev);
1143 }
1144
1145 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1146                                   u64 addr, unsigned mask)
1147 {
1148         u16 sid, qdep;
1149         unsigned long flags;
1150         struct device_domain_info *info;
1151
1152         spin_lock_irqsave(&device_domain_lock, flags);
1153         list_for_each_entry(info, &domain->devices, link) {
1154                 if (!info->dev || !pci_ats_enabled(info->dev))
1155                         continue;
1156
1157                 sid = info->bus << 8 | info->devfn;
1158                 qdep = pci_ats_queue_depth(info->dev);
1159                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1160         }
1161         spin_unlock_irqrestore(&device_domain_lock, flags);
1162 }
1163
1164 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1165                                   unsigned long pfn, unsigned int pages, int map)
1166 {
1167         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1168         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1169
1170         BUG_ON(pages == 0);
1171
1172         /*
1173          * Fallback to domain selective flush if no PSI support or the size is
1174          * too big.
1175          * PSI requires page size to be 2 ^ x, and the base address is naturally
1176          * aligned to the size
1177          */
1178         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1179                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1180                                                 DMA_TLB_DSI_FLUSH);
1181         else
1182                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1183                                                 DMA_TLB_PSI_FLUSH);
1184
1185         /*
1186          * In caching mode, changes of pages from non-present to present require
1187          * flush. However, device IOTLB doesn't need to be flushed in this case.
1188          */
1189         if (!cap_caching_mode(iommu->cap) || !map)
1190                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1191 }
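/*
 * Worked example (editorial illustration): flushing 300 pages rounds
 * up to 512, so mask = ilog2(512) = 9 and the PSI covers a naturally
 * aligned 2MiB window (512 * 4KiB).  If mask exceeded
 * cap_max_amask_val(), the code above would fall back to a
 * domain-selective flush instead.
 */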
1192
1193 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1194 {
1195         u32 pmen;
1196         unsigned long flags;
1197
1198         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1199         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1200         pmen &= ~DMA_PMEN_EPM;
1201         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1202
1203         /* wait for the protected region status bit to clear */
1204         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1205                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1206
1207         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1208 }
1209
1210 static int iommu_enable_translation(struct intel_iommu *iommu)
1211 {
1212         u32 sts;
1213         unsigned long flags;
1214
1215         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1216         iommu->gcmd |= DMA_GCMD_TE;
1217         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1218
1219         /* Make sure hardware complete it */
1220         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1221                       readl, (sts & DMA_GSTS_TES), sts);
1222
1223         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1224         return 0;
1225 }
1226
1227 static int iommu_disable_translation(struct intel_iommu *iommu)
1228 {
1229         u32 sts;
1230         unsigned long flag;
1231
1232         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1233         iommu->gcmd &= ~DMA_GCMD_TE;
1234         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1235
1236         /* Make sure hardware complete it */
1237         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1238                       readl, (!(sts & DMA_GSTS_TES)), sts);
1239
1240         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1241         return 0;
1242 }
1243
1244
1245 static int iommu_init_domains(struct intel_iommu *iommu)
1246 {
1247         unsigned long ndomains;
1248         unsigned long nlongs;
1249
1250         ndomains = cap_ndoms(iommu->cap);
1251         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1252                         ndomains);
1253         nlongs = BITS_TO_LONGS(ndomains);
1254
1255         spin_lock_init(&iommu->lock);
1256
1257         /* TBD: there might be 64K domains,
1258          * consider other allocation for future chip
1259          */
1260         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1261         if (!iommu->domain_ids) {
1262                 printk(KERN_ERR "Allocating domain id array failed\n");
1263                 return -ENOMEM;
1264         }
1265         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1266                         GFP_KERNEL);
1267         if (!iommu->domains) {
1268                 printk(KERN_ERR "Allocating domain array failed\n");
1269                 return -ENOMEM;
1270         }
1271
1272         /*
1273          * if Caching mode is set, then invalid translations are tagged
1274          * with domainid 0. Hence we need to pre-allocate it.
1275          */
1276         if (cap_caching_mode(iommu->cap))
1277                 set_bit(0, iommu->domain_ids);
1278         return 0;
1279 }
1280
1281
1282 static void domain_exit(struct dmar_domain *domain);
1283 static void vm_domain_exit(struct dmar_domain *domain);
1284
1285 void free_dmar_iommu(struct intel_iommu *iommu)
1286 {
1287         struct dmar_domain *domain;
1288         int i;
1289         unsigned long flags;
1290
1291         if ((iommu->domains) && (iommu->domain_ids)) {
1292                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1293                         domain = iommu->domains[i];
1294                         clear_bit(i, iommu->domain_ids);
1295
1296                         spin_lock_irqsave(&domain->iommu_lock, flags);
1297                         if (--domain->iommu_count == 0) {
1298                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1299                                         vm_domain_exit(domain);
1300                                 else
1301                                         domain_exit(domain);
1302                         }
1303                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1304                 }
1305         }
1306
1307         if (iommu->gcmd & DMA_GCMD_TE)
1308                 iommu_disable_translation(iommu);
1309
1310         if (iommu->irq) {
1311                 irq_set_handler_data(iommu->irq, NULL);
1312                 /* This will mask the irq */
1313                 free_irq(iommu->irq, iommu);
1314                 destroy_irq(iommu->irq);
1315         }
1316
1317         kfree(iommu->domains);
1318         kfree(iommu->domain_ids);
1319
1320         g_iommus[iommu->seq_id] = NULL;
1321
1322         /* if all iommus are freed, free g_iommus */
1323         for (i = 0; i < g_num_of_iommus; i++) {
1324                 if (g_iommus[i])
1325                         break;
1326         }
1327
1328         if (i == g_num_of_iommus)
1329                 kfree(g_iommus);
1330
1331         /* free context mapping */
1332         free_context_table(iommu);
1333 }
1334
1335 static struct dmar_domain *alloc_domain(void)
1336 {
1337         struct dmar_domain *domain;
1338
1339         domain = alloc_domain_mem();
1340         if (!domain)
1341                 return NULL;
1342
1343         domain->nid = -1;
1344         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1345         domain->flags = 0;
1346
1347         return domain;
1348 }
1349
1350 static int iommu_attach_domain(struct dmar_domain *domain,
1351                                struct intel_iommu *iommu)
1352 {
1353         int num;
1354         unsigned long ndomains;
1355         unsigned long flags;
1356
1357         ndomains = cap_ndoms(iommu->cap);
1358
1359         spin_lock_irqsave(&iommu->lock, flags);
1360
1361         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1362         if (num >= ndomains) {
1363                 spin_unlock_irqrestore(&iommu->lock, flags);
1364                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1365                 return -ENOMEM;
1366         }
1367
1368         domain->id = num;
1369         set_bit(num, iommu->domain_ids);
1370         set_bit(iommu->seq_id, domain->iommu_bmp);
1371         iommu->domains[num] = domain;
1372         spin_unlock_irqrestore(&iommu->lock, flags);
1373
1374         return 0;
1375 }
1376
1377 static void iommu_detach_domain(struct dmar_domain *domain,
1378                                 struct intel_iommu *iommu)
1379 {
1380         unsigned long flags;
1381         int num, ndomains;
1382         int found = 0;
1383
1384         spin_lock_irqsave(&iommu->lock, flags);
1385         ndomains = cap_ndoms(iommu->cap);
1386         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1387                 if (iommu->domains[num] == domain) {
1388                         found = 1;
1389                         break;
1390                 }
1391         }
1392
1393         if (found) {
1394                 clear_bit(num, iommu->domain_ids);
1395                 clear_bit(iommu->seq_id, domain->iommu_bmp);
1396                 iommu->domains[num] = NULL;
1397         }
1398         spin_unlock_irqrestore(&iommu->lock, flags);
1399 }
1400
1401 static struct iova_domain reserved_iova_list;
1402 static struct lock_class_key reserved_rbtree_key;
1403
1404 static int dmar_init_reserved_ranges(void)
1405 {
1406         struct pci_dev *pdev = NULL;
1407         struct iova *iova;
1408         int i;
1409
1410         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1411
1412         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1413                 &reserved_rbtree_key);
1414
1415         /* IOAPIC ranges shouldn't be accessed by DMA */
1416         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1417                 IOVA_PFN(IOAPIC_RANGE_END));
1418         if (!iova) {
1419                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1420                 return -ENODEV;
1421         }
1422
1423         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1424         for_each_pci_dev(pdev) {
1425                 struct resource *r;
1426
1427                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1428                         r = &pdev->resource[i];
1429                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1430                                 continue;
1431                         iova = reserve_iova(&reserved_iova_list,
1432                                             IOVA_PFN(r->start),
1433                                             IOVA_PFN(r->end));
1434                         if (!iova) {
1435                                 printk(KERN_ERR "Reserve iova failed\n");
1436                                 return -ENODEV;
1437                         }
1438                 }
1439         }
1440         return 0;
1441 }
1442
1443 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1444 {
1445         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1446 }
1447
1448 static inline int guestwidth_to_adjustwidth(int gaw)
1449 {
1450         int agaw;
1451         int r = (gaw - 12) % 9;
1452
1453         if (r == 0)
1454                 agaw = gaw;
1455         else
1456                 agaw = gaw + 9 - r;
1457         if (agaw > 64)
1458                 agaw = 64;
1459         return agaw;
1460 }
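/*
 * Worked example (editorial illustration): the adjusted width rounds
 * the guest width up to the next 9-bit page-table step above the
 * 12-bit page offset, so gaw 48 stays 48 ((48-12) % 9 == 0) while
 * gaw 36 becomes 36 + 9 - 6 = 39; anything above 64 is clamped to 64.
 */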
1461
1462 static int domain_init(struct dmar_domain *domain, int guest_width)
1463 {
1464         struct intel_iommu *iommu;
1465         int adjust_width, agaw;
1466         unsigned long sagaw;
1467
1468         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1469         spin_lock_init(&domain->iommu_lock);
1470
1471         domain_reserve_special_ranges(domain);
1472
1473         /* calculate AGAW */
1474         iommu = domain_get_iommu(domain);
1475         if (guest_width > cap_mgaw(iommu->cap))
1476                 guest_width = cap_mgaw(iommu->cap);
1477         domain->gaw = guest_width;
1478         adjust_width = guestwidth_to_adjustwidth(guest_width);
1479         agaw = width_to_agaw(adjust_width);
1480         sagaw = cap_sagaw(iommu->cap);
1481         if (!test_bit(agaw, &sagaw)) {
1482                 /* hardware doesn't support it, choose a bigger one */
1483                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1484                 agaw = find_next_bit(&sagaw, 5, agaw);
1485                 if (agaw >= 5)
1486                         return -ENODEV;
1487         }
1488         domain->agaw = agaw;
1489         INIT_LIST_HEAD(&domain->devices);
1490
1491         if (ecap_coherent(iommu->ecap))
1492                 domain->iommu_coherency = 1;
1493         else
1494                 domain->iommu_coherency = 0;
1495
1496         if (ecap_sc_support(iommu->ecap))
1497                 domain->iommu_snooping = 1;
1498         else
1499                 domain->iommu_snooping = 0;
1500
1501         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1502         domain->iommu_count = 1;
1503         domain->nid = iommu->node;
1504
1505         /* always allocate the top pgd */
1506         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1507         if (!domain->pgd)
1508                 return -ENOMEM;
1509         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1510         return 0;
1511 }
1512
1513 static void domain_exit(struct dmar_domain *domain)
1514 {
1515         struct dmar_drhd_unit *drhd;
1516         struct intel_iommu *iommu;
1517
1518         /* Domain 0 is reserved, so don't process it */
1519         if (!domain)
1520                 return;
1521
1522         /* Flush any lazy unmaps that may reference this domain */
1523         if (!intel_iommu_strict)
1524                 flush_unmaps_timeout(0);
1525
1526         domain_remove_dev_info(domain);
1527         /* destroy iovas */
1528         put_iova_domain(&domain->iovad);
1529
1530         /* clear ptes */
1531         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1532
1533         /* free page tables */
1534         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1535
1536         for_each_active_iommu(iommu, drhd)
1537                 if (test_bit(iommu->seq_id, domain->iommu_bmp))
1538                         iommu_detach_domain(domain, iommu);
1539
1540         free_domain_mem(domain);
1541 }
1542
1543 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1544                                  u8 bus, u8 devfn, int translation)
1545 {
1546         struct context_entry *context;
1547         unsigned long flags;
1548         struct intel_iommu *iommu;
1549         struct dma_pte *pgd;
1550         unsigned long num;
1551         unsigned long ndomains;
1552         int id;
1553         int agaw;
1554         struct device_domain_info *info = NULL;
1555
1556         pr_debug("Set context mapping for %02x:%02x.%d\n",
1557                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1558
1559         BUG_ON(!domain->pgd);
1560         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1561                translation != CONTEXT_TT_MULTI_LEVEL);
1562
1563         iommu = device_to_iommu(segment, bus, devfn);
1564         if (!iommu)
1565                 return -ENODEV;
1566
1567         context = device_to_context_entry(iommu, bus, devfn);
1568         if (!context)
1569                 return -ENOMEM;
1570         spin_lock_irqsave(&iommu->lock, flags);
1571         if (context_present(context)) {
1572                 spin_unlock_irqrestore(&iommu->lock, flags);
1573                 return 0;
1574         }
1575
1576         id = domain->id;
1577         pgd = domain->pgd;
1578
1579         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1580             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1581                 int found = 0;
1582
1583                 /* find an available domain id for this device in iommu */
1584                 ndomains = cap_ndoms(iommu->cap);
1585                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1586                         if (iommu->domains[num] == domain) {
1587                                 id = num;
1588                                 found = 1;
1589                                 break;
1590                         }
1591                 }
1592
1593                 if (found == 0) {
1594                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1595                         if (num >= ndomains) {
1596                                 spin_unlock_irqrestore(&iommu->lock, flags);
1597                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1598                                 return -EFAULT;
1599                         }
1600
1601                         set_bit(num, iommu->domain_ids);
1602                         iommu->domains[num] = domain;
1603                         id = num;
1604                 }
1605
1606                 /* Skip top levels of page tables for
1607                  * an iommu which has less agaw than the default.
1608                  * Unnecessary for PT mode.
1609                  */
1610                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1611                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1612                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1613                                 if (!dma_pte_present(pgd)) {
1614                                         spin_unlock_irqrestore(&iommu->lock, flags);
1615                                         return -ENOMEM;
1616                                 }
1617                         }
1618                 }
1619         }
1620
1621         context_set_domain_id(context, id);
1622
1623         if (translation != CONTEXT_TT_PASS_THROUGH) {
1624                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1625                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1626                                      CONTEXT_TT_MULTI_LEVEL;
1627         }
1628         /*
1629          * In pass through mode, AW must be programmed to indicate the largest
1630          * AGAW value supported by hardware. And ASR is ignored by hardware.
1631          */
1632         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1633                 context_set_address_width(context, iommu->msagaw);
1634         else {
1635                 context_set_address_root(context, virt_to_phys(pgd));
1636                 context_set_address_width(context, iommu->agaw);
1637         }
1638
1639         context_set_translation_type(context, translation);
1640         context_set_fault_enable(context);
1641         context_set_present(context);
1642         domain_flush_cache(domain, context, sizeof(*context));
1643
1644         /*
1645          * It's a non-present to present mapping. If hardware doesn't cache
1646          * non-present entries we only need to flush the write-buffer. If it
1647          * _does_ cache non-present entries, then it does so in the special
1648          * domain #0, which we have to flush:
1649          */
1650         if (cap_caching_mode(iommu->cap)) {
1651                 iommu->flush.flush_context(iommu, 0,
1652                                            (((u16)bus) << 8) | devfn,
1653                                            DMA_CCMD_MASK_NOBIT,
1654                                            DMA_CCMD_DEVICE_INVL);
1655                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1656         } else {
1657                 iommu_flush_write_buffer(iommu);
1658         }
1659         iommu_enable_dev_iotlb(info);
1660         spin_unlock_irqrestore(&iommu->lock, flags);
1661
1662         spin_lock_irqsave(&domain->iommu_lock, flags);
1663         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1664                 domain->iommu_count++;
1665                 if (domain->iommu_count == 1)
1666                         domain->nid = iommu->node;
1667                 domain_update_iommu_cap(domain);
1668         }
1669         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1670         return 0;
1671 }
1672
1673 static int
1674 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1675                         int translation)
1676 {
1677         int ret;
1678         struct pci_dev *tmp, *parent;
1679
1680         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1681                                          pdev->bus->number, pdev->devfn,
1682                                          translation);
1683         if (ret)
1684                 return ret;
1685
1686         /* dependent device mapping */
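        /*
         * Descriptive note (added): devices behind a PCIe-to-PCI/PCI-X bridge
         * may have their DMA tagged with the bridge's source-id, so each
         * bridge on the path up to the topmost PCIe bridge gets the same
         * context mapping as the device itself.
         */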
1687         tmp = pci_find_upstream_pcie_bridge(pdev);
1688         if (!tmp)
1689                 return 0;
1690         /* Secondary interface's bus number and devfn 0 */
1691         parent = pdev->bus->self;
1692         while (parent != tmp) {
1693                 ret = domain_context_mapping_one(domain,
1694                                                  pci_domain_nr(parent->bus),
1695                                                  parent->bus->number,
1696                                                  parent->devfn, translation);
1697                 if (ret)
1698                         return ret;
1699                 parent = parent->bus->self;
1700         }
1701         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1702                 return domain_context_mapping_one(domain,
1703                                         pci_domain_nr(tmp->subordinate),
1704                                         tmp->subordinate->number, 0,
1705                                         translation);
1706         else /* this is a legacy PCI bridge */
1707                 return domain_context_mapping_one(domain,
1708                                                   pci_domain_nr(tmp->bus),
1709                                                   tmp->bus->number,
1710                                                   tmp->devfn,
1711                                                   translation);
1712 }
1713
1714 static int domain_context_mapped(struct pci_dev *pdev)
1715 {
1716         int ret;
1717         struct pci_dev *tmp, *parent;
1718         struct intel_iommu *iommu;
1719
1720         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1721                                 pdev->devfn);
1722         if (!iommu)
1723                 return -ENODEV;
1724
1725         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1726         if (!ret)
1727                 return ret;
1728         /* dependent device mapping */
1729         tmp = pci_find_upstream_pcie_bridge(pdev);
1730         if (!tmp)
1731                 return ret;
1732         /* Secondary interface's bus number and devfn 0 */
1733         parent = pdev->bus->self;
1734         while (parent != tmp) {
1735                 ret = device_context_mapped(iommu, parent->bus->number,
1736                                             parent->devfn);
1737                 if (!ret)
1738                         return ret;
1739                 parent = parent->bus->self;
1740         }
1741         if (pci_is_pcie(tmp))
1742                 return device_context_mapped(iommu, tmp->subordinate->number,
1743                                              0);
1744         else
1745                 return device_context_mapped(iommu, tmp->bus->number,
1746                                              tmp->devfn);
1747 }
1748
1749 /* Returns a number of VTD pages, but aligned to MM page size */
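/*
 * Illustrative example (assuming 4KiB MM pages, so PAGE_SHIFT ==
 * VTD_PAGE_SHIFT): host_addr = 0x1234, size = 0x2000 keeps the page offset
 * 0x234, and PAGE_ALIGN(0x234 + 0x2000) = 0x3000, i.e. 3 pages.
 */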
1750 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1751                                             size_t size)
1752 {
1753         host_addr &= ~PAGE_MASK;
1754         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1755 }
1756
1757 /* Return largest possible superpage level for a given mapping */
1758 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1759                                           unsigned long iov_pfn,
1760                                           unsigned long phy_pfn,
1761                                           unsigned long pages)
1762 {
1763         int support, level = 1;
1764         unsigned long pfnmerge;
1765
1766         support = domain->iommu_superpage;
1767
1768         /* To use a large page, the virtual *and* physical addresses
1769            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1770            of them will mean we have to use smaller pages. So just
1771            merge them and check both at once. */
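        /*
         * Illustrative example: iov_pfn = 0x4000, phy_pfn = 0x2000,
         * pages = 0x400 (4MiB).  pfnmerge = 0x6000 has its low 9 bits
         * clear, so the first iteration succeeds (level 2 = 2MiB
         * superpages); after shifting, pfnmerge = 0x30 is unaligned, so
         * we stop there even if 1GiB pages are also supported.
         */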
1772         pfnmerge = iov_pfn | phy_pfn;
1773
1774         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1775                 pages >>= VTD_STRIDE_SHIFT;
1776                 if (!pages)
1777                         break;
1778                 pfnmerge >>= VTD_STRIDE_SHIFT;
1779                 level++;
1780                 support--;
1781         }
1782         return level;
1783 }
1784
1785 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1786                             struct scatterlist *sg, unsigned long phys_pfn,
1787                             unsigned long nr_pages, int prot)
1788 {
1789         struct dma_pte *first_pte = NULL, *pte = NULL;
1790         phys_addr_t uninitialized_var(pteval);
1791         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1792         unsigned long sg_res;
1793         unsigned int largepage_lvl = 0;
1794         unsigned long lvl_pages = 0;
1795
1796         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1797
1798         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1799                 return -EINVAL;
1800
1801         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1802
1803         if (sg)
1804                 sg_res = 0;
1805         else {
1806                 sg_res = nr_pages + 1;
1807                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1808         }
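        /*
         * Note (added): in the linear (non-sg) case sg_res is nr_pages + 1,
         * so the "if (!sg_res)" refill below can never trigger and sg is
         * never dereferenced.
         */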
1809
1810         while (nr_pages > 0) {
1811                 uint64_t tmp;
1812
1813                 if (!sg_res) {
1814                         sg_res = aligned_nrpages(sg->offset, sg->length);
1815                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1816                         sg->dma_length = sg->length;
1817                         pteval = page_to_phys(sg_page(sg)) | prot;
1818                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1819                 }
1820
1821                 if (!pte) {
1822                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1823
1824                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1825                         if (!pte)
1826                                 return -ENOMEM;
1827                         /* It is a large page */
1828                         if (largepage_lvl > 1)
1829                                 pteval |= DMA_PTE_LARGE_PAGE;
1830                         else
1831                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1832
1833                 }
1834                 /* We don't need a lock here; nobody else
1835                  * touches this iova range.
1836                  */
1837                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1838                 if (tmp) {
1839                         static int dumps = 5;
1840                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1841                                iov_pfn, tmp, (unsigned long long)pteval);
1842                         if (dumps) {
1843                                 dumps--;
1844                                 debug_dma_dump_mappings(NULL);
1845                         }
1846                         WARN_ON(1);
1847                 }
1848
1849                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1850
1851                 BUG_ON(nr_pages < lvl_pages);
1852                 BUG_ON(sg_res < lvl_pages);
1853
1854                 nr_pages -= lvl_pages;
1855                 iov_pfn += lvl_pages;
1856                 phys_pfn += lvl_pages;
1857                 pteval += lvl_pages * VTD_PAGE_SIZE;
1858                 sg_res -= lvl_pages;
1859
1860                 /* If the next PTE would be the first in a new page, then we
1861                    need to flush the cache on the entries we've just written.
1862                    And then we'll need to recalculate 'pte', so clear it and
1863                    let it get set again in the if (!pte) block above.
1864
1865                    If we're done (!nr_pages) we need to flush the cache too.
1866
1867                    Also if we've been setting superpages, we may need to
1868                    recalculate 'pte' and switch back to smaller pages for the
1869                    end of the mapping, if the trailing size is not enough to
1870                    use another superpage (i.e. sg_res < lvl_pages). */
1871                 pte++;
1872                 if (!nr_pages || first_pte_in_page(pte) ||
1873                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1874                         domain_flush_cache(domain, first_pte,
1875                                            (void *)pte - (void *)first_pte);
1876                         pte = NULL;
1877                 }
1878
1879                 if (!sg_res && nr_pages)
1880                         sg = sg_next(sg);
1881         }
1882         return 0;
1883 }
1884
1885 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1886                                     struct scatterlist *sg, unsigned long nr_pages,
1887                                     int prot)
1888 {
1889         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1890 }
1891
1892 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1893                                      unsigned long phys_pfn, unsigned long nr_pages,
1894                                      int prot)
1895 {
1896         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1897 }
1898
1899 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1900 {
1901         if (!iommu)
1902                 return;
1903
1904         clear_context_table(iommu, bus, devfn);
1905         iommu->flush.flush_context(iommu, 0, 0, 0,
1906                                            DMA_CCMD_GLOBAL_INVL);
1907         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1908 }
1909
1910 static void domain_remove_dev_info(struct dmar_domain *domain)
1911 {
1912         struct device_domain_info *info;
1913         unsigned long flags;
1914         struct intel_iommu *iommu;
1915
1916         spin_lock_irqsave(&device_domain_lock, flags);
1917         while (!list_empty(&domain->devices)) {
1918                 info = list_entry(domain->devices.next,
1919                         struct device_domain_info, link);
1920                 list_del(&info->link);
1921                 list_del(&info->global);
1922                 if (info->dev)
1923                         info->dev->dev.archdata.iommu = NULL;
1924                 spin_unlock_irqrestore(&device_domain_lock, flags);
1925
1926                 iommu_disable_dev_iotlb(info);
1927                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1928                 iommu_detach_dev(iommu, info->bus, info->devfn);
1929                 free_devinfo_mem(info);
1930
1931                 spin_lock_irqsave(&device_domain_lock, flags);
1932         }
1933         spin_unlock_irqrestore(&device_domain_lock, flags);
1934 }
1935
1936 /*
1937  * find_domain
1938  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1939  */
1940 static struct dmar_domain *
1941 find_domain(struct pci_dev *pdev)
1942 {
1943         struct device_domain_info *info;
1944
1945         /* No lock here, assumes no domain exit in normal case */
1946         info = pdev->dev.archdata.iommu;
1947         if (info)
1948                 return info->domain;
1949         return NULL;
1950 }
1951
1952 /* domain is initialized */
1953 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1954 {
1955         struct dmar_domain *domain, *found = NULL;
1956         struct intel_iommu *iommu;
1957         struct dmar_drhd_unit *drhd;
1958         struct device_domain_info *info, *tmp;
1959         struct pci_dev *dev_tmp;
1960         unsigned long flags;
1961         int bus = 0, devfn = 0;
1962         int segment;
1963         int ret;
1964
1965         domain = find_domain(pdev);
1966         if (domain)
1967                 return domain;
1968
1969         segment = pci_domain_nr(pdev->bus);
1970
1971         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1972         if (dev_tmp) {
1973                 if (pci_is_pcie(dev_tmp)) {
1974                         bus = dev_tmp->subordinate->number;
1975                         devfn = 0;
1976                 } else {
1977                         bus = dev_tmp->bus->number;
1978                         devfn = dev_tmp->devfn;
1979                 }
1980                 spin_lock_irqsave(&device_domain_lock, flags);
1981                 list_for_each_entry(info, &device_domain_list, global) {
1982                         if (info->segment == segment &&
1983                             info->bus == bus && info->devfn == devfn) {
1984                                 found = info->domain;
1985                                 break;
1986                         }
1987                 }
1988                 spin_unlock_irqrestore(&device_domain_lock, flags);
1989                 /* pcie-pci bridge already has a domain, use it */
1990                 if (found) {
1991                         domain = found;
1992                         goto found_domain;
1993                 }
1994         }
1995
1996         domain = alloc_domain();
1997         if (!domain)
1998                 goto error;
1999
2000         /* Allocate new domain for the device */
2001         drhd = dmar_find_matched_drhd_unit(pdev);
2002         if (!drhd) {
2003                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2004                         pci_name(pdev));
2005                 return NULL;
2006         }
2007         iommu = drhd->iommu;
2008
2009         ret = iommu_attach_domain(domain, iommu);
2010         if (ret) {
2011                 free_domain_mem(domain);
2012                 goto error;
2013         }
2014
2015         if (domain_init(domain, gaw)) {
2016                 domain_exit(domain);
2017                 goto error;
2018         }
2019
2020         /* register pcie-to-pci device */
2021         if (dev_tmp) {
2022                 info = alloc_devinfo_mem();
2023                 if (!info) {
2024                         domain_exit(domain);
2025                         goto error;
2026                 }
2027                 info->segment = segment;
2028                 info->bus = bus;
2029                 info->devfn = devfn;
2030                 info->dev = NULL;
2031                 info->domain = domain;
2032                 /* This domain is shared by devices under p2p bridge */
2033                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2034
2035                 /* pcie-to-pci bridge already has a domain, use it */
2036                 found = NULL;
2037                 spin_lock_irqsave(&device_domain_lock, flags);
2038                 list_for_each_entry(tmp, &device_domain_list, global) {
2039                         if (tmp->segment == segment &&
2040                             tmp->bus == bus && tmp->devfn == devfn) {
2041                                 found = tmp->domain;
2042                                 break;
2043                         }
2044                 }
2045                 if (found) {
2046                         spin_unlock_irqrestore(&device_domain_lock, flags);
2047                         free_devinfo_mem(info);
2048                         domain_exit(domain);
2049                         domain = found;
2050                 } else {
2051                         list_add(&info->link, &domain->devices);
2052                         list_add(&info->global, &device_domain_list);
2053                         spin_unlock_irqrestore(&device_domain_lock, flags);
2054                 }
2055         }
2056
2057 found_domain:
2058         info = alloc_devinfo_mem();
2059         if (!info)
2060                 goto error;
2061         info->segment = segment;
2062         info->bus = pdev->bus->number;
2063         info->devfn = pdev->devfn;
2064         info->dev = pdev;
2065         info->domain = domain;
2066         spin_lock_irqsave(&device_domain_lock, flags);
2067         /* somebody is fast */
2068         found = find_domain(pdev);
2069         if (found != NULL) {
2070                 spin_unlock_irqrestore(&device_domain_lock, flags);
2071                 if (found != domain) {
2072                         domain_exit(domain);
2073                         domain = found;
2074                 }
2075                 free_devinfo_mem(info);
2076                 return domain;
2077         }
2078         list_add(&info->link, &domain->devices);
2079         list_add(&info->global, &device_domain_list);
2080         pdev->dev.archdata.iommu = info;
2081         spin_unlock_irqrestore(&device_domain_lock, flags);
2082         return domain;
2083 error:
2084         /* recheck it here, maybe others set it */
2085         return find_domain(pdev);
2086 }
2087
2088 static int iommu_identity_mapping;
2089 #define IDENTMAP_ALL            1
2090 #define IDENTMAP_GFX            2
2091 #define IDENTMAP_AZALIA         4
2092
2093 static int iommu_domain_identity_map(struct dmar_domain *domain,
2094                                      unsigned long long start,
2095                                      unsigned long long end)
2096 {
2097         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2098         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2099
2100         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2101                           dma_to_mm_pfn(last_vpfn))) {
2102                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2103                 return -ENOMEM;
2104         }
2105
2106         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2107                  start, end, domain->id);
2108         /*
2109          * RMRR range might have overlap with physical memory range,
2110          * clear it first
2111          */
2112         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2113
2114         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2115                                   last_vpfn - first_vpfn + 1,
2116                                   DMA_PTE_READ|DMA_PTE_WRITE);
2117 }
2118
2119 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2120                                       unsigned long long start,
2121                                       unsigned long long end)
2122 {
2123         struct dmar_domain *domain;
2124         int ret;
2125
2126         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2127         if (!domain)
2128                 return -ENOMEM;
2129
2130         /* For _hardware_ passthrough, don't bother. But for software
2131            passthrough, we do it anyway -- it may indicate a memory
2132            range which is reserved in E820, and so didn't get set
2133            up to start with in si_domain */
2134         if (domain == si_domain && hw_pass_through) {
2135                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2136                        pci_name(pdev), start, end);
2137                 return 0;
2138         }
2139
2140         printk(KERN_INFO
2141                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2142                pci_name(pdev), start, end);
2143         
2144         if (end < start) {
2145                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2146                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2147                         dmi_get_system_info(DMI_BIOS_VENDOR),
2148                         dmi_get_system_info(DMI_BIOS_VERSION),
2149                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2150                 ret = -EIO;
2151                 goto error;
2152         }
2153
2154         if (end >> agaw_to_width(domain->agaw)) {
2155                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2156                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2157                      agaw_to_width(domain->agaw),
2158                      dmi_get_system_info(DMI_BIOS_VENDOR),
2159                      dmi_get_system_info(DMI_BIOS_VERSION),
2160                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2161                 ret = -EIO;
2162                 goto error;
2163         }
2164
2165         ret = iommu_domain_identity_map(domain, start, end);
2166         if (ret)
2167                 goto error;
2168
2169         /* context entry init */
2170         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2171         if (ret)
2172                 goto error;
2173
2174         return 0;
2175
2176  error:
2177         domain_exit(domain);
2178         return ret;
2179 }
2180
2181 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2182         struct pci_dev *pdev)
2183 {
2184         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2185                 return 0;
2186         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2187                 rmrr->end_address);
2188 }
2189
2190 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2191 static inline void iommu_prepare_isa(void)
2192 {
2193         struct pci_dev *pdev;
2194         int ret;
2195
2196         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2197         if (!pdev)
2198                 return;
2199
2200         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2201         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2202
2203         if (ret)
2204                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2205                        "floppy might not work\n");
2206
2207 }
2208 #else
2209 static inline void iommu_prepare_isa(void)
2210 {
2211         return;
2212 }
2213 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2214
2215 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2216
2217 static int __init si_domain_init(int hw)
2218 {
2219         struct dmar_drhd_unit *drhd;
2220         struct intel_iommu *iommu;
2221         int nid, ret = 0;
2222
2223         si_domain = alloc_domain();
2224         if (!si_domain)
2225                 return -EFAULT;
2226
2227         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2228
2229         for_each_active_iommu(iommu, drhd) {
2230                 ret = iommu_attach_domain(si_domain, iommu);
2231                 if (ret) {
2232                         domain_exit(si_domain);
2233                         return -EFAULT;
2234                 }
2235         }
2236
2237         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2238                 domain_exit(si_domain);
2239                 return -EFAULT;
2240         }
2241
2242         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2243
2244         if (hw)
2245                 return 0;
2246
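        /*
         * Descriptive note (added): hardware pass-through needs no page
         * tables, so we are done above.  For software identity mapping,
         * map every usable RAM range on every node 1:1 into si_domain below.
         */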
2247         for_each_online_node(nid) {
2248                 unsigned long start_pfn, end_pfn;
2249                 int i;
2250
2251                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2252                         ret = iommu_domain_identity_map(si_domain,
2253                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2254                         if (ret)
2255                                 return ret;
2256                 }
2257         }
2258
2259         return 0;
2260 }
2261
2262 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2263                                           struct pci_dev *pdev);
2264 static int identity_mapping(struct pci_dev *pdev)
2265 {
2266         struct device_domain_info *info;
2267
2268         if (likely(!iommu_identity_mapping))
2269                 return 0;
2270
2271         info = pdev->dev.archdata.iommu;
2272         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2273                 return (info->domain == si_domain);
2274
2275         return 0;
2276 }
2277
2278 static int domain_add_dev_info(struct dmar_domain *domain,
2279                                struct pci_dev *pdev,
2280                                int translation)
2281 {
2282         struct device_domain_info *info;
2283         unsigned long flags;
2284         int ret;
2285
2286         info = alloc_devinfo_mem();
2287         if (!info)
2288                 return -ENOMEM;
2289
2290         info->segment = pci_domain_nr(pdev->bus);
2291         info->bus = pdev->bus->number;
2292         info->devfn = pdev->devfn;
2293         info->dev = pdev;
2294         info->domain = domain;
2295
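        /*
         * Descriptive note (added): the new device_domain_info is added to
         * the device lists before setting up the context mapping, so that
         * code running under domain_context_mapping() (e.g.
         * iommu_support_dev_iotlb()) can already find it; it is torn down
         * again below if mapping fails.
         */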
2296         spin_lock_irqsave(&device_domain_lock, flags);
2297         list_add(&info->link, &domain->devices);
2298         list_add(&info->global, &device_domain_list);
2299         pdev->dev.archdata.iommu = info;
2300         spin_unlock_irqrestore(&device_domain_lock, flags);
2301
2302         ret = domain_context_mapping(domain, pdev, translation);
2303         if (ret) {
2304                 spin_lock_irqsave(&device_domain_lock, flags);
2305                 list_del(&info->link);
2306                 list_del(&info->global);
2307                 pdev->dev.archdata.iommu = NULL;
2308                 spin_unlock_irqrestore(&device_domain_lock, flags);
2309                 free_devinfo_mem(info);
2310                 return ret;
2311         }
2312
2313         return 0;
2314 }
2315
2316 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2317 {
2318         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2319                 return 1;
2320
2321         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2322                 return 1;
2323
2324         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2325                 return 0;
2326
2327         /*
2328          * We want to start off with all devices in the 1:1 domain, and
2329          * take them out later if we find they can't access all of memory.
2330          *
2331          * However, we can't do this for PCI devices behind bridges,
2332          * because all PCI devices behind the same bridge will end up
2333          * with the same source-id on their transactions.
2334          *
2335          * Practically speaking, we can't change things around for these
2336          * devices at run-time, because we can't be sure there'll be no
2337          * DMA transactions in flight for any of their siblings.
2338          * 
2339          * So PCI devices (unless they're on the root bus) as well as
2340          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2341          * the 1:1 domain, just in _case_ one of their siblings turns out
2342          * not to be able to map all of memory.
2343          */
2344         if (!pci_is_pcie(pdev)) {
2345                 if (!pci_is_root_bus(pdev->bus))
2346                         return 0;
2347                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2348                         return 0;
2349         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2350                 return 0;
2351
2352         /* 
2353          * At boot time, we don't yet know if devices will be 64-bit capable.
2354          * Assume that they will -- if they turn out not to be, then we can 
2355          * take them out of the 1:1 domain later.
2356          */
2357         if (!startup) {
2358                 /*
2359                  * If the device's dma_mask is less than the system's memory
2360                  * size then this is not a candidate for identity mapping.
2361                  */
2362                 u64 dma_mask = pdev->dma_mask;
2363
2364                 if (pdev->dev.coherent_dma_mask &&
2365                     pdev->dev.coherent_dma_mask < dma_mask)
2366                         dma_mask = pdev->dev.coherent_dma_mask;
2367
2368                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2369         }
2370
2371         return 1;
2372 }
2373
2374 static int __init iommu_prepare_static_identity_mapping(int hw)
2375 {
2376         struct pci_dev *pdev = NULL;
2377         int ret;
2378
2379         ret = si_domain_init(hw);
2380         if (ret)
2381                 return -EFAULT;
2382
2383         for_each_pci_dev(pdev) {
2384                 if (iommu_should_identity_map(pdev, 1)) {
2385                         ret = domain_add_dev_info(si_domain, pdev,
2386                                              hw ? CONTEXT_TT_PASS_THROUGH :
2387                                                   CONTEXT_TT_MULTI_LEVEL);
2388                         if (ret) {
2389                                 /* device not associated with an iommu */
2390                                 if (ret == -ENODEV)
2391                                         continue;
2392                                 return ret;
2393                         }
2394                         pr_info("IOMMU: %s identity mapping for device %s\n",
2395                                 hw ? "hardware" : "software", pci_name(pdev));
2396                 }
2397         }
2398
2399         return 0;
2400 }
2401
2402 static int __init init_dmars(void)
2403 {
2404         struct dmar_drhd_unit *drhd;
2405         struct dmar_rmrr_unit *rmrr;
2406         struct pci_dev *pdev;
2407         struct intel_iommu *iommu;
2408         int i, ret;
2409
2410         /*
2411          * for each drhd
2412          *    allocate root
2413          *    initialize and program root entry to not present
2414          * endfor
2415          */
2416         for_each_drhd_unit(drhd) {
2417                 /*
2418                  * lock not needed as this is only incremented in the single-
2419                  * threaded kernel __init code path; all other accesses are
2420                  * read-only
2421                  */
2422                 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2423                         g_num_of_iommus++;
2424                         continue;
2425                 }
2426                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2427                           IOMMU_UNITS_SUPPORTED);
2428         }
2429
2430         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2431                         GFP_KERNEL);
2432         if (!g_iommus) {
2433                 printk(KERN_ERR "Allocating global iommu array failed\n");
2434                 ret = -ENOMEM;
2435                 goto error;
2436         }
2437
2438         deferred_flush = kzalloc(g_num_of_iommus *
2439                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2440         if (!deferred_flush) {
2441                 ret = -ENOMEM;
2442                 goto error;
2443         }
2444
2445         for_each_drhd_unit(drhd) {
2446                 if (drhd->ignored)
2447                         continue;
2448
2449                 iommu = drhd->iommu;
2450                 g_iommus[iommu->seq_id] = iommu;
2451
2452                 ret = iommu_init_domains(iommu);
2453                 if (ret)
2454                         goto error;
2455
2456                 /*
2457                  * TBD:
2458                  * we could share the same root & context tables
2459                  * among all IOMMUs; need to split it later.
2460                  */
2461                 ret = iommu_alloc_root_entry(iommu);
2462                 if (ret) {
2463                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2464                         goto error;
2465                 }
2466                 if (!ecap_pass_through(iommu->ecap))
2467                         hw_pass_through = 0;
2468         }
2469
2470         /*
2471          * Start from a sane iommu hardware state.
2472          */
2473         for_each_drhd_unit(drhd) {
2474                 if (drhd->ignored)
2475                         continue;
2476
2477                 iommu = drhd->iommu;
2478
2479                 /*
2480                  * If the queued invalidation is already initialized by us
2481                  * (for example, while enabling interrupt-remapping) then
2482                  * things are already rolling from a sane state.
2483                  */
2484                 if (iommu->qi)
2485                         continue;
2486
2487                 /*
2488                  * Clear any previous faults.
2489                  */
2490                 dmar_fault(-1, iommu);
2491                 /*
2492                  * Disable queued invalidation if supported and already enabled
2493                  * before OS handover.
2494                  */
2495                 dmar_disable_qi(iommu);
2496         }
2497
2498         for_each_drhd_unit(drhd) {
2499                 if (drhd->ignored)
2500                         continue;
2501
2502                 iommu = drhd->iommu;
2503
2504                 if (dmar_enable_qi(iommu)) {
2505                         /*
2506                          * Queued Invalidate not enabled, use Register Based
2507                          * Invalidate
2508                          */
2509                         iommu->flush.flush_context = __iommu_flush_context;
2510                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2511                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2512                                "invalidation\n",
2513                                 iommu->seq_id,
2514                                (unsigned long long)drhd->reg_base_addr);
2515                 } else {
2516                         iommu->flush.flush_context = qi_flush_context;
2517                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2518                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2519                                "invalidation\n",
2520                                 iommu->seq_id,
2521                                (unsigned long long)drhd->reg_base_addr);
2522                 }
2523         }
2524
2525         if (iommu_pass_through)
2526                 iommu_identity_mapping |= IDENTMAP_ALL;
2527
2528 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2529         iommu_identity_mapping |= IDENTMAP_GFX;
2530 #endif
2531
2532         check_tylersburg_isoch();
2533
2534         /*
2535          * If pass-through is not set or not enabled, set up context entries
2536          * for identity mappings of rmrr, gfx and isa, possibly falling back
2537          * to static identity mapping if iommu_identity_mapping is set.
2538          */
2539         if (iommu_identity_mapping) {
2540                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2541                 if (ret) {
2542                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2543                         goto error;
2544                 }
2545         }
2546         /*
2547          * For each rmrr
2548          *   for each dev attached to rmrr
2549          *   do
2550          *     locate drhd for dev, alloc domain for dev
2551          *     allocate free domain
2552          *     allocate page table entries for rmrr
2553          *     if context not allocated for bus
2554          *           allocate and init context
2555          *           set present in root table for this bus
2556          *     init context with domain, translation etc
2557          *    endfor
2558          * endfor
2559          */
2560         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2561         for_each_rmrr_units(rmrr) {
2562                 for (i = 0; i < rmrr->devices_cnt; i++) {
2563                         pdev = rmrr->devices[i];
2564                         /*
2565                          * some BIOSes list non-existent devices in the
2566                          * DMAR table.
2567                          */
2568                         if (!pdev)
2569                                 continue;
2570                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2571                         if (ret)
2572                                 printk(KERN_ERR
2573                                        "IOMMU: mapping reserved region failed\n");
2574                 }
2575         }
2576
2577         iommu_prepare_isa();
2578
2579         /*
2580          * for each drhd
2581          *   enable fault log
2582          *   global invalidate context cache
2583          *   global invalidate iotlb
2584          *   enable translation
2585          */
2586         for_each_drhd_unit(drhd) {
2587                 if (drhd->ignored) {
2588                         /*
2589                          * we always have to disable PMRs or DMA may fail on
2590                          * this device
2591                          */
2592                         if (force_on)
2593                                 iommu_disable_protect_mem_regions(drhd->iommu);
2594                         continue;
2595                 }
2596                 iommu = drhd->iommu;
2597
2598                 iommu_flush_write_buffer(iommu);
2599
2600                 ret = dmar_set_interrupt(iommu);
2601                 if (ret)
2602                         goto error;
2603
2604                 iommu_set_root_entry(iommu);
2605
2606                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2607                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2608
2609                 ret = iommu_enable_translation(iommu);
2610                 if (ret)
2611                         goto error;
2612
2613                 iommu_disable_protect_mem_regions(iommu);
2614         }
2615
2616         return 0;
2617 error:
2618         for_each_drhd_unit(drhd) {
2619                 if (drhd->ignored)
2620                         continue;
2621                 iommu = drhd->iommu;
2622                 free_iommu(iommu);
2623         }
2624         kfree(g_iommus);
2625         return ret;
2626 }
2627
2628 /* This takes a number of _MM_ pages, not VTD pages */
2629 static struct iova *intel_alloc_iova(struct device *dev,
2630                                      struct dmar_domain *domain,
2631                                      unsigned long nrpages, uint64_t dma_mask)
2632 {
2633         struct pci_dev *pdev = to_pci_dev(dev);
2634         struct iova *iova = NULL;
2635
2636         /* Restrict dma_mask to the width that the iommu can handle */
2637         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2638
2639         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2640                 /*
2641                  * First try to allocate an io virtual address in
2642                  * DMA_BIT_MASK(32) and if that fails then try allocating
2643                  * from higher range
2644                  */
2645                 iova = alloc_iova(&domain->iovad, nrpages,
2646                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2647                 if (iova)
2648                         return iova;
2649         }
2650         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2651         if (unlikely(!iova)) {
2652                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2653                        nrpages, pci_name(pdev));
2654                 return NULL;
2655         }
2656
2657         return iova;
2658 }
2659
2660 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2661 {
2662         struct dmar_domain *domain;
2663         int ret;
2664
2665         domain = get_domain_for_dev(pdev,
2666                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2667         if (!domain) {
2668                 printk(KERN_ERR
2669                         "Allocating domain for %s failed", pci_name(pdev));
2670                 return NULL;
2671         }
2672
2673         /* make sure context mapping is ok */
2674         if (unlikely(!domain_context_mapped(pdev))) {
2675                 ret = domain_context_mapping(domain, pdev,
2676                                              CONTEXT_TT_MULTI_LEVEL);
2677                 if (ret) {
2678                         printk(KERN_ERR
2679                                 "Domain context map for %s failed",
2680                                 pci_name(pdev));
2681                         return NULL;
2682                 }
2683         }
2684
2685         return domain;
2686 }
2687
2688 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2689 {
2690         struct device_domain_info *info;
2691
2692         /* No lock here, assumes no domain exit in normal case */
2693         info = dev->dev.archdata.iommu;
2694         if (likely(info))
2695                 return info->domain;
2696
2697         return __get_valid_domain_for_dev(dev);
2698 }
2699
2700 static int iommu_dummy(struct pci_dev *pdev)
2701 {
2702         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2703 }
2704
2705 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2706 static int iommu_no_mapping(struct device *dev)
2707 {
2708         struct pci_dev *pdev;
2709         int found;
2710
2711         if (unlikely(dev->bus != &pci_bus_type))
2712                 return 1;
2713
2714         pdev = to_pci_dev(dev);
2715         if (iommu_dummy(pdev))
2716                 return 1;
2717
2718         if (!iommu_identity_mapping)
2719                 return 0;
2720
2721         found = identity_mapping(pdev);
2722         if (found) {
2723                 if (iommu_should_identity_map(pdev, 0))
2724                         return 1;
2725                 else {
2726                         /*
2727                          * The 32 bit DMA device is removed from si_domain
2728                          * and falls back to non-identity mapping.
2729                          */
2730                         domain_remove_one_dev_info(si_domain, pdev);
2731                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2732                                pci_name(pdev));
2733                         return 0;
2734                 }
2735         } else {
2736                 /*
2737                  * In case a 64 bit DMA device was detached from a VM, the device
2738                  * is put back into si_domain for identity mapping.
2739                  */
2740                 if (iommu_should_identity_map(pdev, 0)) {
2741                         int ret;
2742                         ret = domain_add_dev_info(si_domain, pdev,
2743                                                   hw_pass_through ?
2744                                                   CONTEXT_TT_PASS_THROUGH :
2745                                                   CONTEXT_TT_MULTI_LEVEL);
2746                         if (!ret) {
2747                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2748                                        pci_name(pdev));
2749                                 return 1;
2750                         }
2751                 }
2752         }
2753
2754         return 0;
2755 }
2756
2757 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2758                                      size_t size, int dir, u64 dma_mask)
2759 {
2760         struct pci_dev *pdev = to_pci_dev(hwdev);
2761         struct dmar_domain *domain;
2762         phys_addr_t start_paddr;
2763         struct iova *iova;
2764         int prot = 0;
2765         int ret;
2766         struct intel_iommu *iommu;
2767         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2768
2769         BUG_ON(dir == DMA_NONE);
2770
2771         if (iommu_no_mapping(hwdev))
2772                 return paddr;
2773
2774         domain = get_valid_domain_for_dev(pdev);
2775         if (!domain)
2776                 return 0;
2777
2778         iommu = domain_get_iommu(domain);
2779         size = aligned_nrpages(paddr, size);
2780
2781         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2782         if (!iova)
2783                 goto error;
2784
2785         /*
2786          * Check if DMAR supports zero-length reads on write-only
2787          * mappings.
2788          */
2789         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2790                         !cap_zlr(iommu->cap))
2791                 prot |= DMA_PTE_READ;
2792         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2793                 prot |= DMA_PTE_WRITE;
2794         /*
2795          * paddr - (paddr + size) might be a partial page; we should map the
2796          * whole page.  Note: if two parts of one page are separately mapped,
2797          * we might have two guest_addr mappings to the same host paddr, but
2798          * this is not a big problem.
2799          */
2800         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2801                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2802         if (ret)
2803                 goto error;
2804
2805         /* it's a non-present to present mapping. Only flush if caching mode */
2806         if (cap_caching_mode(iommu->cap))
2807                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2808         else
2809                 iommu_flush_write_buffer(iommu);
2810
2811         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2812         start_paddr += paddr & ~PAGE_MASK;
2813         return start_paddr;
2814
2815 error:
2816         if (iova)
2817                 __free_iova(&domain->iovad, iova);
2818         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2819                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2820         return 0;
2821 }
2822
2823 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2824                                  unsigned long offset, size_t size,
2825                                  enum dma_data_direction dir,
2826                                  struct dma_attrs *attrs)
2827 {
2828         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2829                                   dir, to_pci_dev(dev)->dma_mask);
2830 }
2831
2832 static void flush_unmaps(void)
2833 {
2834         int i, j;
2835
2836         timer_on = 0;
2837
2838         /* just flush them all */
2839         for (i = 0; i < g_num_of_iommus; i++) {
2840                 struct intel_iommu *iommu = g_iommus[i];
2841                 if (!iommu)
2842                         continue;
2843
2844                 if (!deferred_flush[i].next)
2845                         continue;
2846
2847                 /* In caching mode, global flushes turn emulation expensive */
2848                 if (!cap_caching_mode(iommu->cap))
2849                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2850                                          DMA_TLB_GLOBAL_FLUSH);
2851                 for (j = 0; j < deferred_flush[i].next; j++) {
2852                         unsigned long mask;
2853                         struct iova *iova = deferred_flush[i].iova[j];
2854                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2855
2856                         /* On real hardware multiple invalidations are expensive */
2857                         if (cap_caching_mode(iommu->cap))
2858                                 iommu_flush_iotlb_psi(iommu, domain->id,
2859                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2860                         else {
2861                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2862                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2863                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2864                         }
2865                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2866                 }
2867                 deferred_flush[i].next = 0;
2868         }
2869
2870         list_size = 0;
2871 }
2872
2873 static void flush_unmaps_timeout(unsigned long data)
2874 {
2875         unsigned long flags;
2876
2877         spin_lock_irqsave(&async_umap_flush_lock, flags);
2878         flush_unmaps();
2879         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2880 }
2881
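/*
 * Descriptive note (added): lazy unmap path.  Instead of flushing the IOTLB
 * on every unmap, queue the iova on a per-iommu deferred list.  The queue is
 * drained when the total number of entries reaches HIGH_WATER_MARK or when
 * the 10ms unmap_timer fires.
 */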
2882 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2883 {
2884         unsigned long flags;
2885         int next, iommu_id;
2886         struct intel_iommu *iommu;
2887
2888         spin_lock_irqsave(&async_umap_flush_lock, flags);
2889         if (list_size == HIGH_WATER_MARK)
2890                 flush_unmaps();
2891
2892         iommu = domain_get_iommu(dom);
2893         iommu_id = iommu->seq_id;
2894
2895         next = deferred_flush[iommu_id].next;
2896         deferred_flush[iommu_id].domain[next] = dom;
2897         deferred_flush[iommu_id].iova[next] = iova;
2898         deferred_flush[iommu_id].next++;
2899
2900         if (!timer_on) {
2901                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2902                 timer_on = 1;
2903         }
2904         list_size++;
2905         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2906 }
2907
2908 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2909                              size_t size, enum dma_data_direction dir,
2910                              struct dma_attrs *attrs)
2911 {
2912         struct pci_dev *pdev = to_pci_dev(dev);
2913         struct dmar_domain *domain;
2914         unsigned long start_pfn, last_pfn;
2915         struct iova *iova;
2916         struct intel_iommu *iommu;
2917
2918         if (iommu_no_mapping(dev))
2919                 return;
2920
2921         domain = find_domain(pdev);
2922         BUG_ON(!domain);
2923
2924         iommu = domain_get_iommu(domain);
2925
2926         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2927         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2928                       (unsigned long long)dev_addr))
2929                 return;
2930
2931         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2932         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2933
2934         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2935                  pci_name(pdev), start_pfn, last_pfn);
2936
2937         /*  clear the whole page */
2938         dma_pte_clear_range(domain, start_pfn, last_pfn);
2939
2940         /* free page tables */
2941         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2942
2943         if (intel_iommu_strict) {
2944                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2945                                       last_pfn - start_pfn + 1, 0);
2946                 /* free iova */
2947                 __free_iova(&domain->iovad, iova);
2948         } else {
2949                 add_unmap(domain, iova);
2950                 /*
2951                  * queue up the release of the unmap to save the ~1/6th of the
2952                  * CPU time used up by the iotlb flush operation...
2953                  */
2954         }
2955 }
2956
2957 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2958                                   dma_addr_t *dma_handle, gfp_t flags,
2959                                   struct dma_attrs *attrs)
2960 {
2961         void *vaddr;
2962         int order;
2963
2964         size = PAGE_ALIGN(size);
2965         order = get_order(size);
2966
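        /*
         * Descriptive note (added): if the device is translated by the IOMMU,
         * any page can be remapped into its view, so no GFP_DMA/GFP_DMA32
         * restriction is needed; otherwise pick a zone that fits the device's
         * coherent DMA mask.
         */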
2967         if (!iommu_no_mapping(hwdev))
2968                 flags &= ~(GFP_DMA | GFP_DMA32);
2969         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2970                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2971                         flags |= GFP_DMA;
2972                 else
2973                         flags |= GFP_DMA32;
2974         }
2975
2976         vaddr = (void *)__get_free_pages(flags, order);
2977         if (!vaddr)
2978                 return NULL;
2979         memset(vaddr, 0, size);
2980
2981         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2982                                          DMA_BIDIRECTIONAL,
2983                                          hwdev->coherent_dma_mask);
2984         if (*dma_handle)
2985                 return vaddr;
2986         free_pages((unsigned long)vaddr, order);
2987         return NULL;
2988 }
2989
2990 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2991                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
2992 {
2993         int order;
2994
2995         size = PAGE_ALIGN(size);
2996         order = get_order(size);
2997
2998         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2999         free_pages((unsigned long)vaddr, order);
3000 }
3001
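             /*
              * Scatterlist counterpart of intel_unmap_page(): the whole list was
              * mapped into a single contiguous iova range, so one lookup keyed on
              * the first element's dma_address covers every entry.
              */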
3002 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3003                            int nelems, enum dma_data_direction dir,
3004                            struct dma_attrs *attrs)
3005 {
3006         struct pci_dev *pdev = to_pci_dev(hwdev);
3007         struct dmar_domain *domain;
3008         unsigned long start_pfn, last_pfn;
3009         struct iova *iova;
3010         struct intel_iommu *iommu;
3011
3012         if (iommu_no_mapping(hwdev))
3013                 return;
3014
3015         domain = find_domain(pdev);
3016         BUG_ON(!domain);
3017
3018         iommu = domain_get_iommu(domain);
3019
3020         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3021         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3022                       (unsigned long long)sglist[0].dma_address))
3023                 return;
3024
3025         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3026         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3027
3028         /*  clear the whole page */
3029         dma_pte_clear_range(domain, start_pfn, last_pfn);
3030
3031         /* free page tables */
3032         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3033
3034         if (intel_iommu_strict) {
3035                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3036                                       last_pfn - start_pfn + 1, 0);
3037                 /* free iova */
3038                 __free_iova(&domain->iovad, iova);
3039         } else {
3040                 add_unmap(domain, iova);
3041                 /*
3042                  * Queue up the release of the unmap to save the 1/6th of
3043                  * the CPU time used up by the IOTLB flush operation.
3044                  */
3045         }
3046 }
3047
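             /*
              * Identity "mapping" for devices that bypass the IOMMU: each
              * scatterlist entry simply gets its physical address as its DMA
              * address, with no translation entries set up.
              */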
3048 static int intel_nontranslate_map_sg(struct device *hwdev,
3049         struct scatterlist *sglist, int nelems, int dir)
3050 {
3051         int i;
3052         struct scatterlist *sg;
3053
3054         for_each_sg(sglist, sg, nelems, i) {
3055                 BUG_ON(!sg_page(sg));
3056                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3057                 sg->dma_length = sg->length;
3058         }
3059         return nelems;
3060 }
3061
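             /*
              * Map a scatterlist for DMA: allocate one iova range big enough for
              * all elements, install the PTEs with domain_sg_mapping(), and flush
              * the IOTLB only when caching mode requires it for a non-present to
              * present transition (otherwise a write-buffer flush suffices).
              */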
3062 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3063                         enum dma_data_direction dir, struct dma_attrs *attrs)
3064 {
3065         int i;
3066         struct pci_dev *pdev = to_pci_dev(hwdev);
3067         struct dmar_domain *domain;
3068         size_t size = 0;
3069         int prot = 0;
3070         struct iova *iova = NULL;
3071         int ret;
3072         struct scatterlist *sg;
3073         unsigned long start_vpfn;
3074         struct intel_iommu *iommu;
3075
3076         BUG_ON(dir == DMA_NONE);
3077         if (iommu_no_mapping(hwdev))
3078                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3079
3080         domain = get_valid_domain_for_dev(pdev);
3081         if (!domain)
3082                 return 0;
3083
3084         iommu = domain_get_iommu(domain);
3085
3086         for_each_sg(sglist, sg, nelems, i)
3087                 size += aligned_nrpages(sg->offset, sg->length);
3088
3089         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3090                                 pdev->dma_mask);
3091         if (!iova) {
3092                 sglist->dma_length = 0;
3093                 return 0;
3094         }
3095
3096         /*
3097          * Check if DMAR supports zero-length reads on write-only
3098          * mappings.
3099          */
3100         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3101                         !cap_zlr(iommu->cap))
3102                 prot |= DMA_PTE_READ;
3103         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3104                 prot |= DMA_PTE_WRITE;
3105
3106         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3107
3108         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3109         if (unlikely(ret)) {
3110                 /*  clear the page */
3111                 dma_pte_clear_range(domain, start_vpfn,
3112                                     start_vpfn + size - 1);
3113                 /* free page tables */
3114                 dma_pte_free_pagetable(domain, start_vpfn,
3115                                        start_vpfn + size - 1);
3116                 /* free iova */
3117                 __free_iova(&domain->iovad, iova);
3118                 return 0;
3119         }
3120
3121         /* it's a non-present to present mapping. Only flush if caching mode */
3122         if (cap_caching_mode(iommu->cap))
3123                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3124         else
3125                 iommu_flush_write_buffer(iommu);
3126
3127         return nelems;
3128 }
3129
3130 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3131 {
3132         return !dma_addr;
3133 }
3134
3135 struct dma_map_ops intel_dma_ops = {
3136         .alloc = intel_alloc_coherent,
3137         .free = intel_free_coherent,
3138         .map_sg = intel_map_sg,
3139         .unmap_sg = intel_unmap_sg,
3140         .map_page = intel_map_page,
3141         .unmap_page = intel_unmap_page,
3142         .mapping_error = intel_mapping_error,
3143 };
3144
3145 static inline int iommu_domain_cache_init(void)
3146 {
3147         int ret = 0;
3148
3149         iommu_domain_cache = kmem_cache_create("iommu_domain",
3150                                          sizeof(struct dmar_domain),
3151                                          0,
3152                                          SLAB_HWCACHE_ALIGN,
3153                                          NULL);
3155         if (!iommu_domain_cache) {
3156                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3157                 ret = -ENOMEM;
3158         }
3159
3160         return ret;
3161 }
3162
3163 static inline int iommu_devinfo_cache_init(void)
3164 {
3165         int ret = 0;
3166
3167         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3168                                          sizeof(struct device_domain_info),
3169                                          0,
3170                                          SLAB_HWCACHE_ALIGN,
3171                                          NULL);
3172         if (!iommu_devinfo_cache) {
3173                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3174                 ret = -ENOMEM;
3175         }
3176
3177         return ret;
3178 }
3179
3180 static inline int iommu_iova_cache_init(void)
3181 {
3182         int ret = 0;
3183
3184         iommu_iova_cache = kmem_cache_create("iommu_iova",
3185                                          sizeof(struct iova),
3186                                          0,
3187                                          SLAB_HWCACHE_ALIGN,
3188                                          NULL);
3189         if (!iommu_iova_cache) {
3190                 printk(KERN_ERR "Couldn't create iova cache\n");
3191                 ret = -ENOMEM;
3192         }
3193
3194         return ret;
3195 }
3196
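             /*
              * Create the slab caches used by the driver (iova, dmar_domain and
              * device_domain_info objects), unwinding in reverse order if any
              * allocation fails.
              */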
3197 static int __init iommu_init_mempool(void)
3198 {
3199         int ret;
3200         ret = iommu_iova_cache_init();
3201         if (ret)
3202                 return ret;
3203
3204         ret = iommu_domain_cache_init();
3205         if (ret)
3206                 goto domain_error;
3207
3208         ret = iommu_devinfo_cache_init();
3209         if (!ret)
3210                 return ret;
3211
3212         kmem_cache_destroy(iommu_domain_cache);
3213 domain_error:
3214         kmem_cache_destroy(iommu_iova_cache);
3215
3216         return -ENOMEM;
3217 }
3218
3219 static void __init iommu_exit_mempool(void)
3220 {
3221         kmem_cache_destroy(iommu_devinfo_cache);
3222         kmem_cache_destroy(iommu_domain_cache);
3223         kmem_cache_destroy(iommu_iova_cache);
3224
3225 }
3226
3227 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3228 {
3229         struct dmar_drhd_unit *drhd;
3230         u32 vtbar;
3231         int rc;
3232
3233         /* We know that this device on this chipset has its own IOMMU.
3234          * If we find it under a different IOMMU, then the BIOS is lying
3235          * to us. Hope that the IOMMU for this device is actually
3236          * disabled, and it needs no translation...
3237          */
3238         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3239         if (rc) {
3240                 /* "can't" happen */
3241                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3242                 return;
3243         }
3244         vtbar &= 0xffff0000;
3245
3246         /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3247         drhd = dmar_find_matched_drhd_unit(pdev);
3248         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3249                             TAINT_FIRMWARE_WORKAROUND,
3250                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3251                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3252 }
3253 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3254
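             /*
              * Mark DMAR units that can be ignored: units whose device scope
              * matched no PCI devices at all, and (unless dmar_map_gfx is set)
              * units covering nothing but graphics devices, whose devices then
              * get the dummy "no translation needed" archdata.
              */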
3255 static void __init init_no_remapping_devices(void)
3256 {
3257         struct dmar_drhd_unit *drhd;
3258
3259         for_each_drhd_unit(drhd) {
3260                 if (!drhd->include_all) {
3261                         int i;
3262                         for (i = 0; i < drhd->devices_cnt; i++)
3263                                 if (drhd->devices[i] != NULL)
3264                                         break;
3265                         /* ignore DMAR unit if no pci devices exist */
3266                         if (i == drhd->devices_cnt)
3267                                 drhd->ignored = 1;
3268                 }
3269         }
3270
3271         for_each_drhd_unit(drhd) {
3272                 int i;
3273                 if (drhd->ignored || drhd->include_all)
3274                         continue;
3275
3276                 for (i = 0; i < drhd->devices_cnt; i++)
3277                         if (drhd->devices[i] &&
3278                             !IS_GFX_DEVICE(drhd->devices[i]))
3279                                 break;
3280
3281                 if (i < drhd->devices_cnt)
3282                         continue;
3283
3284                 /* This IOMMU has *only* gfx devices. Either bypass it or
3285                    set the gfx_mapped flag, as appropriate */
3286                 if (dmar_map_gfx) {
3287                         intel_iommu_gfx_mapped = 1;
3288                 } else {
3289                         drhd->ignored = 1;
3290                         for (i = 0; i < drhd->devices_cnt; i++) {
3291                                 if (!drhd->devices[i])
3292                                         continue;
3293                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3294                         }
3295                 }
3296         }
3297 }
3298
3299 #ifdef CONFIG_SUSPEND
3300 static int init_iommu_hw(void)
3301 {
3302         struct dmar_drhd_unit *drhd;
3303         struct intel_iommu *iommu = NULL;
3304
3305         for_each_active_iommu(iommu, drhd)
3306                 if (iommu->qi)
3307                         dmar_reenable_qi(iommu);
3308
3309         for_each_iommu(iommu, drhd) {
3310                 if (drhd->ignored) {
3311                         /*
3312                          * we always have to disable PMRs or DMA may fail on
3313                          * this device
3314                          */
3315                         if (force_on)
3316                                 iommu_disable_protect_mem_regions(iommu);
3317                         continue;
3318                 }
3319
3320                 iommu_flush_write_buffer(iommu);
3321
3322                 iommu_set_root_entry(iommu);
3323
3324                 iommu->flush.flush_context(iommu, 0, 0, 0,
3325                                            DMA_CCMD_GLOBAL_INVL);
3326                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3327                                          DMA_TLB_GLOBAL_FLUSH);
3328                 if (iommu_enable_translation(iommu))
3329                         return 1;
3330                 iommu_disable_protect_mem_regions(iommu);
3331         }
3332
3333         return 0;
3334 }
3335
3336 static void iommu_flush_all(void)
3337 {
3338         struct dmar_drhd_unit *drhd;
3339         struct intel_iommu *iommu;
3340
3341         for_each_active_iommu(iommu, drhd) {
3342                 iommu->flush.flush_context(iommu, 0, 0, 0,
3343                                            DMA_CCMD_GLOBAL_INVL);
3344                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3345                                          DMA_TLB_GLOBAL_FLUSH);
3346         }
3347 }
3348
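             /*
              * System suspend: flush all caches, disable translation and save the
              * fault-event registers of every active IOMMU so iommu_resume() can
              * restore them after the hardware is re-initialised.
              */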
3349 static int iommu_suspend(void)
3350 {
3351         struct dmar_drhd_unit *drhd;
3352         struct intel_iommu *iommu = NULL;
3353         unsigned long flag;
3354
3355         for_each_active_iommu(iommu, drhd) {
3356                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3357                                                  GFP_ATOMIC);
3358                 if (!iommu->iommu_state)
3359                         goto nomem;
3360         }
3361
3362         iommu_flush_all();
3363
3364         for_each_active_iommu(iommu, drhd) {
3365                 iommu_disable_translation(iommu);
3366
3367                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3368
3369                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3370                         readl(iommu->reg + DMAR_FECTL_REG);
3371                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3372                         readl(iommu->reg + DMAR_FEDATA_REG);
3373                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3374                         readl(iommu->reg + DMAR_FEADDR_REG);
3375                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3376                         readl(iommu->reg + DMAR_FEUADDR_REG);
3377
3378                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3379         }
3380         return 0;
3381
3382 nomem:
3383         for_each_active_iommu(iommu, drhd)
3384                 kfree(iommu->iommu_state);
3385
3386         return -ENOMEM;
3387 }
3388
3389 static void iommu_resume(void)
3390 {
3391         struct dmar_drhd_unit *drhd;
3392         struct intel_iommu *iommu = NULL;
3393         unsigned long flag;
3394
3395         if (init_iommu_hw()) {
3396                 if (force_on)
3397                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3398                 else
3399                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3400                 return;
3401         }
3402
3403         for_each_active_iommu(iommu, drhd) {
3404
3405                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3406
3407                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3408                         iommu->reg + DMAR_FECTL_REG);
3409                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3410                         iommu->reg + DMAR_FEDATA_REG);
3411                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3412                         iommu->reg + DMAR_FEADDR_REG);
3413                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3414                         iommu->reg + DMAR_FEUADDR_REG);
3415
3416                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3417         }
3418
3419         for_each_active_iommu(iommu, drhd)
3420                 kfree(iommu->iommu_state);
3421 }
3422
3423 static struct syscore_ops iommu_syscore_ops = {
3424         .resume         = iommu_resume,
3425         .suspend        = iommu_suspend,
3426 };
3427
3428 static void __init init_iommu_pm_ops(void)
3429 {
3430         register_syscore_ops(&iommu_syscore_ops);
3431 }
3432
3433 #else
3434 static inline void init_iommu_pm_ops(void) {}
3435 #endif  /* CONFIG_SUSPEND */
3436
3437 LIST_HEAD(dmar_rmrr_units);
3438
3439 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3440 {
3441         list_add(&rmrr->list, &dmar_rmrr_units);
3442 }
3443
3444
3445 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3446 {
3447         struct acpi_dmar_reserved_memory *rmrr;
3448         struct dmar_rmrr_unit *rmrru;
3449
3450         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3451         if (!rmrru)
3452                 return -ENOMEM;
3453
3454         rmrru->hdr = header;
3455         rmrr = (struct acpi_dmar_reserved_memory *)header;
3456         rmrru->base_address = rmrr->base_address;
3457         rmrru->end_address = rmrr->end_address;
3458
3459         dmar_register_rmrr_unit(rmrru);
3460         return 0;
3461 }
3462
3463 static int __init
3464 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3465 {
3466         struct acpi_dmar_reserved_memory *rmrr;
3467         int ret;
3468
3469         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3470         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3471                 ((void *)rmrr) + rmrr->header.length,
3472                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3473
3474         if (ret || (rmrru->devices_cnt == 0)) {
3475                 list_del(&rmrru->list);
3476                 kfree(rmrru);
3477         }
3478         return ret;
3479 }
3480
3481 static LIST_HEAD(dmar_atsr_units);
3482
3483 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3484 {
3485         struct acpi_dmar_atsr *atsr;
3486         struct dmar_atsr_unit *atsru;
3487
3488         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3489         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3490         if (!atsru)
3491                 return -ENOMEM;
3492
3493         atsru->hdr = hdr;
3494         atsru->include_all = atsr->flags & 0x1;
3495
3496         list_add(&atsru->list, &dmar_atsr_units);
3497
3498         return 0;
3499 }
3500
3501 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3502 {
3503         int rc;
3504         struct acpi_dmar_atsr *atsr;
3505
3506         if (atsru->include_all)
3507                 return 0;
3508
3509         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3510         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3511                                 (void *)atsr + atsr->header.length,
3512                                 &atsru->devices_cnt, &atsru->devices,
3513                                 atsr->segment);
3514         if (rc || !atsru->devices_cnt) {
3515                 list_del(&atsru->list);
3516                 kfree(atsru);
3517         }
3518
3519         return rc;
3520 }
3521
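             /*
              * Returns 1 if the device sits below a root port listed in an ATSR
              * on its segment (or under an include-all ATSR), i.e. if Address
              * Translation Services may be used for it; 0 otherwise.
              */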
3522 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3523 {
3524         int i;
3525         struct pci_bus *bus;
3526         struct acpi_dmar_atsr *atsr;
3527         struct dmar_atsr_unit *atsru;
3528
3529         dev = pci_physfn(dev);
3530
3531         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3532                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3533                 if (atsr->segment == pci_domain_nr(dev->bus))
3534                         goto found;
3535         }
3536
3537         return 0;
3538
3539 found:
3540         for (bus = dev->bus; bus; bus = bus->parent) {
3541                 struct pci_dev *bridge = bus->self;
3542
3543                 if (!bridge || !pci_is_pcie(bridge) ||
3544                     bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3545                         return 0;
3546
3547                 if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3548                         for (i = 0; i < atsru->devices_cnt; i++)
3549                                 if (atsru->devices[i] == bridge)
3550                                         return 1;
3551                         break;
3552                 }
3553         }
3554
3555         if (atsru->include_all)
3556                 return 1;
3557
3558         return 0;
3559 }
3560
3561 int __init dmar_parse_rmrr_atsr_dev(void)
3562 {
3563         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3564         struct dmar_atsr_unit *atsr, *atsr_n;
3565         int ret = 0;
3566
3567         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3568                 ret = rmrr_parse_dev(rmrr);
3569                 if (ret)
3570                         return ret;
3571         }
3572
3573         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3574                 ret = atsr_parse_dev(atsr);
3575                 if (ret)
3576                         return ret;
3577         }
3578
3579         return ret;
3580 }
3581
3582 /*
3583  * Here we only respond to a device being unbound from its driver.
3584  *
3585  * A newly added device is not attached to its DMAR domain here yet; that
3586  * happens when the device is first mapped to an iova.
3587  */
3588 static int device_notifier(struct notifier_block *nb,
3589                                   unsigned long action, void *data)
3590 {
3591         struct device *dev = data;
3592         struct pci_dev *pdev = to_pci_dev(dev);
3593         struct dmar_domain *domain;
3594
3595         if (iommu_no_mapping(dev))
3596                 return 0;
3597
3598         domain = find_domain(pdev);
3599         if (!domain)
3600                 return 0;
3601
3602         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3603                 domain_remove_one_dev_info(domain, pdev);
3604
3605                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3606                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3607                     list_empty(&domain->devices))
3608                         domain_exit(domain);
3609         }
3610
3611         return 0;
3612 }
3613
3614 static struct notifier_block device_nb = {
3615         .notifier_call = device_notifier,
3616 };
3617
3618 int __init intel_iommu_init(void)
3619 {
3620         int ret = 0;
3621
3622         /* VT-d is required for a TXT/tboot launch, so enforce that */
3623         force_on = tboot_force_iommu();
3624
3625         if (dmar_table_init()) {
3626                 if (force_on)
3627                         panic("tboot: Failed to initialize DMAR table\n");
3628                 return  -ENODEV;
3629         }
3630
3631         if (dmar_dev_scope_init() < 0) {
3632                 if (force_on)
3633                         panic("tboot: Failed to initialize DMAR device scope\n");
3634                 return  -ENODEV;
3635         }
3636
3637         if (no_iommu || dmar_disabled)
3638                 return -ENODEV;
3639
3640         if (iommu_init_mempool()) {
3641                 if (force_on)
3642                         panic("tboot: Failed to initialize iommu memory\n");
3643                 return  -ENODEV;
3644         }
3645
3646         if (list_empty(&dmar_rmrr_units))
3647                 printk(KERN_INFO "DMAR: No RMRR found\n");
3648
3649         if (list_empty(&dmar_atsr_units))
3650                 printk(KERN_INFO "DMAR: No ATSR found\n");
3651
3652         if (dmar_init_reserved_ranges()) {
3653                 if (force_on)
3654                         panic("tboot: Failed to reserve iommu ranges\n");
3655                 return  -ENODEV;
3656         }
3657
3658         init_no_remapping_devices();
3659
3660         ret = init_dmars();
3661         if (ret) {
3662                 if (force_on)
3663                         panic("tboot: Failed to initialize DMARs\n");
3664                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3665                 put_iova_domain(&reserved_iova_list);
3666                 iommu_exit_mempool();
3667                 return ret;
3668         }
3669         printk(KERN_INFO
3670         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3671
3672         init_timer(&unmap_timer);
3673 #ifdef CONFIG_SWIOTLB
3674         swiotlb = 0;
3675 #endif
3676         dma_ops = &intel_dma_ops;
3677
3678         init_iommu_pm_ops();
3679
3680         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3681
3682         bus_register_notifier(&pci_bus_type, &device_nb);
3683
3684         intel_iommu_enabled = 1;
3685
3686         return 0;
3687 }
3688
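             /*
              * When a device sits behind a PCIe-to-PCI bridge, context entries
              * were also programmed for the bridge chain and for the bridge's
              * secondary bus; tear those down along with the device itself.
              */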
3689 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3690                                            struct pci_dev *pdev)
3691 {
3692         struct pci_dev *tmp, *parent;
3693
3694         if (!iommu || !pdev)
3695                 return;
3696
3697         /* dependent device detach */
3698         tmp = pci_find_upstream_pcie_bridge(pdev);
3699         /* Secondary interface's bus number and devfn 0 */
3700         if (tmp) {
3701                 parent = pdev->bus->self;
3702                 while (parent != tmp) {
3703                         iommu_detach_dev(iommu, parent->bus->number,
3704                                          parent->devfn);
3705                         parent = parent->bus->self;
3706                 }
3707                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3708                         iommu_detach_dev(iommu,
3709                                 tmp->subordinate->number, 0);
3710                 else /* this is a legacy PCI bridge */
3711                         iommu_detach_dev(iommu, tmp->bus->number,
3712                                          tmp->devfn);
3713         }
3714 }
3715
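             /*
              * Detach one PCI device from a domain: remove its
              * device_domain_info, clear its context entries and, if it was the
              * last device on this IOMMU for the domain, drop the IOMMU from the
              * domain's bookkeeping (and release the domain id for non-VM,
              * non-identity domains).
              */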
3716 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3717                                           struct pci_dev *pdev)
3718 {
3719         struct device_domain_info *info;
3720         struct intel_iommu *iommu;
3721         unsigned long flags;
3722         int found = 0;
3723         struct list_head *entry, *tmp;
3724
3725         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3726                                 pdev->devfn);
3727         if (!iommu)
3728                 return;
3729
3730         spin_lock_irqsave(&device_domain_lock, flags);
3731         list_for_each_safe(entry, tmp, &domain->devices) {
3732                 info = list_entry(entry, struct device_domain_info, link);
3733                 if (info->segment == pci_domain_nr(pdev->bus) &&
3734                     info->bus == pdev->bus->number &&
3735                     info->devfn == pdev->devfn) {
3736                         list_del(&info->link);
3737                         list_del(&info->global);
3738                         if (info->dev)
3739                                 info->dev->dev.archdata.iommu = NULL;
3740                         spin_unlock_irqrestore(&device_domain_lock, flags);
3741
3742                         iommu_disable_dev_iotlb(info);
3743                         iommu_detach_dev(iommu, info->bus, info->devfn);
3744                         iommu_detach_dependent_devices(iommu, pdev);
3745                         free_devinfo_mem(info);
3746
3747                         spin_lock_irqsave(&device_domain_lock, flags);
3748
3749                         if (found)
3750                                 break;
3751                         else
3752                                 continue;
3753                 }
3754
3755                 /* if there are no other devices under the same iommu
3756                  * owned by this domain, clear this iommu in iommu_bmp and
3757                  * update the iommu count and coherency
3758                  */
3759                 if (iommu == device_to_iommu(info->segment, info->bus,
3760                                             info->devfn))
3761                         found = 1;
3762         }
3763
3764         spin_unlock_irqrestore(&device_domain_lock, flags);
3765
3766         if (found == 0) {
3767                 unsigned long tmp_flags;
3768                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3769                 clear_bit(iommu->seq_id, domain->iommu_bmp);
3770                 domain->iommu_count--;
3771                 domain_update_iommu_cap(domain);
3772                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3773
3774                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3775                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3776                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3777                         clear_bit(domain->id, iommu->domain_ids);
3778                         iommu->domains[domain->id] = NULL;
3779                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3780                 }
3781         }
3782 }
3783
3784 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3785 {
3786         struct device_domain_info *info;
3787         struct intel_iommu *iommu;
3788         unsigned long flags1, flags2;
3789
3790         spin_lock_irqsave(&device_domain_lock, flags1);
3791         while (!list_empty(&domain->devices)) {
3792                 info = list_entry(domain->devices.next,
3793                         struct device_domain_info, link);
3794                 list_del(&info->link);
3795                 list_del(&info->global);
3796                 if (info->dev)
3797                         info->dev->dev.archdata.iommu = NULL;
3798
3799                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3800
3801                 iommu_disable_dev_iotlb(info);
3802                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3803                 iommu_detach_dev(iommu, info->bus, info->devfn);
3804                 iommu_detach_dependent_devices(iommu, info->dev);
3805
3806                 /* clear this iommu in iommu_bmp, update iommu count
3807                  * and capabilities
3808                  */
3809                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3810                 if (test_and_clear_bit(iommu->seq_id,
3811                                        domain->iommu_bmp)) {
3812                         domain->iommu_count--;
3813                         domain_update_iommu_cap(domain);
3814                 }
3815                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3816
3817                 free_devinfo_mem(info);
3818                 spin_lock_irqsave(&device_domain_lock, flags1);
3819         }
3820         spin_unlock_irqrestore(&device_domain_lock, flags1);
3821 }
3822
3823 /* domain id for virtual machine, it won't be set in context */
3824 static unsigned long vm_domid;
3825
3826 static struct dmar_domain *iommu_alloc_vm_domain(void)
3827 {
3828         struct dmar_domain *domain;
3829
3830         domain = alloc_domain_mem();
3831         if (!domain)
3832                 return NULL;
3833
3834         domain->id = vm_domid++;
3835         domain->nid = -1;
3836         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3837         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3838
3839         return domain;
3840 }
3841
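             /*
              * Minimal setup for a domain created through the IOMMU API: init the
              * iova allocator and reserved ranges, derive the AGAW from the
              * requested guest width and allocate the top-level page directory.
              * Per-IOMMU state is filled in when devices are attached.
              */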
3842 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3843 {
3844         int adjust_width;
3845
3846         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3847         spin_lock_init(&domain->iommu_lock);
3848
3849         domain_reserve_special_ranges(domain);
3850
3851         /* calculate AGAW */
3852         domain->gaw = guest_width;
3853         adjust_width = guestwidth_to_adjustwidth(guest_width);
3854         domain->agaw = width_to_agaw(adjust_width);
3855
3856         INIT_LIST_HEAD(&domain->devices);
3857
3858         domain->iommu_count = 0;
3859         domain->iommu_coherency = 0;
3860         domain->iommu_snooping = 0;
3861         domain->iommu_superpage = 0;
3862         domain->max_addr = 0;
3863         domain->nid = -1;
3864
3865         /* always allocate the top pgd */
3866         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3867         if (!domain->pgd)
3868                 return -ENOMEM;
3869         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3870         return 0;
3871 }
3872
3873 static void iommu_free_vm_domain(struct dmar_domain *domain)
3874 {
3875         unsigned long flags;
3876         struct dmar_drhd_unit *drhd;
3877         struct intel_iommu *iommu;
3878         unsigned long i;
3879         unsigned long ndomains;
3880
3881         for_each_drhd_unit(drhd) {
3882                 if (drhd->ignored)
3883                         continue;
3884                 iommu = drhd->iommu;
3885
3886                 ndomains = cap_ndoms(iommu->cap);
3887                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3888                         if (iommu->domains[i] == domain) {
3889                                 spin_lock_irqsave(&iommu->lock, flags);
3890                                 clear_bit(i, iommu->domain_ids);
3891                                 iommu->domains[i] = NULL;
3892                                 spin_unlock_irqrestore(&iommu->lock, flags);
3893                                 break;
3894                         }
3895                 }
3896         }
3897 }
3898
3899 static void vm_domain_exit(struct dmar_domain *domain)
3900 {
3901         /* Domain 0 is reserved, so don't process it */
3902         if (!domain)
3903                 return;
3904
3905         vm_domain_remove_all_dev_info(domain);
3906         /* destroy iovas */
3907         put_iova_domain(&domain->iovad);
3908
3909         /* clear ptes */
3910         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3911
3912         /* free page tables */
3913         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3914
3915         iommu_free_vm_domain(domain);
3916         free_domain_mem(domain);
3917 }
3918
3919 static int intel_iommu_domain_init(struct iommu_domain *domain)
3920 {
3921         struct dmar_domain *dmar_domain;
3922
3923         dmar_domain = iommu_alloc_vm_domain();
3924         if (!dmar_domain) {
3925                 printk(KERN_ERR
3926                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3927                 return -ENOMEM;
3928         }
3929         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3930                 printk(KERN_ERR
3931                         "intel_iommu_domain_init() failed\n");
3932                 vm_domain_exit(dmar_domain);
3933                 return -ENOMEM;
3934         }
3935         domain_update_iommu_cap(dmar_domain);
3936         domain->priv = dmar_domain;
3937
3938         return 0;
3939 }
3940
3941 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3942 {
3943         struct dmar_domain *dmar_domain = domain->priv;
3944
3945         domain->priv = NULL;
3946         vm_domain_exit(dmar_domain);
3947 }
3948
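             /*
              * IOMMU API attach: if the device is still attached to a DMA API or
              * identity domain, detach it first; check that this IOMMU's address
              * width covers everything already mapped in the domain, trim surplus
              * page-table levels, then add the device info and set up its context
              * entry.
              */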
3949 static int intel_iommu_attach_device(struct iommu_domain *domain,
3950                                      struct device *dev)
3951 {
3952         struct dmar_domain *dmar_domain = domain->priv;
3953         struct pci_dev *pdev = to_pci_dev(dev);
3954         struct intel_iommu *iommu;
3955         int addr_width;
3956
3957         /* normally pdev is not mapped */
3958         if (unlikely(domain_context_mapped(pdev))) {
3959                 struct dmar_domain *old_domain;
3960
3961                 old_domain = find_domain(pdev);
3962                 if (old_domain) {
3963                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3964                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3965                                 domain_remove_one_dev_info(old_domain, pdev);
3966                         else
3967                                 domain_remove_dev_info(old_domain);
3968                 }
3969         }
3970
3971         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3972                                 pdev->devfn);
3973         if (!iommu)
3974                 return -ENODEV;
3975
3976         /* check if this iommu agaw is sufficient for max mapped address */
3977         addr_width = agaw_to_width(iommu->agaw);
3978         if (addr_width > cap_mgaw(iommu->cap))
3979                 addr_width = cap_mgaw(iommu->cap);
3980
3981         if (dmar_domain->max_addr > (1LL << addr_width)) {
3982                 printk(KERN_ERR "%s: iommu width (%d) is not "
3983                        "sufficient for the mapped address (%llx)\n",
3984                        __func__, addr_width, dmar_domain->max_addr);
3985                 return -EFAULT;
3986         }
3987         dmar_domain->gaw = addr_width;
3988
3989         /*
3990          * Knock out extra levels of page tables if necessary
3991          */
3992         while (iommu->agaw < dmar_domain->agaw) {
3993                 struct dma_pte *pte;
3994
3995                 pte = dmar_domain->pgd;
3996                 if (dma_pte_present(pte)) {
3997                         dmar_domain->pgd = (struct dma_pte *)
3998                                 phys_to_virt(dma_pte_addr(pte));
3999                         free_pgtable_page(pte);
4000                 }
4001                 dmar_domain->agaw--;
4002         }
4003
4004         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4005 }
4006
4007 static void intel_iommu_detach_device(struct iommu_domain *domain,
4008                                       struct device *dev)
4009 {
4010         struct dmar_domain *dmar_domain = domain->priv;
4011         struct pci_dev *pdev = to_pci_dev(dev);
4012
4013         domain_remove_one_dev_info(dmar_domain, pdev);
4014 }
4015
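             /*
              * IOMMU API map: translate IOMMU_READ/WRITE/CACHE into DMA PTE bits,
              * grow the domain's max_addr (checking it still fits the guest
              * address width) and install the mapping via domain_pfn_mapping().
              */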
4016 static int intel_iommu_map(struct iommu_domain *domain,
4017                            unsigned long iova, phys_addr_t hpa,
4018                            size_t size, int iommu_prot)
4019 {
4020         struct dmar_domain *dmar_domain = domain->priv;
4021         u64 max_addr;
4022         int prot = 0;
4023         int ret;
4024
4025         if (iommu_prot & IOMMU_READ)
4026                 prot |= DMA_PTE_READ;
4027         if (iommu_prot & IOMMU_WRITE)
4028                 prot |= DMA_PTE_WRITE;
4029         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4030                 prot |= DMA_PTE_SNP;
4031
4032         max_addr = iova + size;
4033         if (dmar_domain->max_addr < max_addr) {
4034                 u64 end;
4035
4036                 /* check if minimum agaw is sufficient for mapped address */
4037                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4038                 if (end < max_addr) {
4039                         printk(KERN_ERR "%s: iommu width (%d) is not "
4040                                "sufficient for the mapped address (%llx)\n",
4041                                __func__, dmar_domain->gaw, max_addr);
4042                         return -EFAULT;
4043                 }
4044                 dmar_domain->max_addr = max_addr;
4045         }
4046         /* Round up size to next multiple of PAGE_SIZE, if it and
4047            the low bits of hpa would take us onto the next page */
4048         size = aligned_nrpages(hpa, size);
4049         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4050                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4051         return ret;
4052 }
4053
4054 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4055                              unsigned long iova, size_t size)
4056 {
4057         struct dmar_domain *dmar_domain = domain->priv;
4058         int order;
4059
4060         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4061                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4062
4063         if (dmar_domain->max_addr == iova + size)
4064                 dmar_domain->max_addr = iova;
4065
4066         return PAGE_SIZE << order;
4067 }
4068
4069 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4070                                             unsigned long iova)
4071 {
4072         struct dmar_domain *dmar_domain = domain->priv;
4073         struct dma_pte *pte;
4074         u64 phys = 0;
4075
4076         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4077         if (pte)
4078                 phys = dma_pte_addr(pte);
4079
4080         return phys;
4081 }
4082
4083 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4084                                       unsigned long cap)
4085 {
4086         struct dmar_domain *dmar_domain = domain->priv;
4087
4088         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4089                 return dmar_domain->iommu_snooping;
4090         if (cap == IOMMU_CAP_INTR_REMAP)
4091                 return irq_remapping_enabled;
4092
4093         return 0;
4094 }
4095
4096 /*
4097  * Group numbers are arbitrary.  Devices with the same group number
4098  * indicate that the IOMMU cannot differentiate between them.  To avoid
4099  * tracking used groups we just use the seg|bus|devfn of the lowest
4100  * level at which we're able to differentiate devices.
4101  */
4102 static int intel_iommu_device_group(struct device *dev, unsigned int *groupid)
4103 {
4104         struct pci_dev *pdev = to_pci_dev(dev);
4105         struct pci_dev *bridge;
4106         union {
4107                 struct {
4108                         u8 devfn;
4109                         u8 bus;
4110                         u16 segment;
4111                 } pci;
4112                 u32 group;
4113         } id;
4114
4115         if (iommu_no_mapping(dev))
4116                 return -ENODEV;
4117
4118         id.pci.segment = pci_domain_nr(pdev->bus);
4119         id.pci.bus = pdev->bus->number;
4120         id.pci.devfn = pdev->devfn;
4121
4122         if (!device_to_iommu(id.pci.segment, id.pci.bus, id.pci.devfn))
4123                 return -ENODEV;
4124
4125         bridge = pci_find_upstream_pcie_bridge(pdev);
4126         if (bridge) {
4127                 if (pci_is_pcie(bridge)) {
4128                         id.pci.bus = bridge->subordinate->number;
4129                         id.pci.devfn = 0;
4130                 } else {
4131                         id.pci.bus = bridge->bus->number;
4132                         id.pci.devfn = bridge->devfn;
4133                 }
4134         }
4135
4136         if (!pdev->is_virtfn && iommu_group_mf)
4137                 id.pci.devfn = PCI_DEVFN(PCI_SLOT(id.pci.devfn), 0);
4138
4139         *groupid = id.group;
4140
4141         return 0;
4142 }
4143
4144 static struct iommu_ops intel_iommu_ops = {
4145         .domain_init    = intel_iommu_domain_init,
4146         .domain_destroy = intel_iommu_domain_destroy,
4147         .attach_dev     = intel_iommu_attach_device,
4148         .detach_dev     = intel_iommu_detach_device,
4149         .map            = intel_iommu_map,
4150         .unmap          = intel_iommu_unmap,
4151         .iova_to_phys   = intel_iommu_iova_to_phys,
4152         .domain_has_cap = intel_iommu_domain_has_cap,
4153         .device_group   = intel_iommu_device_group,
4154         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4155 };
4156
4157 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4158 {
4159         /*
4160          * Mobile 4 Series Chipset neglects to set RWBF capability,
4161          * but needs it:
4162          */
4163         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4164         rwbf_quirk = 1;
4165
4166         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4167         if (dev->revision == 0x07) {
4168                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4169                 dmar_map_gfx = 0;
4170         }
4171 }
4172
4173 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4174
4175 #define GGC 0x52
4176 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4177 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4178 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4179 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4180 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4181 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4182 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4183 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4184
4185 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4186 {
4187         unsigned short ggc;
4188
4189         if (pci_read_config_word(dev, GGC, &ggc))
4190                 return;
4191
4192         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4193                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4194                 dmar_map_gfx = 0;
4195         } else if (dmar_map_gfx) {
4196                 /* we have to ensure the gfx device is idle before we flush */
4197                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4198                 intel_iommu_strict = 1;
4199         }
4200 }
4201 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4202 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4203 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4204 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4205
4206 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4207    ISOCH DMAR unit for the Azalia sound device, but not give it any
4208    TLB entries, which causes it to deadlock. Check for that.  We do
4209    this in a function called from init_dmars(), instead of in a PCI
4210    quirk, because we don't want to print the obnoxious "BIOS broken"
4211    message if VT-d is actually disabled.
4212 */
4213 static void __init check_tylersburg_isoch(void)
4214 {
4215         struct pci_dev *pdev;
4216         uint32_t vtisochctrl;
4217
4218         /* If there's no Azalia in the system anyway, forget it. */
4219         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4220         if (!pdev)
4221                 return;
4222         pci_dev_put(pdev);
4223
4224         /* System Management Registers. Might be hidden, in which case
4225            we can't do the sanity check. But that's OK, because the
4226            known-broken BIOSes _don't_ actually hide it, so far. */
4227         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4228         if (!pdev)
4229                 return;
4230
4231         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4232                 pci_dev_put(pdev);
4233                 return;
4234         }
4235
4236         pci_dev_put(pdev);
4237
4238         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4239         if (vtisochctrl & 1)
4240                 return;
4241
4242         /* Drop all bits other than the number of TLB entries */
4243         vtisochctrl &= 0x1c;
4244
4245         /* If we have the recommended number of TLB entries (16), fine. */
4246         if (vtisochctrl == 0x10)
4247                 return;
4248
4249         /* Zero TLB entries? You get to ride the short bus to school. */
4250         if (!vtisochctrl) {
4251                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4252                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4253                      dmi_get_system_info(DMI_BIOS_VENDOR),
4254                      dmi_get_system_info(DMI_BIOS_VERSION),
4255                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4256                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4257                 return;
4258         }
4259
4260         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4261                vtisochctrl);
4262 }