iommu/vt-d: fix memory leakage caused by commit ea8ea46
[firefly-linux-kernel-4.4.55.git] / drivers / iommu / intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  */
19
20 #include <linux/init.h>
21 #include <linux/bitmap.h>
22 #include <linux/debugfs.h>
23 #include <linux/export.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/spinlock.h>
28 #include <linux/pci.h>
29 #include <linux/dmar.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/mempool.h>
32 #include <linux/memory.h>
33 #include <linux/timer.h>
34 #include <linux/iova.h>
35 #include <linux/iommu.h>
36 #include <linux/intel-iommu.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <asm/irq_remapping.h>
43 #include <asm/cacheflush.h>
44 #include <asm/iommu.h>
45
46 #include "irq_remapping.h"
47 #include "pci.h"
48
49 #define ROOT_SIZE               VTD_PAGE_SIZE
50 #define CONTEXT_SIZE            VTD_PAGE_SIZE
51
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
64
65 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
66 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
67
68 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
69    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
70 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
71                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
72 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
73
74 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
75 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
76 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
77
78 /* page table handling */
79 #define LEVEL_STRIDE            (9)
80 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
81
82 /*
83  * This bitmap is used to advertise the page sizes our hardware supports
84  * to the IOMMU core, which will then use this information to split
85  * physically contiguous memory regions it is mapping into page sizes
86  * that we support.
87  *
88  * Traditionally the IOMMU core just handed us the mappings directly,
89  * after making sure the size is an order of a 4KiB page and that the
90  * mapping has natural alignment.
91  *
92  * To retain this behavior, we currently advertise that we support
93  * all page sizes that are an order of 4KiB.
94  *
95  * If at some point we'd like to utilize the IOMMU core's new behavior,
96  * we could change this to advertise the real page sizes we support.
97  */
98 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
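/*
 * A rough illustration (values not used by the driver): ~0xFFFUL clears
 * bits 0-11 and leaves every bit from 12 upward set, so the IOMMU core
 * sees all power-of-two sizes >= 4KiB as supported, e.g.
 *
 *      INTEL_IOMMU_PGSIZES & SZ_4K   -> nonzero  (4KiB advertised)
 *      INTEL_IOMMU_PGSIZES & SZ_2M   -> nonzero  (2MiB advertised)
 *      INTEL_IOMMU_PGSIZES & SZ_1G   -> nonzero  (1GiB advertised)
 *
 * Advertising only the real hardware sizes would instead look roughly
 * like (SZ_4K | SZ_2M | SZ_1G), depending on cap_super_page_val().
 */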
99
100 static inline int agaw_to_level(int agaw)
101 {
102         return agaw + 2;
103 }
104
105 static inline int agaw_to_width(int agaw)
106 {
107         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
108 }
109
110 static inline int width_to_agaw(int width)
111 {
112         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
113 }
114
115 static inline unsigned int level_to_offset_bits(int level)
116 {
117         return (level - 1) * LEVEL_STRIDE;
118 }
119
120 static inline int pfn_level_offset(unsigned long pfn, int level)
121 {
122         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
123 }
124
125 static inline unsigned long level_mask(int level)
126 {
127         return -1UL << level_to_offset_bits(level);
128 }
129
130 static inline unsigned long level_size(int level)
131 {
132         return 1UL << level_to_offset_bits(level);
133 }
134
135 static inline unsigned long align_to_level(unsigned long pfn, int level)
136 {
137         return (pfn + level_size(level) - 1) & level_mask(level);
138 }
139
140 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
141 {
142         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
143 }
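/*
 * A small worked example of how the helpers above relate, assuming the
 * standard 4KiB VTD_PAGE_SIZE (VTD_PAGE_SHIFT == 12):
 *
 *      width_to_agaw(48)  == DIV_ROUND_UP(48 - 30, 9) == 2
 *      agaw_to_level(2)   == 4        (a 4-level page table)
 *      agaw_to_width(2)   == 48       (48-bit address width)
 *      level_size(2)      == 1UL << 9 (512 pfns per level-2 entry)
 *      lvl_to_nr_pages(2) == 512      (4KiB pages covered by one level-2 PTE)
 */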
144
145 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
146    are never going to work. */
147 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
148 {
149         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
150 }
151
152 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
153 {
154         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
155 }
156 static inline unsigned long page_to_dma_pfn(struct page *pg)
157 {
158         return mm_to_dma_pfn(page_to_pfn(pg));
159 }
160 static inline unsigned long virt_to_dma_pfn(void *p)
161 {
162         return page_to_dma_pfn(virt_to_page(p));
163 }
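/*
 * A brief example: on the common x86 configuration PAGE_SHIFT ==
 * VTD_PAGE_SHIFT == 12, so mm_to_dma_pfn() and dma_to_mm_pfn() are
 * identity conversions.  On a hypothetical 64KiB-page kernel
 * (PAGE_SHIFT == 16) each MM pfn would cover 16 VT-d pfns:
 *
 *      mm_to_dma_pfn(1)  == 1 << 4  == 16
 *      dma_to_mm_pfn(16) == 16 >> 4 == 1
 */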
164
165 /* global iommu list, set NULL for ignored DMAR units */
166 static struct intel_iommu **g_iommus;
167
168 static void __init check_tylersburg_isoch(void);
169 static int rwbf_quirk;
170
171 /*
172  * set to 1 to panic the kernel if VT-d can't be successfully enabled
173  * (used when kernel is launched w/ TXT)
174  */
175 static int force_on = 0;
176
177 /*
178  * 0: Present
179  * 1-11: Reserved
180  * 12-63: Context Ptr (12 - (haw-1))
181  * 64-127: Reserved
182  */
183 struct root_entry {
184         u64     val;
185         u64     rsvd1;
186 };
187 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 static inline bool root_present(struct root_entry *root)
189 {
190         return (root->val & 1);
191 }
192 static inline void set_root_present(struct root_entry *root)
193 {
194         root->val |= 1;
195 }
196 static inline void set_root_value(struct root_entry *root, unsigned long value)
197 {
198         root->val |= value & VTD_PAGE_MASK;
199 }
200
201 static inline struct context_entry *
202 get_context_addr_from_root(struct root_entry *root)
203 {
204         return (struct context_entry *)
205                 (root_present(root)?phys_to_virt(
206                 root->val & VTD_PAGE_MASK) :
207                 NULL);
208 }
209
210 /*
211  * low 64 bits:
212  * 0: present
213  * 1: fault processing disable
214  * 2-3: translation type
215  * 12-63: address space root
216  * high 64 bits:
217  * 0-2: address width
218  * 3-6: avail
219  * 8-23: domain id
220  */
221 struct context_entry {
222         u64 lo;
223         u64 hi;
224 };
225
226 static inline bool context_present(struct context_entry *context)
227 {
228         return (context->lo & 1);
229 }
230 static inline void context_set_present(struct context_entry *context)
231 {
232         context->lo |= 1;
233 }
234
235 static inline void context_set_fault_enable(struct context_entry *context)
236 {
237         context->lo &= (((u64)-1) << 2) | 1;
238 }
239
240 static inline void context_set_translation_type(struct context_entry *context,
241                                                 unsigned long value)
242 {
243         context->lo &= (((u64)-1) << 4) | 3;
244         context->lo |= (value & 3) << 2;
245 }
246
247 static inline void context_set_address_root(struct context_entry *context,
248                                             unsigned long value)
249 {
250         context->lo |= value & VTD_PAGE_MASK;
251 }
252
253 static inline void context_set_address_width(struct context_entry *context,
254                                              unsigned long value)
255 {
256         context->hi |= value & 7;
257 }
258
259 static inline void context_set_domain_id(struct context_entry *context,
260                                          unsigned long value)
261 {
262         context->hi |= (value & ((1 << 16) - 1)) << 8;
263 }
264
265 static inline void context_clear_entry(struct context_entry *context)
266 {
267         context->lo = 0;
268         context->hi = 0;
269 }
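/*
 * A minimal sketch of how a context entry is typically composed with the
 * helpers above; the variable names are illustrative only and the real
 * sequence lives in the context-mapping code further down in this file:
 *
 *      context_clear_entry(context);
 *      context_set_domain_id(context, id);
 *      context_set_address_width(context, agaw);
 *      context_set_address_root(context, virt_to_phys(pgd));
 *      context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *      context_set_fault_enable(context);
 *      context_set_present(context);
 */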
270
271 /*
272  * 0: readable
273  * 1: writable
274  * 2-6: reserved
275  * 7: super page
276  * 8-10: available
277  * 11: snoop behavior
278  * 12-63: Host physical address
279  */
280 struct dma_pte {
281         u64 val;
282 };
283
284 static inline void dma_clear_pte(struct dma_pte *pte)
285 {
286         pte->val = 0;
287 }
288
289 static inline u64 dma_pte_addr(struct dma_pte *pte)
290 {
291 #ifdef CONFIG_64BIT
292         return pte->val & VTD_PAGE_MASK;
293 #else
294         /* Must have a full atomic 64-bit read */
295         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
296 #endif
297 }
298
299 static inline bool dma_pte_present(struct dma_pte *pte)
300 {
301         return (pte->val & 3) != 0;
302 }
303
304 static inline bool dma_pte_superpage(struct dma_pte *pte)
305 {
306         return (pte->val & (1 << 7));
307 }
308
309 static inline int first_pte_in_page(struct dma_pte *pte)
310 {
311         return !((unsigned long)pte & ~VTD_PAGE_MASK);
312 }
313
314 /*
315  * This domain is a static identity mapping domain.
316  *      1. This domain creates a static 1:1 mapping to all usable memory.
317  *      2. It maps to each iommu if successful.
318  *      3. Each iommu maps to this domain if successful.
319  */
320 static struct dmar_domain *si_domain;
321 static int hw_pass_through = 1;
322
323 /* devices under the same p2p bridge are owned in one domain */
324 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
325
326 /* domain represents a virtual machine; more than one device
327  * across iommus may be owned by one domain, e.g. a kvm guest.
328  */
329 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
330
331 /* si_domain contains multiple devices */
332 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
333
334 /* define the limit of IOMMUs supported in each domain */
335 #ifdef  CONFIG_X86
336 # define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
337 #else
338 # define        IOMMU_UNITS_SUPPORTED   64
339 #endif
340
341 struct dmar_domain {
342         int     id;                     /* domain id */
343         int     nid;                    /* node id */
344         DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
345                                         /* bitmap of iommus this domain uses*/
346
347         struct list_head devices;       /* all devices' list */
348         struct iova_domain iovad;       /* iova's that belong to this domain */
349
350         struct dma_pte  *pgd;           /* virtual address */
351         int             gaw;            /* max guest address width */
352
353         /* adjusted guest address width, 0 is level 2 30-bit */
354         int             agaw;
355
356         int             flags;          /* flags to find out type of domain */
357
358         int             iommu_coherency;/* indicate coherency of iommu access */
359         int             iommu_snooping; /* indicate snooping control feature*/
360         int             iommu_count;    /* reference count of iommu */
361         int             iommu_superpage;/* Level of superpages supported:
362                                            0 == 4KiB (no superpages), 1 == 2MiB,
363                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
364         spinlock_t      iommu_lock;     /* protect iommu set in domain */
365         u64             max_addr;       /* maximum mapped address */
366 };
367
368 /* PCI domain-device relationship */
369 struct device_domain_info {
370         struct list_head link;  /* link to domain siblings */
371         struct list_head global; /* link to global list */
372         u8 bus;                 /* PCI bus number */
373         u8 devfn;               /* PCI devfn number */
374         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
375         struct intel_iommu *iommu; /* IOMMU used by this device */
376         struct dmar_domain *domain; /* pointer to domain */
377 };
378
379 struct dmar_rmrr_unit {
380         struct list_head list;          /* list of rmrr units   */
381         struct acpi_dmar_header *hdr;   /* ACPI header          */
382         u64     base_address;           /* reserved base address*/
383         u64     end_address;            /* reserved end address */
384         struct dmar_dev_scope *devices; /* target devices */
385         int     devices_cnt;            /* target device count */
386 };
387
388 struct dmar_atsr_unit {
389         struct list_head list;          /* list of ATSR units */
390         struct acpi_dmar_header *hdr;   /* ACPI header */
391         struct dmar_dev_scope *devices; /* target devices */
392         int devices_cnt;                /* target device count */
393         u8 include_all:1;               /* include all ports */
394 };
395
396 static LIST_HEAD(dmar_atsr_units);
397 static LIST_HEAD(dmar_rmrr_units);
398
399 #define for_each_rmrr_units(rmrr) \
400         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
401
402 static void flush_unmaps_timeout(unsigned long data);
403
404 static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
405
406 #define HIGH_WATER_MARK 250
407 struct deferred_flush_tables {
408         int next;
409         struct iova *iova[HIGH_WATER_MARK];
410         struct dmar_domain *domain[HIGH_WATER_MARK];
411         struct page *freelist[HIGH_WATER_MARK];
412 };
413
414 static struct deferred_flush_tables *deferred_flush;
415
416 /* number of registered intel_iommus; sizes g_iommus and per-iommu arrays */
417 static int g_num_of_iommus;
418
419 static DEFINE_SPINLOCK(async_umap_flush_lock);
420 static LIST_HEAD(unmaps_to_do);
421
422 static int timer_on;
423 static long list_size;
424
425 static void domain_exit(struct dmar_domain *domain);
426 static void domain_remove_dev_info(struct dmar_domain *domain);
427 static void domain_remove_one_dev_info(struct dmar_domain *domain,
428                                        struct device *dev);
429 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
430                                            struct device *dev);
431
432 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
433 int dmar_disabled = 0;
434 #else
435 int dmar_disabled = 1;
436 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
437
438 int intel_iommu_enabled = 0;
439 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
440
441 static int dmar_map_gfx = 1;
442 static int dmar_forcedac;
443 static int intel_iommu_strict;
444 static int intel_iommu_superpage = 1;
445
446 int intel_iommu_gfx_mapped;
447 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
448
449 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
450 static DEFINE_SPINLOCK(device_domain_lock);
451 static LIST_HEAD(device_domain_list);
452
453 static struct iommu_ops intel_iommu_ops;
454
455 static int __init intel_iommu_setup(char *str)
456 {
457         if (!str)
458                 return -EINVAL;
459         while (*str) {
460                 if (!strncmp(str, "on", 2)) {
461                         dmar_disabled = 0;
462                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
463                 } else if (!strncmp(str, "off", 3)) {
464                         dmar_disabled = 1;
465                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
466                 } else if (!strncmp(str, "igfx_off", 8)) {
467                         dmar_map_gfx = 0;
468                         printk(KERN_INFO
469                                 "Intel-IOMMU: disable GFX device mapping\n");
470                 } else if (!strncmp(str, "forcedac", 8)) {
471                         printk(KERN_INFO
472                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
473                         dmar_forcedac = 1;
474                 } else if (!strncmp(str, "strict", 6)) {
475                         printk(KERN_INFO
476                                 "Intel-IOMMU: disable batched IOTLB flush\n");
477                         intel_iommu_strict = 1;
478                 } else if (!strncmp(str, "sp_off", 6)) {
479                         printk(KERN_INFO
480                                 "Intel-IOMMU: disable supported super page\n");
481                         intel_iommu_superpage = 0;
482                 }
483
484                 str += strcspn(str, ",");
485                 while (*str == ',')
486                         str++;
487         }
488         return 0;
489 }
490 __setup("intel_iommu=", intel_iommu_setup);
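/*
 * For reference, the options parsed above are combined on the kernel
 * command line as a comma-separated list, e.g. (illustrative):
 *
 *      intel_iommu=on,strict,sp_off
 *
 * which enables the IOMMU, disables batched IOTLB flushing and turns
 * off superpage support.
 */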
491
492 static struct kmem_cache *iommu_domain_cache;
493 static struct kmem_cache *iommu_devinfo_cache;
494 static struct kmem_cache *iommu_iova_cache;
495
496 static inline void *alloc_pgtable_page(int node)
497 {
498         struct page *page;
499         void *vaddr = NULL;
500
501         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
502         if (page)
503                 vaddr = page_address(page);
504         return vaddr;
505 }
506
507 static inline void free_pgtable_page(void *vaddr)
508 {
509         free_page((unsigned long)vaddr);
510 }
511
512 static inline void *alloc_domain_mem(void)
513 {
514         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
515 }
516
517 static void free_domain_mem(void *vaddr)
518 {
519         kmem_cache_free(iommu_domain_cache, vaddr);
520 }
521
522 static inline void * alloc_devinfo_mem(void)
523 {
524         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
525 }
526
527 static inline void free_devinfo_mem(void *vaddr)
528 {
529         kmem_cache_free(iommu_devinfo_cache, vaddr);
530 }
531
532 struct iova *alloc_iova_mem(void)
533 {
534         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
535 }
536
537 void free_iova_mem(struct iova *iova)
538 {
539         kmem_cache_free(iommu_iova_cache, iova);
540 }
541
542
543 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
544 {
545         unsigned long sagaw;
546         int agaw = -1;
547
548         sagaw = cap_sagaw(iommu->cap);
549         for (agaw = width_to_agaw(max_gaw);
550              agaw >= 0; agaw--) {
551                 if (test_bit(agaw, &sagaw))
552                         break;
553         }
554
555         return agaw;
556 }
557
558 /*
559  * Calculate max SAGAW for each iommu.
560  */
561 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
562 {
563         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
564 }
565
566 /*
567  * Calculate the agaw for each iommu.
568  * "SAGAW" may differ across iommus, so use a default agaw and fall back to
569  * a smaller supported agaw for iommus that don't support the default agaw.
570  */
571 int iommu_calculate_agaw(struct intel_iommu *iommu)
572 {
573         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
574 }
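/*
 * A worked example, assuming a hypothetical unit whose SAGAW field has
 * bits 1 and 2 set (3- and 4-level tables): with the default 48-bit
 * domain width, width_to_agaw(48) == 2 and bit 2 is set, so
 * __iommu_calculate_agaw() returns 2 and the domain gets a 4-level page
 * table.  A unit advertising only bit 1 would fall back to agaw 1
 * (3-level, 39-bit).
 */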
575
576 /* This function only returns a single iommu in a domain */
577 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
578 {
579         int iommu_id;
580
581         /* si_domain and vm domain should not get here. */
582         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
583         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
584
585         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
586         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
587                 return NULL;
588
589         return g_iommus[iommu_id];
590 }
591
592 static void domain_update_iommu_coherency(struct dmar_domain *domain)
593 {
594         struct dmar_drhd_unit *drhd;
595         struct intel_iommu *iommu;
596         int i, found = 0;
597
598         domain->iommu_coherency = 1;
599
600         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
601                 found = 1;
602                 if (!ecap_coherent(g_iommus[i]->ecap)) {
603                         domain->iommu_coherency = 0;
604                         break;
605                 }
606         }
607         if (found)
608                 return;
609
610         /* No hardware attached; use lowest common denominator */
611         rcu_read_lock();
612         for_each_active_iommu(iommu, drhd) {
613                 if (!ecap_coherent(iommu->ecap)) {
614                         domain->iommu_coherency = 0;
615                         break;
616                 }
617         }
618         rcu_read_unlock();
619 }
620
621 static void domain_update_iommu_snooping(struct dmar_domain *domain)
622 {
623         int i;
624
625         domain->iommu_snooping = 1;
626
627         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
628                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
629                         domain->iommu_snooping = 0;
630                         break;
631                 }
632         }
633 }
634
635 static void domain_update_iommu_superpage(struct dmar_domain *domain)
636 {
637         struct dmar_drhd_unit *drhd;
638         struct intel_iommu *iommu = NULL;
639         int mask = 0xf;
640
641         if (!intel_iommu_superpage) {
642                 domain->iommu_superpage = 0;
643                 return;
644         }
645
646         /* set iommu_superpage to the smallest common denominator */
647         rcu_read_lock();
648         for_each_active_iommu(iommu, drhd) {
649                 mask &= cap_super_page_val(iommu->cap);
650                 if (!mask) {
651                         break;
652                 }
653         }
654         rcu_read_unlock();
655
656         domain->iommu_superpage = fls(mask);
657 }
658
659 /* Some capabilities may be different across iommus */
660 static void domain_update_iommu_cap(struct dmar_domain *domain)
661 {
662         domain_update_iommu_coherency(domain);
663         domain_update_iommu_snooping(domain);
664         domain_update_iommu_superpage(domain);
665 }
666
667 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
668 {
669         struct dmar_drhd_unit *drhd = NULL;
670         struct intel_iommu *iommu;
671         struct device *tmp;
672         struct pci_dev *ptmp, *pdev = NULL;
673         u16 segment;
674         int i;
675
676         if (dev_is_pci(dev)) {
677                 pdev = to_pci_dev(dev);
678                 segment = pci_domain_nr(pdev->bus);
679         } else if (ACPI_COMPANION(dev))
680                 dev = &ACPI_COMPANION(dev)->dev;
681
682         rcu_read_lock();
683         for_each_active_iommu(iommu, drhd) {
684                 if (pdev && segment != drhd->segment)
685                         continue;
686
687                 for_each_active_dev_scope(drhd->devices,
688                                           drhd->devices_cnt, i, tmp) {
689                         if (tmp == dev) {
690                                 *bus = drhd->devices[i].bus;
691                                 *devfn = drhd->devices[i].devfn;
692                                 goto out;
693                         }
694
695                         if (!pdev || !dev_is_pci(tmp))
696                                 continue;
697
698                         ptmp = to_pci_dev(tmp);
699                         if (ptmp->subordinate &&
700                             ptmp->subordinate->number <= pdev->bus->number &&
701                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
702                                 goto got_pdev;
703                 }
704
705                 if (pdev && drhd->include_all) {
706                 got_pdev:
707                         *bus = pdev->bus->number;
708                         *devfn = pdev->devfn;
709                         goto out;
710                 }
711         }
712         iommu = NULL;
713  out:
714         rcu_read_unlock();
715
716         return iommu;
717 }
718
719 static void domain_flush_cache(struct dmar_domain *domain,
720                                void *addr, int size)
721 {
722         if (!domain->iommu_coherency)
723                 clflush_cache_range(addr, size);
724 }
725
726 /* Gets context entry for a given bus and devfn */
727 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
728                 u8 bus, u8 devfn)
729 {
730         struct root_entry *root;
731         struct context_entry *context;
732         unsigned long phy_addr;
733         unsigned long flags;
734
735         spin_lock_irqsave(&iommu->lock, flags);
736         root = &iommu->root_entry[bus];
737         context = get_context_addr_from_root(root);
738         if (!context) {
739                 context = (struct context_entry *)
740                                 alloc_pgtable_page(iommu->node);
741                 if (!context) {
742                         spin_unlock_irqrestore(&iommu->lock, flags);
743                         return NULL;
744                 }
745                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
746                 phy_addr = virt_to_phys((void *)context);
747                 set_root_value(root, phy_addr);
748                 set_root_present(root);
749                 __iommu_flush_cache(iommu, root, sizeof(*root));
750         }
751         spin_unlock_irqrestore(&iommu->lock, flags);
752         return &context[devfn];
753 }
754
755 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
756 {
757         struct root_entry *root;
758         struct context_entry *context;
759         int ret;
760         unsigned long flags;
761
762         spin_lock_irqsave(&iommu->lock, flags);
763         root = &iommu->root_entry[bus];
764         context = get_context_addr_from_root(root);
765         if (!context) {
766                 ret = 0;
767                 goto out;
768         }
769         ret = context_present(&context[devfn]);
770 out:
771         spin_unlock_irqrestore(&iommu->lock, flags);
772         return ret;
773 }
774
775 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
776 {
777         struct root_entry *root;
778         struct context_entry *context;
779         unsigned long flags;
780
781         spin_lock_irqsave(&iommu->lock, flags);
782         root = &iommu->root_entry[bus];
783         context = get_context_addr_from_root(root);
784         if (context) {
785                 context_clear_entry(&context[devfn]);
786                 __iommu_flush_cache(iommu, &context[devfn], \
787                         sizeof(*context));
788         }
789         spin_unlock_irqrestore(&iommu->lock, flags);
790 }
791
792 static void free_context_table(struct intel_iommu *iommu)
793 {
794         struct root_entry *root;
795         int i;
796         unsigned long flags;
797         struct context_entry *context;
798
799         spin_lock_irqsave(&iommu->lock, flags);
800         if (!iommu->root_entry) {
801                 goto out;
802         }
803         for (i = 0; i < ROOT_ENTRY_NR; i++) {
804                 root = &iommu->root_entry[i];
805                 context = get_context_addr_from_root(root);
806                 if (context)
807                         free_pgtable_page(context);
808         }
809         free_pgtable_page(iommu->root_entry);
810         iommu->root_entry = NULL;
811 out:
812         spin_unlock_irqrestore(&iommu->lock, flags);
813 }
814
815 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
816                                       unsigned long pfn, int *target_level)
817 {
818         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
819         struct dma_pte *parent, *pte = NULL;
820         int level = agaw_to_level(domain->agaw);
821         int offset;
822
823         BUG_ON(!domain->pgd);
824
825         if (addr_width < BITS_PER_LONG && pfn >> addr_width)
826                 /* Address beyond IOMMU's addressing capabilities. */
827                 return NULL;
828
829         parent = domain->pgd;
830
831         while (1) {
832                 void *tmp_page;
833
834                 offset = pfn_level_offset(pfn, level);
835                 pte = &parent[offset];
836                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
837                         break;
838                 if (level == *target_level)
839                         break;
840
841                 if (!dma_pte_present(pte)) {
842                         uint64_t pteval;
843
844                         tmp_page = alloc_pgtable_page(domain->nid);
845
846                         if (!tmp_page)
847                                 return NULL;
848
849                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
850                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
851                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
852                                 /* Someone else set it while we were thinking; use theirs. */
853                                 free_pgtable_page(tmp_page);
854                         } else {
855                                 dma_pte_addr(pte);
856                                 domain_flush_cache(domain, pte, sizeof(*pte));
857                         }
858                 }
859                 if (level == 1)
860                         break;
861
862                 parent = phys_to_virt(dma_pte_addr(pte));
863                 level--;
864         }
865
866         if (!*target_level)
867                 *target_level = level;
868
869         return pte;
870 }
871
872
873 /* return the pte for an address at a specific level */
874 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
875                                          unsigned long pfn,
876                                          int level, int *large_page)
877 {
878         struct dma_pte *parent, *pte = NULL;
879         int total = agaw_to_level(domain->agaw);
880         int offset;
881
882         parent = domain->pgd;
883         while (level <= total) {
884                 offset = pfn_level_offset(pfn, total);
885                 pte = &parent[offset];
886                 if (level == total)
887                         return pte;
888
889                 if (!dma_pte_present(pte)) {
890                         *large_page = total;
891                         break;
892                 }
893
894                 if (pte->val & DMA_PTE_LARGE_PAGE) {
895                         *large_page = total;
896                         return pte;
897                 }
898
899                 parent = phys_to_virt(dma_pte_addr(pte));
900                 total--;
901         }
902         return NULL;
903 }
904
905 /* clear last level pte; a tlb flush should follow */
906 static void dma_pte_clear_range(struct dmar_domain *domain,
907                                 unsigned long start_pfn,
908                                 unsigned long last_pfn)
909 {
910         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
911         unsigned int large_page = 1;
912         struct dma_pte *first_pte, *pte;
913
914         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
915         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
916         BUG_ON(start_pfn > last_pfn);
917
918         /* we don't need lock here; nobody else touches the iova range */
919         do {
920                 large_page = 1;
921                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
922                 if (!pte) {
923                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
924                         continue;
925                 }
926                 do {
927                         dma_clear_pte(pte);
928                         start_pfn += lvl_to_nr_pages(large_page);
929                         pte++;
930                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
931
932                 domain_flush_cache(domain, first_pte,
933                                    (void *)pte - (void *)first_pte);
934
935         } while (start_pfn && start_pfn <= last_pfn);
936 }
937
938 static void dma_pte_free_level(struct dmar_domain *domain, int level,
939                                struct dma_pte *pte, unsigned long pfn,
940                                unsigned long start_pfn, unsigned long last_pfn)
941 {
942         pfn = max(start_pfn, pfn);
943         pte = &pte[pfn_level_offset(pfn, level)];
944
945         do {
946                 unsigned long level_pfn;
947                 struct dma_pte *level_pte;
948
949                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
950                         goto next;
951
952                 level_pfn = pfn & level_mask(level - 1);
953                 level_pte = phys_to_virt(dma_pte_addr(pte));
954
955                 if (level > 2)
956                         dma_pte_free_level(domain, level - 1, level_pte,
957                                            level_pfn, start_pfn, last_pfn);
958
959                 /* If range covers entire pagetable, free it */
960                 if (!(start_pfn > level_pfn ||
961                       last_pfn < level_pfn + level_size(level) - 1)) {
962                         dma_clear_pte(pte);
963                         domain_flush_cache(domain, pte, sizeof(*pte));
964                         free_pgtable_page(level_pte);
965                 }
966 next:
967                 pfn += level_size(level);
968         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
969 }
970
971 /* free page table pages. last level pte should already be cleared */
972 static void dma_pte_free_pagetable(struct dmar_domain *domain,
973                                    unsigned long start_pfn,
974                                    unsigned long last_pfn)
975 {
976         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
977
978         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
979         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
980         BUG_ON(start_pfn > last_pfn);
981
982         /* We don't need lock here; nobody else touches the iova range */
983         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
984                            domain->pgd, 0, start_pfn, last_pfn);
985
986         /* free pgd */
987         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
988                 free_pgtable_page(domain->pgd);
989                 domain->pgd = NULL;
990         }
991 }
992
993 /* When a page at a given level is being unlinked from its parent, we don't
994    need to *modify* it at all. All we need to do is make a list of all the
995    pages which can be freed just as soon as we've flushed the IOTLB and we
996    know the hardware page-walk will no longer touch them.
997    The 'pte' argument is the *parent* PTE, pointing to the page that is to
998    be freed. */
999 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1000                                             int level, struct dma_pte *pte,
1001                                             struct page *freelist)
1002 {
1003         struct page *pg;
1004
1005         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1006         pg->freelist = freelist;
1007         freelist = pg;
1008
1009         if (level == 1)
1010                 return freelist;
1011
1012         pte = page_address(pg);
1013         do {
1014                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1015                         freelist = dma_pte_list_pagetables(domain, level - 1,
1016                                                            pte, freelist);
1017                 pte++;
1018         } while (!first_pte_in_page(pte));
1019
1020         return freelist;
1021 }
1022
1023 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1024                                         struct dma_pte *pte, unsigned long pfn,
1025                                         unsigned long start_pfn,
1026                                         unsigned long last_pfn,
1027                                         struct page *freelist)
1028 {
1029         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1030
1031         pfn = max(start_pfn, pfn);
1032         pte = &pte[pfn_level_offset(pfn, level)];
1033
1034         do {
1035                 unsigned long level_pfn;
1036
1037                 if (!dma_pte_present(pte))
1038                         goto next;
1039
1040                 level_pfn = pfn & level_mask(level);
1041
1042                 /* If range covers entire pagetable, free it */
1043                 if (start_pfn <= level_pfn &&
1044                     last_pfn >= level_pfn + level_size(level) - 1) {
1045                         /* These subordinate page tables are going away entirely. Don't
1046                            bother to clear them; we're just going to *free* them. */
1047                         if (level > 1 && !dma_pte_superpage(pte))
1048                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1049
1050                         dma_clear_pte(pte);
1051                         if (!first_pte)
1052                                 first_pte = pte;
1053                         last_pte = pte;
1054                 } else if (level > 1) {
1055                         /* Recurse down into a level that isn't *entirely* obsolete */
1056                         freelist = dma_pte_clear_level(domain, level - 1,
1057                                                        phys_to_virt(dma_pte_addr(pte)),
1058                                                        level_pfn, start_pfn, last_pfn,
1059                                                        freelist);
1060                 }
1061 next:
1062                 pfn += level_size(level);
1063         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1064
1065         if (first_pte)
1066                 domain_flush_cache(domain, first_pte,
1067                                    (void *)++last_pte - (void *)first_pte);
1068
1069         return freelist;
1070 }
1071
1072 /* We can't just free the pages because the IOMMU may still be walking
1073    the page tables, and may have cached the intermediate levels. The
1074    pages can only be freed after the IOTLB flush has been done. */
1075 struct page *domain_unmap(struct dmar_domain *domain,
1076                           unsigned long start_pfn,
1077                           unsigned long last_pfn)
1078 {
1079         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1080         struct page *freelist = NULL;
1081
1082         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
1083         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
1084         BUG_ON(start_pfn > last_pfn);
1085
1086         /* we don't need lock here; nobody else touches the iova range */
1087         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1088                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1089
1090         /* free pgd */
1091         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1092                 struct page *pgd_page = virt_to_page(domain->pgd);
1093                 pgd_page->freelist = freelist;
1094                 freelist = pgd_page;
1095
1096                 domain->pgd = NULL;
1097         }
1098
1099         return freelist;
1100 }
1101
1102 void dma_free_pagelist(struct page *freelist)
1103 {
1104         struct page *pg;
1105
1106         while ((pg = freelist)) {
1107                 freelist = pg->freelist;
1108                 free_pgtable_page(page_address(pg));
1109         }
1110 }
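/*
 * A minimal sketch of the intended lifecycle, assuming "iommu" is a unit
 * serving the domain and that the caller performs the IOTLB flush in
 * between (as the comment above domain_unmap() requires).  The freelist
 * must always reach dma_free_pagelist(), otherwise the page-table pages
 * are leaked:
 *
 *      struct page *freelist;
 *
 *      freelist = domain_unmap(domain, start_pfn, last_pfn);
 *      iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
 *                            last_pfn - start_pfn + 1, 0, 0);
 *      dma_free_pagelist(freelist);
 */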
1111
1112 /* iommu handling */
1113 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1114 {
1115         struct root_entry *root;
1116         unsigned long flags;
1117
1118         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1119         if (!root)
1120                 return -ENOMEM;
1121
1122         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1123
1124         spin_lock_irqsave(&iommu->lock, flags);
1125         iommu->root_entry = root;
1126         spin_unlock_irqrestore(&iommu->lock, flags);
1127
1128         return 0;
1129 }
1130
1131 static void iommu_set_root_entry(struct intel_iommu *iommu)
1132 {
1133         void *addr;
1134         u32 sts;
1135         unsigned long flag;
1136
1137         addr = iommu->root_entry;
1138
1139         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1140         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
1141
1142         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1143
1144         /* Make sure hardware completes it */
1145         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1146                       readl, (sts & DMA_GSTS_RTPS), sts);
1147
1148         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1149 }
1150
1151 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1152 {
1153         u32 val;
1154         unsigned long flag;
1155
1156         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1157                 return;
1158
1159         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1160         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1161
1162         /* Make sure hardware completes it */
1163         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1164                       readl, (!(val & DMA_GSTS_WBFS)), val);
1165
1166         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1167 }
1168
1169 /* return value determines if we need a write buffer flush */
1170 static void __iommu_flush_context(struct intel_iommu *iommu,
1171                                   u16 did, u16 source_id, u8 function_mask,
1172                                   u64 type)
1173 {
1174         u64 val = 0;
1175         unsigned long flag;
1176
1177         switch (type) {
1178         case DMA_CCMD_GLOBAL_INVL:
1179                 val = DMA_CCMD_GLOBAL_INVL;
1180                 break;
1181         case DMA_CCMD_DOMAIN_INVL:
1182                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1183                 break;
1184         case DMA_CCMD_DEVICE_INVL:
1185                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1186                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1187                 break;
1188         default:
1189                 BUG();
1190         }
1191         val |= DMA_CCMD_ICC;
1192
1193         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1194         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1195
1196         /* Make sure hardware completes it */
1197         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1198                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1199
1200         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1201 }
1202
1203 /* return value determines if we need a write buffer flush */
1204 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1205                                 u64 addr, unsigned int size_order, u64 type)
1206 {
1207         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1208         u64 val = 0, val_iva = 0;
1209         unsigned long flag;
1210
1211         switch (type) {
1212         case DMA_TLB_GLOBAL_FLUSH:
1213                 /* global flush doesn't need to set IVA_REG */
1214                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1215                 break;
1216         case DMA_TLB_DSI_FLUSH:
1217                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1218                 break;
1219         case DMA_TLB_PSI_FLUSH:
1220                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1221                 /* IH bit is passed in as part of address */
1222                 val_iva = size_order | addr;
1223                 break;
1224         default:
1225                 BUG();
1226         }
1227         /* Note: set drain read/write */
1228 #if 0
1229         /*
1230          * This is probably just to be extra safe; it looks like we can
1231          * ignore it without any impact.
1232          */
1233         if (cap_read_drain(iommu->cap))
1234                 val |= DMA_TLB_READ_DRAIN;
1235 #endif
1236         if (cap_write_drain(iommu->cap))
1237                 val |= DMA_TLB_WRITE_DRAIN;
1238
1239         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1240         /* Note: Only uses first TLB reg currently */
1241         if (val_iva)
1242                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1243         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1244
1245         /* Make sure hardware completes it */
1246         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1247                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1248
1249         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1250
1251         /* check IOTLB invalidation granularity */
1252         if (DMA_TLB_IAIG(val) == 0)
1253                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1254         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1255                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1256                         (unsigned long long)DMA_TLB_IIRG(type),
1257                         (unsigned long long)DMA_TLB_IAIG(val));
1258 }
1259
1260 static struct device_domain_info *
1261 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1262                          u8 bus, u8 devfn)
1263 {
1264         int found = 0;
1265         unsigned long flags;
1266         struct device_domain_info *info;
1267         struct pci_dev *pdev;
1268
1269         if (!ecap_dev_iotlb_support(iommu->ecap))
1270                 return NULL;
1271
1272         if (!iommu->qi)
1273                 return NULL;
1274
1275         spin_lock_irqsave(&device_domain_lock, flags);
1276         list_for_each_entry(info, &domain->devices, link)
1277                 if (info->bus == bus && info->devfn == devfn) {
1278                         found = 1;
1279                         break;
1280                 }
1281         spin_unlock_irqrestore(&device_domain_lock, flags);
1282
1283         if (!found || !info->dev || !dev_is_pci(info->dev))
1284                 return NULL;
1285
1286         pdev = to_pci_dev(info->dev);
1287
1288         if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1289                 return NULL;
1290
1291         if (!dmar_find_matched_atsr_unit(pdev))
1292                 return NULL;
1293
1294         return info;
1295 }
1296
1297 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1298 {
1299         if (!info || !dev_is_pci(info->dev))
1300                 return;
1301
1302         pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1303 }
1304
1305 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1306 {
1307         if (!info->dev || !dev_is_pci(info->dev) ||
1308             !pci_ats_enabled(to_pci_dev(info->dev)))
1309                 return;
1310
1311         pci_disable_ats(to_pci_dev(info->dev));
1312 }
1313
1314 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1315                                   u64 addr, unsigned mask)
1316 {
1317         u16 sid, qdep;
1318         unsigned long flags;
1319         struct device_domain_info *info;
1320
1321         spin_lock_irqsave(&device_domain_lock, flags);
1322         list_for_each_entry(info, &domain->devices, link) {
1323                 struct pci_dev *pdev;
1324                 if (!info->dev || !dev_is_pci(info->dev))
1325                         continue;
1326
1327                 pdev = to_pci_dev(info->dev);
1328                 if (!pci_ats_enabled(pdev))
1329                         continue;
1330
1331                 sid = info->bus << 8 | info->devfn;
1332                 qdep = pci_ats_queue_depth(pdev);
1333                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1334         }
1335         spin_unlock_irqrestore(&device_domain_lock, flags);
1336 }
1337
1338 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1339                                   unsigned long pfn, unsigned int pages, int ih, int map)
1340 {
1341         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1342         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1343
1344         BUG_ON(pages == 0);
1345
1346         if (ih)
1347                 ih = 1 << 6;
1348         /*
1349          * Fall back to domain-selective flush if there is no PSI support or the size is
1350          * too big.
1351          * PSI requires page size to be 2 ^ x, and the base address is naturally
1352          * aligned to the size
1353          */
1354         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1355                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1356                                                 DMA_TLB_DSI_FLUSH);
1357         else
1358                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1359                                                 DMA_TLB_PSI_FLUSH);
1360
1361         /*
1362          * In caching mode, changes of pages from non-present to present require
1363          * flush. However, device IOTLB doesn't need to be flushed in this case.
1364          */
1365         if (!cap_caching_mode(iommu->cap) || !map)
1366                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1367 }
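/*
 * A short worked example of the mask calculation above: flushing 9 pages
 * rounds up to 16, so mask == ilog2(16) == 4 and the hardware
 * invalidates a naturally aligned 16-page (64KiB) region; a single page
 * gives mask == 0.
 */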
1368
1369 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1370 {
1371         u32 pmen;
1372         unsigned long flags;
1373
1374         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1375         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1376         pmen &= ~DMA_PMEN_EPM;
1377         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1378
1379         /* wait for the protected region status bit to clear */
1380         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1381                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1382
1383         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1384 }
1385
1386 static int iommu_enable_translation(struct intel_iommu *iommu)
1387 {
1388         u32 sts;
1389         unsigned long flags;
1390
1391         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1392         iommu->gcmd |= DMA_GCMD_TE;
1393         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1394
1395         /* Make sure hardware completes it */
1396         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1397                       readl, (sts & DMA_GSTS_TES), sts);
1398
1399         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1400         return 0;
1401 }
1402
1403 static int iommu_disable_translation(struct intel_iommu *iommu)
1404 {
1405         u32 sts;
1406         unsigned long flag;
1407
1408         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1409         iommu->gcmd &= ~DMA_GCMD_TE;
1410         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1411
1412         /* Make sure hardware completes it */
1413         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1414                       readl, (!(sts & DMA_GSTS_TES)), sts);
1415
1416         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1417         return 0;
1418 }
1419
1420
1421 static int iommu_init_domains(struct intel_iommu *iommu)
1422 {
1423         unsigned long ndomains;
1424         unsigned long nlongs;
1425
1426         ndomains = cap_ndoms(iommu->cap);
1427         pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1428                  iommu->seq_id, ndomains);
1429         nlongs = BITS_TO_LONGS(ndomains);
1430
1431         spin_lock_init(&iommu->lock);
1432
1433         /* TBD: there might be 64K domains,
1434          * consider other allocation for future chip
1435          */
1436         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1437         if (!iommu->domain_ids) {
1438                 pr_err("IOMMU%d: allocating domain id array failed\n",
1439                        iommu->seq_id);
1440                 return -ENOMEM;
1441         }
1442         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1443                         GFP_KERNEL);
1444         if (!iommu->domains) {
1445                 pr_err("IOMMU%d: allocating domain array failed\n",
1446                        iommu->seq_id);
1447                 kfree(iommu->domain_ids);
1448                 iommu->domain_ids = NULL;
1449                 return -ENOMEM;
1450         }
1451
1452         /*
1453          * If caching mode is set, then invalid translations are tagged
1454          * with domain id 0, hence we need to pre-allocate it.
1455          */
1456         if (cap_caching_mode(iommu->cap))
1457                 set_bit(0, iommu->domain_ids);
1458         return 0;
1459 }
1460
1461 static void free_dmar_iommu(struct intel_iommu *iommu)
1462 {
1463         struct dmar_domain *domain;
1464         int i, count;
1465         unsigned long flags;
1466
1467         if ((iommu->domains) && (iommu->domain_ids)) {
1468                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1469                         /*
1470                          * Domain id 0 is reserved for invalid translation
1471                          * if hardware supports caching mode.
1472                          */
1473                         if (cap_caching_mode(iommu->cap) && i == 0)
1474                                 continue;
1475
1476                         domain = iommu->domains[i];
1477                         clear_bit(i, iommu->domain_ids);
1478
1479                         spin_lock_irqsave(&domain->iommu_lock, flags);
1480                         count = --domain->iommu_count;
1481                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1482                         if (count == 0)
1483                                 domain_exit(domain);
1484                 }
1485         }
1486
1487         if (iommu->gcmd & DMA_GCMD_TE)
1488                 iommu_disable_translation(iommu);
1489
1490         kfree(iommu->domains);
1491         kfree(iommu->domain_ids);
1492         iommu->domains = NULL;
1493         iommu->domain_ids = NULL;
1494
1495         g_iommus[iommu->seq_id] = NULL;
1496
1497         /* free context mapping */
1498         free_context_table(iommu);
1499 }
1500
1501 static struct dmar_domain *alloc_domain(bool vm)
1502 {
1503         /* domain id for virtual machines; it won't be set in a context entry */
1504         static atomic_t vm_domid = ATOMIC_INIT(0);
1505         struct dmar_domain *domain;
1506
1507         domain = alloc_domain_mem();
1508         if (!domain)
1509                 return NULL;
1510
1511         domain->nid = -1;
1512         domain->iommu_count = 0;
1513         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1514         domain->flags = 0;
1515         spin_lock_init(&domain->iommu_lock);
1516         INIT_LIST_HEAD(&domain->devices);
1517         if (vm) {
1518                 domain->id = atomic_inc_return(&vm_domid);
1519                 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
1520         }
1521
1522         return domain;
1523 }
1524
1525 static int iommu_attach_domain(struct dmar_domain *domain,
1526                                struct intel_iommu *iommu)
1527 {
1528         int num;
1529         unsigned long ndomains;
1530         unsigned long flags;
1531
1532         ndomains = cap_ndoms(iommu->cap);
1533
1534         spin_lock_irqsave(&iommu->lock, flags);
1535
1536         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1537         if (num >= ndomains) {
1538                 spin_unlock_irqrestore(&iommu->lock, flags);
1539                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1540                 return -ENOMEM;
1541         }
1542
1543         domain->id = num;
1544         domain->iommu_count++;
1545         set_bit(num, iommu->domain_ids);
1546         set_bit(iommu->seq_id, domain->iommu_bmp);
1547         iommu->domains[num] = domain;
1548         spin_unlock_irqrestore(&iommu->lock, flags);
1549
1550         return 0;
1551 }
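
/*
 * Example of the id allocation above (illustrative): with cap_ndoms() == 256
 * and Caching Mode set, bit 0 was already reserved by iommu_init_domains(),
 * so the first domain attached to this iommu typically receives id 1; without
 * Caching Mode it would receive id 0.
 */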
1552
1553 static void iommu_detach_domain(struct dmar_domain *domain,
1554                                 struct intel_iommu *iommu)
1555 {
1556         unsigned long flags;
1557         int num, ndomains;
1558
1559         spin_lock_irqsave(&iommu->lock, flags);
1560         ndomains = cap_ndoms(iommu->cap);
1561         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1562                 if (iommu->domains[num] == domain) {
1563                         clear_bit(num, iommu->domain_ids);
1564                         iommu->domains[num] = NULL;
1565                         break;
1566                 }
1567         }
1568         spin_unlock_irqrestore(&iommu->lock, flags);
1569 }
1570
1571 static struct iova_domain reserved_iova_list;
1572 static struct lock_class_key reserved_rbtree_key;
1573
1574 static int dmar_init_reserved_ranges(void)
1575 {
1576         struct pci_dev *pdev = NULL;
1577         struct iova *iova;
1578         int i;
1579
1580         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1581
1582         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1583                 &reserved_rbtree_key);
1584
1585         /* IOAPIC ranges shouldn't be accessed by DMA */
1586         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1587                 IOVA_PFN(IOAPIC_RANGE_END));
1588         if (!iova) {
1589                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1590                 return -ENODEV;
1591         }
1592
1593         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1594         for_each_pci_dev(pdev) {
1595                 struct resource *r;
1596
1597                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1598                         r = &pdev->resource[i];
1599                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1600                                 continue;
1601                         iova = reserve_iova(&reserved_iova_list,
1602                                             IOVA_PFN(r->start),
1603                                             IOVA_PFN(r->end));
1604                         if (!iova) {
1605                                 printk(KERN_ERR "Reserve iova failed\n");
1606                                 return -ENODEV;
1607                         }
1608                 }
1609         }
1610         return 0;
1611 }
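
/*
 * Net effect (illustrative): the IOAPIC window 0xfee00000-0xfeefffff and
 * every memory BAR of every PCI device (say, a graphics aperture at
 * 0xd0000000; the exact addresses are machine specific) end up in
 * reserved_iova_list, and domain_reserve_special_ranges() below copies them
 * into each new domain so the iova allocator never hands out those addresses
 * for DMA.
 */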
1612
1613 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1614 {
1615         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1616 }
1617
1618 static inline int guestwidth_to_adjustwidth(int gaw)
1619 {
1620         int agaw;
1621         int r = (gaw - 12) % 9;
1622
1623         if (r == 0)
1624                 agaw = gaw;
1625         else
1626                 agaw = gaw + 9 - r;
1627         if (agaw > 64)
1628                 agaw = 64;
1629         return agaw;
1630 }
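
/*
 * Worked example of the rounding above (arithmetic only): a guest width of
 * 48 bits gives r = (48 - 12) % 9 = 0, so the adjusted width stays 48; a
 * guest width of 40 bits gives r = 1, so the adjusted width becomes
 * 40 + 9 - 1 = 48.  The width is rounded up to the next value that whole
 * 9-bit page-table levels above the 12-bit page offset can cover, capped
 * at 64.
 */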
1631
1632 static int domain_init(struct dmar_domain *domain, int guest_width)
1633 {
1634         struct intel_iommu *iommu;
1635         int adjust_width, agaw;
1636         unsigned long sagaw;
1637
1638         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1639         domain_reserve_special_ranges(domain);
1640
1641         /* calculate AGAW */
1642         iommu = domain_get_iommu(domain);
1643         if (guest_width > cap_mgaw(iommu->cap))
1644                 guest_width = cap_mgaw(iommu->cap);
1645         domain->gaw = guest_width;
1646         adjust_width = guestwidth_to_adjustwidth(guest_width);
1647         agaw = width_to_agaw(adjust_width);
1648         sagaw = cap_sagaw(iommu->cap);
1649         if (!test_bit(agaw, &sagaw)) {
1650                 /* hardware doesn't support it, choose a bigger one */
1651                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1652                 agaw = find_next_bit(&sagaw, 5, agaw);
1653                 if (agaw >= 5)
1654                         return -ENODEV;
1655         }
1656         domain->agaw = agaw;
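        /*
         * For reference (assuming the usual VT-d SAGAW encoding, where value
         * 1 means 39-bit/3-level and value 2 means 48-bit/4-level tables):
         * an adjusted width of 48 bits yields agaw 2; if bit 2 of the
         * hardware's SAGAW field is clear, the next set bit (e.g. 3, for
         * 57-bit/5-level tables) is chosen above instead.
         */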
1657
1658         if (ecap_coherent(iommu->ecap))
1659                 domain->iommu_coherency = 1;
1660         else
1661                 domain->iommu_coherency = 0;
1662
1663         if (ecap_sc_support(iommu->ecap))
1664                 domain->iommu_snooping = 1;
1665         else
1666                 domain->iommu_snooping = 0;
1667
1668         if (intel_iommu_superpage)
1669                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1670         else
1671                 domain->iommu_superpage = 0;
1672
1673         domain->nid = iommu->node;
1674
1675         /* always allocate the top pgd */
1676         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1677         if (!domain->pgd)
1678                 return -ENOMEM;
1679         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1680         return 0;
1681 }
1682
1683 static void domain_exit(struct dmar_domain *domain)
1684 {
1685         struct dmar_drhd_unit *drhd;
1686         struct intel_iommu *iommu;
1687         struct page *freelist = NULL;
1688
1689         /* Domain 0 is reserved, so don't process it */
1690         if (!domain)
1691                 return;
1692
1693         /* Flush any lazy unmaps that may reference this domain */
1694         if (!intel_iommu_strict)
1695                 flush_unmaps_timeout(0);
1696
1697         /* remove associated devices */
1698         domain_remove_dev_info(domain);
1699
1700         /* destroy iovas */
1701         put_iova_domain(&domain->iovad);
1702
1703         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1704
1705         /* clear attached or cached domains */
1706         rcu_read_lock();
1707         for_each_active_iommu(iommu, drhd)
1708                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1709                     test_bit(iommu->seq_id, domain->iommu_bmp))
1710                         iommu_detach_domain(domain, iommu);
1711         rcu_read_unlock();
1712
1713         dma_free_pagelist(freelist);
1714
1715         free_domain_mem(domain);
1716 }
1717
1718 static int domain_context_mapping_one(struct dmar_domain *domain,
1719                                       struct intel_iommu *iommu,
1720                                       u8 bus, u8 devfn, int translation)
1721 {
1722         struct context_entry *context;
1723         unsigned long flags;
1724         struct dma_pte *pgd;
1725         unsigned long num;
1726         unsigned long ndomains;
1727         int id;
1728         int agaw;
1729         struct device_domain_info *info = NULL;
1730
1731         pr_debug("Set context mapping for %02x:%02x.%d\n",
1732                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1733
1734         BUG_ON(!domain->pgd);
1735         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1736                translation != CONTEXT_TT_MULTI_LEVEL);
1737
1738         context = device_to_context_entry(iommu, bus, devfn);
1739         if (!context)
1740                 return -ENOMEM;
1741         spin_lock_irqsave(&iommu->lock, flags);
1742         if (context_present(context)) {
1743                 spin_unlock_irqrestore(&iommu->lock, flags);
1744                 return 0;
1745         }
1746
1747         id = domain->id;
1748         pgd = domain->pgd;
1749
1750         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1751             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1752                 int found = 0;
1753
1754                 /* find an available domain id for this device in iommu */
1755                 ndomains = cap_ndoms(iommu->cap);
1756                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1757                         if (iommu->domains[num] == domain) {
1758                                 id = num;
1759                                 found = 1;
1760                                 break;
1761                         }
1762                 }
1763
1764                 if (found == 0) {
1765                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1766                         if (num >= ndomains) {
1767                                 spin_unlock_irqrestore(&iommu->lock, flags);
1768                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1769                                 return -EFAULT;
1770                         }
1771
1772                         set_bit(num, iommu->domain_ids);
1773                         iommu->domains[num] = domain;
1774                         id = num;
1775                 }
1776
1777                 /* Skip top levels of page tables for an iommu
1778                  * which has a smaller agaw than the default.
1779                  * Unnecessary for PT mode.
1780                  */
1781                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1782                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1783                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1784                                 if (!dma_pte_present(pgd)) {
1785                                         spin_unlock_irqrestore(&iommu->lock, flags);
1786                                         return -ENOMEM;
1787                                 }
1788                         }
1789                 }
1790         }
1791
1792         context_set_domain_id(context, id);
1793
1794         if (translation != CONTEXT_TT_PASS_THROUGH) {
1795                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1796                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1797                                      CONTEXT_TT_MULTI_LEVEL;
1798         }
1799         /*
1800          * In pass-through mode, AW must be programmed to indicate the largest
1801          * AGAW value supported by hardware, and ASR is ignored by hardware.
1802          */
1803         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1804                 context_set_address_width(context, iommu->msagaw);
1805         else {
1806                 context_set_address_root(context, virt_to_phys(pgd));
1807                 context_set_address_width(context, iommu->agaw);
1808         }
1809
1810         context_set_translation_type(context, translation);
1811         context_set_fault_enable(context);
1812         context_set_present(context);
1813         domain_flush_cache(domain, context, sizeof(*context));
1814
1815         /*
1816          * It's a non-present to present mapping. If hardware doesn't cache
1817          * non-present entries we only need to flush the write-buffer. If it
1818          * _does_ cache non-present entries, then it does so in the special
1819          * domain #0, which we have to flush:
1820          */
1821         if (cap_caching_mode(iommu->cap)) {
1822                 iommu->flush.flush_context(iommu, 0,
1823                                            (((u16)bus) << 8) | devfn,
1824                                            DMA_CCMD_MASK_NOBIT,
1825                                            DMA_CCMD_DEVICE_INVL);
1826                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1827         } else {
1828                 iommu_flush_write_buffer(iommu);
1829         }
1830         iommu_enable_dev_iotlb(info);
1831         spin_unlock_irqrestore(&iommu->lock, flags);
1832
1833         spin_lock_irqsave(&domain->iommu_lock, flags);
1834         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1835                 domain->iommu_count++;
1836                 if (domain->iommu_count == 1)
1837                         domain->nid = iommu->node;
1838                 domain_update_iommu_cap(domain);
1839         }
1840         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1841         return 0;
1842 }
1843
1844 static int
1845 domain_context_mapping(struct dmar_domain *domain, struct device *dev,
1846                        int translation)
1847 {
1848         int ret;
1849         struct pci_dev *pdev, *tmp, *parent;
1850         struct intel_iommu *iommu;
1851         u8 bus, devfn;
1852
1853         iommu = device_to_iommu(dev, &bus, &devfn);
1854         if (!iommu)
1855                 return -ENODEV;
1856
1857         ret = domain_context_mapping_one(domain, iommu, bus, devfn,
1858                                          translation);
1859         if (ret || !dev_is_pci(dev))
1860                 return ret;
1861
1862         /* dependent device mapping */
1863         pdev = to_pci_dev(dev);
1864         tmp = pci_find_upstream_pcie_bridge(pdev);
1865         if (!tmp)
1866                 return 0;
1867         /* Secondary interface's bus number and devfn 0 */
1868         parent = pdev->bus->self;
1869         while (parent != tmp) {
1870                 ret = domain_context_mapping_one(domain, iommu,
1871                                                  parent->bus->number,
1872                                                  parent->devfn, translation);
1873                 if (ret)
1874                         return ret;
1875                 parent = parent->bus->self;
1876         }
1877         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1878                 return domain_context_mapping_one(domain, iommu,
1879                                         tmp->subordinate->number, 0,
1880                                         translation);
1881         else /* this is a legacy PCI bridge */
1882                 return domain_context_mapping_one(domain, iommu,
1883                                                   tmp->bus->number,
1884                                                   tmp->devfn,
1885                                                   translation);
1886 }
1887
1888 static int domain_context_mapped(struct device *dev)
1889 {
1890         int ret;
1891         struct pci_dev *pdev, *tmp, *parent;
1892         struct intel_iommu *iommu;
1893         u8 bus, devfn;
1894
1895         iommu = device_to_iommu(dev, &bus, &devfn);
1896         if (!iommu)
1897                 return -ENODEV;
1898
1899         ret = device_context_mapped(iommu, bus, devfn);
1900         if (!ret || !dev_is_pci(dev))
1901                 return ret;
1902
1903         /* dependent device mapping */
1904         pdev = to_pci_dev(dev);
1905         tmp = pci_find_upstream_pcie_bridge(pdev);
1906         if (!tmp)
1907                 return ret;
1908         /* Secondary interface's bus number and devfn 0 */
1909         parent = pdev->bus->self;
1910         while (parent != tmp) {
1911                 ret = device_context_mapped(iommu, parent->bus->number,
1912                                             parent->devfn);
1913                 if (!ret)
1914                         return ret;
1915                 parent = parent->bus->self;
1916         }
1917         if (pci_is_pcie(tmp))
1918                 return device_context_mapped(iommu, tmp->subordinate->number,
1919                                              0);
1920         else
1921                 return device_context_mapped(iommu, tmp->bus->number,
1922                                              tmp->devfn);
1923 }
1924
1925 /* Returns a number of VTD pages, but aligned to MM page size */
1926 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1927                                             size_t size)
1928 {
1929         host_addr &= ~PAGE_MASK;
1930         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1931 }
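
/*
 * Worked example (assuming 4KiB pages, i.e. PAGE_SHIFT == VTD_PAGE_SHIFT):
 * a 0x1000-byte buffer starting at offset 0x800 into its first page spans
 * two pages, and aligned_nrpages(0x800, 0x1000) returns
 * PAGE_ALIGN(0x1800) >> 12 == 2.
 */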
1932
1933 /* Return largest possible superpage level for a given mapping */
1934 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1935                                           unsigned long iov_pfn,
1936                                           unsigned long phy_pfn,
1937                                           unsigned long pages)
1938 {
1939         int support, level = 1;
1940         unsigned long pfnmerge;
1941
1942         support = domain->iommu_superpage;
1943
1944         /* To use a large page, the virtual *and* physical addresses
1945            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1946            of them will mean we have to use smaller pages. So just
1947            merge them and check both at once. */
1948         pfnmerge = iov_pfn | phy_pfn;
1949
1950         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1951                 pages >>= VTD_STRIDE_SHIFT;
1952                 if (!pages)
1953                         break;
1954                 pfnmerge >>= VTD_STRIDE_SHIFT;
1955                 level++;
1956                 support--;
1957         }
1958         return level;
1959 }
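
/*
 * Illustrative example (assuming the hardware advertises at least one
 * superpage size, so domain->iommu_superpage >= 1): if both iov_pfn and
 * phy_pfn are 512-page (2MiB) aligned and at least 512 pages are being
 * mapped, the loop above returns level 2 and a 2MiB superpage PTE can be
 * used; any misalignment in either address, or a shorter run of pages,
 * keeps the level at 1 (ordinary 4KiB pages).
 */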
1960
1961 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1962                             struct scatterlist *sg, unsigned long phys_pfn,
1963                             unsigned long nr_pages, int prot)
1964 {
1965         struct dma_pte *first_pte = NULL, *pte = NULL;
1966         phys_addr_t uninitialized_var(pteval);
1967         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1968         unsigned long sg_res;
1969         unsigned int largepage_lvl = 0;
1970         unsigned long lvl_pages = 0;
1971
1972         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1973
1974         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1975                 return -EINVAL;
1976
1977         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1978
1979         if (sg)
1980                 sg_res = 0;
1981         else {
1982                 sg_res = nr_pages + 1;
1983                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1984         }
1985
1986         while (nr_pages > 0) {
1987                 uint64_t tmp;
1988
1989                 if (!sg_res) {
1990                         sg_res = aligned_nrpages(sg->offset, sg->length);
1991                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1992                         sg->dma_length = sg->length;
1993                         pteval = page_to_phys(sg_page(sg)) | prot;
1994                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1995                 }
1996
1997                 if (!pte) {
1998                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1999
2000                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2001                         if (!pte)
2002                                 return -ENOMEM;
2003                         /* It is a large page */
2004                         if (largepage_lvl > 1) {
2005                                 pteval |= DMA_PTE_LARGE_PAGE;
2006                                 /* Ensure that old small page tables are removed to make room
2007                                    for superpage, if they exist. */
2008                                 dma_pte_clear_range(domain, iov_pfn,
2009                                                     iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
2010                                 dma_pte_free_pagetable(domain, iov_pfn,
2011                                                        iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
2012                         } else {
2013                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2014                         }
2015
2016                 }
2017                 /* We don't need a lock here; nobody else
2018                  * touches this iova range.
2019                  */
2020                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2021                 if (tmp) {
2022                         static int dumps = 5;
2023                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2024                                iov_pfn, tmp, (unsigned long long)pteval);
2025                         if (dumps) {
2026                                 dumps--;
2027                                 debug_dma_dump_mappings(NULL);
2028                         }
2029                         WARN_ON(1);
2030                 }
2031
2032                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2033
2034                 BUG_ON(nr_pages < lvl_pages);
2035                 BUG_ON(sg_res < lvl_pages);
2036
2037                 nr_pages -= lvl_pages;
2038                 iov_pfn += lvl_pages;
2039                 phys_pfn += lvl_pages;
2040                 pteval += lvl_pages * VTD_PAGE_SIZE;
2041                 sg_res -= lvl_pages;
2042
2043                 /* If the next PTE would be the first in a new page, then we
2044                    need to flush the cache on the entries we've just written.
2045                    And then we'll need to recalculate 'pte', so clear it and
2046                    let it get set again in the if (!pte) block above.
2047
2048                    If we're done (!nr_pages) we need to flush the cache too.
2049
2050                    Also if we've been setting superpages, we may need to
2051                    recalculate 'pte' and switch back to smaller pages for the
2052                    end of the mapping, if the trailing size is not enough to
2053                    use another superpage (i.e. sg_res < lvl_pages). */
2054                 pte++;
2055                 if (!nr_pages || first_pte_in_page(pte) ||
2056                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2057                         domain_flush_cache(domain, first_pte,
2058                                            (void *)pte - (void *)first_pte);
2059                         pte = NULL;
2060                 }
2061
2062                 if (!sg_res && nr_pages)
2063                         sg = sg_next(sg);
2064         }
2065         return 0;
2066 }
2067
2068 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2069                                     struct scatterlist *sg, unsigned long nr_pages,
2070                                     int prot)
2071 {
2072         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2073 }
2074
2075 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2076                                      unsigned long phys_pfn, unsigned long nr_pages,
2077                                      int prot)
2078 {
2079         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2080 }
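
/*
 * Usage sketch (illustrative only): identity-mapping sixteen 4KiB pages
 * starting at pfn 0x1000 with read/write permission, the pattern
 * iommu_domain_identity_map() uses below, would be
 *
 *      domain_pfn_mapping(domain, 0x1000, 0x1000, 16,
 *                         DMA_PTE_READ|DMA_PTE_WRITE);
 */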
2081
2082 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2083 {
2084         if (!iommu)
2085                 return;
2086
2087         clear_context_table(iommu, bus, devfn);
2088         iommu->flush.flush_context(iommu, 0, 0, 0,
2089                                            DMA_CCMD_GLOBAL_INVL);
2090         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2091 }
2092
2093 static inline void unlink_domain_info(struct device_domain_info *info)
2094 {
2095         assert_spin_locked(&device_domain_lock);
2096         list_del(&info->link);
2097         list_del(&info->global);
2098         if (info->dev)
2099                 info->dev->archdata.iommu = NULL;
2100 }
2101
2102 static void domain_remove_dev_info(struct dmar_domain *domain)
2103 {
2104         struct device_domain_info *info;
2105         unsigned long flags, flags2;
2106
2107         spin_lock_irqsave(&device_domain_lock, flags);
2108         while (!list_empty(&domain->devices)) {
2109                 info = list_entry(domain->devices.next,
2110                         struct device_domain_info, link);
2111                 unlink_domain_info(info);
2112                 spin_unlock_irqrestore(&device_domain_lock, flags);
2113
2114                 iommu_disable_dev_iotlb(info);
2115                 iommu_detach_dev(info->iommu, info->bus, info->devfn);
2116
2117                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
2118                         iommu_detach_dependent_devices(info->iommu, info->dev);
2119                         /* clear this iommu in iommu_bmp, update iommu count
2120                          * and capabilities
2121                          */
2122                         spin_lock_irqsave(&domain->iommu_lock, flags2);
2123                         if (test_and_clear_bit(info->iommu->seq_id,
2124                                                domain->iommu_bmp)) {
2125                                 domain->iommu_count--;
2126                                 domain_update_iommu_cap(domain);
2127                         }
2128                         spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2129                 }
2130
2131                 free_devinfo_mem(info);
2132                 spin_lock_irqsave(&device_domain_lock, flags);
2133         }
2134         spin_unlock_irqrestore(&device_domain_lock, flags);
2135 }
2136
2137 /*
2138  * find_domain
2139  * Note: we use struct device->archdata.iommu to store the info
2140  */
2141 static struct dmar_domain *find_domain(struct device *dev)
2142 {
2143         struct device_domain_info *info;
2144
2145         /* No lock here, assumes no domain exit in normal case */
2146         info = dev->archdata.iommu;
2147         if (info)
2148                 return info->domain;
2149         return NULL;
2150 }
2151
2152 static inline struct device_domain_info *
2153 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2154 {
2155         struct device_domain_info *info;
2156
2157         list_for_each_entry(info, &device_domain_list, global)
2158                 if (info->iommu->segment == segment && info->bus == bus &&
2159                     info->devfn == devfn)
2160                         return info;
2161
2162         return NULL;
2163 }
2164
2165 static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
2166                                                 int bus, int devfn,
2167                                                 struct device *dev,
2168                                                 struct dmar_domain *domain)
2169 {
2170         struct dmar_domain *found = NULL;
2171         struct device_domain_info *info;
2172         unsigned long flags;
2173
2174         info = alloc_devinfo_mem();
2175         if (!info)
2176                 return NULL;
2177
2178         info->bus = bus;
2179         info->devfn = devfn;
2180         info->dev = dev;
2181         info->domain = domain;
2182         info->iommu = iommu;
2183         if (!dev)
2184                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2185
2186         spin_lock_irqsave(&device_domain_lock, flags);
2187         if (dev)
2188                 found = find_domain(dev);
2189         else {
2190                 struct device_domain_info *info2;
2191                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2192                 if (info2)
2193                         found = info2->domain;
2194         }
2195         if (found) {
2196                 spin_unlock_irqrestore(&device_domain_lock, flags);
2197                 free_devinfo_mem(info);
2198                 /* Caller must free the original domain */
2199                 return found;
2200         }
2201
2202         list_add(&info->link, &domain->devices);
2203         list_add(&info->global, &device_domain_list);
2204         if (dev)
2205                 dev->archdata.iommu = info;
2206         spin_unlock_irqrestore(&device_domain_lock, flags);
2207
2208         return domain;
2209 }
2210
2211 /* domain is initialized */
2212 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2213 {
2214         struct dmar_domain *domain, *free = NULL;
2215         struct intel_iommu *iommu = NULL;
2216         struct device_domain_info *info;
2217         struct pci_dev *dev_tmp = NULL;
2218         unsigned long flags;
2219         u8 bus, devfn, bridge_bus, bridge_devfn;
2220
2221         domain = find_domain(dev);
2222         if (domain)
2223                 return domain;
2224
2225         if (dev_is_pci(dev)) {
2226                 struct pci_dev *pdev = to_pci_dev(dev);
2227                 u16 segment;
2228
2229                 segment = pci_domain_nr(pdev->bus);
2230                 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
2231                 if (dev_tmp) {
2232                         if (pci_is_pcie(dev_tmp)) {
2233                                 bridge_bus = dev_tmp->subordinate->number;
2234                                 bridge_devfn = 0;
2235                         } else {
2236                                 bridge_bus = dev_tmp->bus->number;
2237                                 bridge_devfn = dev_tmp->devfn;
2238                         }
2239                         spin_lock_irqsave(&device_domain_lock, flags);
2240                         info = dmar_search_domain_by_dev_info(segment, bridge_bus, bridge_devfn);
2241                         if (info) {
2242                                 iommu = info->iommu;
2243                                 domain = info->domain;
2244                         }
2245                         spin_unlock_irqrestore(&device_domain_lock, flags);
2246                         /* pcie-pci bridge already has a domain, use it */
2247                         if (info)
2248                                 goto found_domain;
2249                 }
2250         }
2251
2252         iommu = device_to_iommu(dev, &bus, &devfn);
2253         if (!iommu)
2254                 goto error;
2255
2256         /* Allocate and initialize new domain for the device */
2257         domain = alloc_domain(false);
2258         if (!domain)
2259                 goto error;
2260         if (iommu_attach_domain(domain, iommu)) {
2261                 free_domain_mem(domain);
2262                 domain = NULL;
2263                 goto error;
2264         }
2265         free = domain;
2266         if (domain_init(domain, gaw))
2267                 goto error;
2268
2269         /* register pcie-to-pci device */
2270         if (dev_tmp) {
2271                 domain = dmar_insert_dev_info(iommu, bridge_bus, bridge_devfn,
2272                                               NULL, domain);
2273                 if (!domain)
2274                         goto error;
2275         }
2276
2277 found_domain:
2278         domain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2279 error:
2280         if (free != domain)
2281                 domain_exit(free);
2282
2283         return domain;
2284 }
2285
2286 static int iommu_identity_mapping;
2287 #define IDENTMAP_ALL            1
2288 #define IDENTMAP_GFX            2
2289 #define IDENTMAP_AZALIA         4
2290
2291 static int iommu_domain_identity_map(struct dmar_domain *domain,
2292                                      unsigned long long start,
2293                                      unsigned long long end)
2294 {
2295         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2296         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2297
2298         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2299                           dma_to_mm_pfn(last_vpfn))) {
2300                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2301                 return -ENOMEM;
2302         }
2303
2304         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2305                  start, end, domain->id);
2306         /*
2307          * The RMRR range might overlap with the physical memory range,
2308          * so clear it first.
2309          */
2310         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2311
2312         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2313                                   last_vpfn - first_vpfn + 1,
2314                                   DMA_PTE_READ|DMA_PTE_WRITE);
2315 }
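
/*
 * Worked example of the pfn arithmetic above (illustrative values): an RMRR
 * covering 0xA0000 - 0xBFFFF gives first_vpfn = 0xA0 and last_vpfn = 0xBF,
 * so 32 pages are reserved in the domain's iova tree and then mapped 1:1
 * with DMA_PTE_READ|DMA_PTE_WRITE.
 */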
2316
2317 static int iommu_prepare_identity_map(struct device *dev,
2318                                       unsigned long long start,
2319                                       unsigned long long end)
2320 {
2321         struct dmar_domain *domain;
2322         int ret;
2323
2324         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2325         if (!domain)
2326                 return -ENOMEM;
2327
2328         /* For _hardware_ passthrough, don't bother. But for software
2329            passthrough, we do it anyway -- it may indicate a memory
2330            range which is reserved in E820, and so didn't get set
2331            up in si_domain to start with */
2332         if (domain == si_domain && hw_pass_through) {
2333                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2334                        dev_name(dev), start, end);
2335                 return 0;
2336         }
2337
2338         printk(KERN_INFO
2339                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2340                dev_name(dev), start, end);
2341
2342         if (end < start) {
2343                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2344                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2345                         dmi_get_system_info(DMI_BIOS_VENDOR),
2346                         dmi_get_system_info(DMI_BIOS_VERSION),
2347                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2348                 ret = -EIO;
2349                 goto error;
2350         }
2351
2352         if (end >> agaw_to_width(domain->agaw)) {
2353                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2354                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2355                      agaw_to_width(domain->agaw),
2356                      dmi_get_system_info(DMI_BIOS_VENDOR),
2357                      dmi_get_system_info(DMI_BIOS_VERSION),
2358                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2359                 ret = -EIO;
2360                 goto error;
2361         }
2362
2363         ret = iommu_domain_identity_map(domain, start, end);
2364         if (ret)
2365                 goto error;
2366
2367         /* context entry init */
2368         ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2369         if (ret)
2370                 goto error;
2371
2372         return 0;
2373
2374  error:
2375         domain_exit(domain);
2376         return ret;
2377 }
2378
2379 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2380                                          struct device *dev)
2381 {
2382         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2383                 return 0;
2384         return iommu_prepare_identity_map(dev, rmrr->base_address,
2385                                           rmrr->end_address);
2386 }
2387
2388 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2389 static inline void iommu_prepare_isa(void)
2390 {
2391         struct pci_dev *pdev;
2392         int ret;
2393
2394         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2395         if (!pdev)
2396                 return;
2397
2398         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2399         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2400
2401         if (ret)
2402                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2403                        "floppy might not work\n");
2404
2405 }
2406 #else
2407 static inline void iommu_prepare_isa(void)
2408 {
2409         return;
2410 }
2411 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2412
2413 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2414
2415 static int __init si_domain_init(int hw)
2416 {
2417         struct dmar_drhd_unit *drhd;
2418         struct intel_iommu *iommu;
2419         int nid, ret = 0;
2420
2421         si_domain = alloc_domain(false);
2422         if (!si_domain)
2423                 return -EFAULT;
2424
2425         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2426
2427         for_each_active_iommu(iommu, drhd) {
2428                 ret = iommu_attach_domain(si_domain, iommu);
2429                 if (ret) {
2430                         domain_exit(si_domain);
2431                         return -EFAULT;
2432                 }
2433         }
2434
2435         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2436                 domain_exit(si_domain);
2437                 return -EFAULT;
2438         }
2439
2440         pr_debug("IOMMU: identity mapping domain is domain %d\n",
2441                  si_domain->id);
2442
2443         if (hw)
2444                 return 0;
2445
2446         for_each_online_node(nid) {
2447                 unsigned long start_pfn, end_pfn;
2448                 int i;
2449
2450                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2451                         ret = iommu_domain_identity_map(si_domain,
2452                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2453                         if (ret)
2454                                 return ret;
2455                 }
2456         }
2457
2458         return 0;
2459 }
2460
2461 static int identity_mapping(struct device *dev)
2462 {
2463         struct device_domain_info *info;
2464
2465         if (likely(!iommu_identity_mapping))
2466                 return 0;
2467
2468         info = dev->archdata.iommu;
2469         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2470                 return (info->domain == si_domain);
2471
2472         return 0;
2473 }
2474
2475 static int domain_add_dev_info(struct dmar_domain *domain,
2476                                struct device *dev, int translation)
2477 {
2478         struct dmar_domain *ndomain;
2479         struct intel_iommu *iommu;
2480         u8 bus, devfn;
2481         int ret;
2482
2483         iommu = device_to_iommu(dev, &bus, &devfn);
2484         if (!iommu)
2485                 return -ENODEV;
2486
2487         ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2488         if (ndomain != domain)
2489                 return -EBUSY;
2490
2491         ret = domain_context_mapping(domain, dev, translation);
2492         if (ret) {
2493                 domain_remove_one_dev_info(domain, dev);
2494                 return ret;
2495         }
2496
2497         return 0;
2498 }
2499
2500 static bool device_has_rmrr(struct device *dev)
2501 {
2502         struct dmar_rmrr_unit *rmrr;
2503         struct device *tmp;
2504         int i;
2505
2506         rcu_read_lock();
2507         for_each_rmrr_units(rmrr) {
2508                 /*
2509                  * Return TRUE if this RMRR contains the device that
2510                  * is passed in.
2511                  */
2512                 for_each_active_dev_scope(rmrr->devices,
2513                                           rmrr->devices_cnt, i, tmp)
2514                         if (tmp == dev) {
2515                                 rcu_read_unlock();
2516                                 return true;
2517                         }
2518         }
2519         rcu_read_unlock();
2520         return false;
2521 }
2522
2523 static int iommu_should_identity_map(struct device *dev, int startup)
2524 {
2526         if (dev_is_pci(dev)) {
2527                 struct pci_dev *pdev = to_pci_dev(dev);
2528
2529                 /*
2530                  * We want to prevent any device associated with an RMRR from
2531                  * getting placed into the SI Domain. This is done because
2532                  * problems exist when devices are moved in and out of domains
2533                  * and their respective RMRR info is lost. We exempt USB devices
2534                  * from this process due to their usage of RMRRs that are known
2535                  * to not be needed after BIOS hand-off to OS.
2536                  */
2537                 if (device_has_rmrr(dev) &&
2538                     (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2539                         return 0;
2540
2541                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2542                         return 1;
2543
2544                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2545                         return 1;
2546
2547                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2548                         return 0;
2549
2550                 /*
2551                  * We want to start off with all devices in the 1:1 domain, and
2552                  * take them out later if we find they can't access all of memory.
2553                  *
2554                  * However, we can't do this for PCI devices behind bridges,
2555                  * because all PCI devices behind the same bridge will end up
2556                  * with the same source-id on their transactions.
2557                  *
2558                  * Practically speaking, we can't change things around for these
2559                  * devices at run-time, because we can't be sure there'll be no
2560                  * DMA transactions in flight for any of their siblings.
2561                  *
2562                  * So PCI devices (unless they're on the root bus) as well as
2563                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2564                  * the 1:1 domain, just in _case_ one of their siblings turns out
2565                  * not to be able to map all of memory.
2566                  */
2567                 if (!pci_is_pcie(pdev)) {
2568                         if (!pci_is_root_bus(pdev->bus))
2569                                 return 0;
2570                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2571                                 return 0;
2572                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2573                         return 0;
2574         } else {
2575                 if (device_has_rmrr(dev))
2576                         return 0;
2577         }
2578
2579         /*
2580          * At boot time, we don't yet know if devices will be 64-bit capable.
2581          * Assume that they will -- if they turn out not to be, then we can
2582          * take them out of the 1:1 domain later.
2583          */
2584         if (!startup) {
2585                 /*
2586                  * If the device's dma_mask is less than the system's memory
2587                  * size then this is not a candidate for identity mapping.
2588                  */
2589                 u64 dma_mask = *dev->dma_mask;
2590
2591                 if (dev->coherent_dma_mask &&
2592                     dev->coherent_dma_mask < dma_mask)
2593                         dma_mask = dev->coherent_dma_mask;
2594
2595                 return dma_mask >= dma_get_required_mask(dev);
2596         }
2597
2598         return 1;
2599 }
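
/*
 * Example of the run-time check above (illustrative): on a machine with more
 * than 4GiB of RAM, dma_get_required_mask() exceeds DMA_BIT_MASK(32), so a
 * device whose dma_mask is only 32 bits fails the comparison once startup is
 * over and is subsequently taken back out of the identity-mapped si_domain.
 */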
2600
2601 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2602 {
2603         int ret;
2604
2605         if (!iommu_should_identity_map(dev, 1))
2606                 return 0;
2607
2608         ret = domain_add_dev_info(si_domain, dev,
2609                                   hw ? CONTEXT_TT_PASS_THROUGH :
2610                                        CONTEXT_TT_MULTI_LEVEL);
2611         if (!ret)
2612                 pr_info("IOMMU: %s identity mapping for device %s\n",
2613                         hw ? "hardware" : "software", dev_name(dev));
2614         else if (ret == -ENODEV)
2615                 /* device not associated with an iommu */
2616                 ret = 0;
2617
2618         return ret;
2619 }
2620
2622 static int __init iommu_prepare_static_identity_mapping(int hw)
2623 {
2624         struct pci_dev *pdev = NULL;
2625         struct dmar_drhd_unit *drhd;
2626         struct intel_iommu *iommu;
2627         struct device *dev;
2628         int i;
2629         int ret = 0;
2630
2631         ret = si_domain_init(hw);
2632         if (ret)
2633                 return -EFAULT;
2634
2635         for_each_pci_dev(pdev) {
2636                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2637                 if (ret)
2638                         return ret;
2639         }
2640
2641         for_each_active_iommu(iommu, drhd)
2642                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2643                         struct acpi_device_physical_node *pn;
2644                         struct acpi_device *adev;
2645
2646                         if (dev->bus != &acpi_bus_type)
2647                                 continue;
2648
2649                         adev = to_acpi_device(dev);
2650                         mutex_lock(&adev->physical_node_lock);
2651                         list_for_each_entry(pn, &adev->physical_node_list, node) {
2652                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2653                                 if (ret)
2654                                         break;
2655                         }
2656                         mutex_unlock(&adev->physical_node_lock);
2657                         if (ret)
2658                                 return ret;
2659                 }
2660
2661         return 0;
2662 }
2663
2664 static int __init init_dmars(void)
2665 {
2666         struct dmar_drhd_unit *drhd;
2667         struct dmar_rmrr_unit *rmrr;
2668         struct device *dev;
2669         struct intel_iommu *iommu;
2670         int i, ret;
2671
2672         /*
2673          * for each drhd
2674          *    allocate root
2675          *    initialize and program root entry to not present
2676          * endfor
2677          */
2678         for_each_drhd_unit(drhd) {
2679                 /*
2680                  * lock not needed as this is only incremented in the
2681                  * single-threaded kernel __init code path; all other
2682                  * access is read-only
2683                  */
2684                 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2685                         g_num_of_iommus++;
2686                         continue;
2687                 }
2688                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2689                           IOMMU_UNITS_SUPPORTED);
2690         }
2691
2692         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2693                         GFP_KERNEL);
2694         if (!g_iommus) {
2695                 printk(KERN_ERR "Allocating global iommu array failed\n");
2696                 ret = -ENOMEM;
2697                 goto error;
2698         }
2699
2700         deferred_flush = kcalloc(g_num_of_iommus,
2701                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2702         if (!deferred_flush) {
2703                 ret = -ENOMEM;
2704                 goto free_g_iommus;
2705         }
2706
2707         for_each_active_iommu(iommu, drhd) {
2708                 g_iommus[iommu->seq_id] = iommu;
2709
2710                 ret = iommu_init_domains(iommu);
2711                 if (ret)
2712                         goto free_iommu;
2713
2714                 /*
2715                  * TBD:
2716                  * we could share the same root & context tables
2717                  * among all IOMMUs. This needs to be split later.
2718                  */
2719                 ret = iommu_alloc_root_entry(iommu);
2720                 if (ret) {
2721                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2722                         goto free_iommu;
2723                 }
2724                 if (!ecap_pass_through(iommu->ecap))
2725                         hw_pass_through = 0;
2726         }
2727
2728         /*
2729          * Start from a sane iommu hardware state.
2730          */
2731         for_each_active_iommu(iommu, drhd) {
2732                 /*
2733                  * If queued invalidation was already initialized by us
2734                  * (for example, while enabling interrupt remapping), then
2735                  * things are already rolling from a sane state.
2736                  */
2737                 if (iommu->qi)
2738                         continue;
2739
2740                 /*
2741                  * Clear any previous faults.
2742                  */
2743                 dmar_fault(-1, iommu);
2744                 /*
2745                  * Disable queued invalidation if supported and already enabled
2746                  * before OS handover.
2747                  */
2748                 dmar_disable_qi(iommu);
2749         }
2750
2751         for_each_active_iommu(iommu, drhd) {
2752                 if (dmar_enable_qi(iommu)) {
2753                         /*
2754                          * Queued Invalidate not enabled, use Register Based
2755                          * Invalidate
2756                          */
2757                         iommu->flush.flush_context = __iommu_flush_context;
2758                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2759                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2760                                "invalidation\n",
2761                                 iommu->seq_id,
2762                                (unsigned long long)drhd->reg_base_addr);
2763                 } else {
2764                         iommu->flush.flush_context = qi_flush_context;
2765                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2766                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2767                                "invalidation\n",
2768                                 iommu->seq_id,
2769                                (unsigned long long)drhd->reg_base_addr);
2770                 }
2771         }
2772
2773         if (iommu_pass_through)
2774                 iommu_identity_mapping |= IDENTMAP_ALL;
2775
2776 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2777         iommu_identity_mapping |= IDENTMAP_GFX;
2778 #endif
2779
2780         check_tylersburg_isoch();
2781
2782         /*
2783          * If pass-through is not set or not enabled, set up context entries
2784          * for identity mappings for RMRR, GFX and ISA, and possibly fall back
2785          * to static identity mapping if iommu_identity_mapping is set.
2786          */
2787         if (iommu_identity_mapping) {
2788                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2789                 if (ret) {
2790                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2791                         goto free_iommu;
2792                 }
2793         }
2794         /*
2795          * For each rmrr
2796          *   for each dev attached to rmrr
2797          *   do
2798          *     locate drhd for dev, alloc domain for dev
2799          *     allocate free domain
2800          *     allocate page table entries for rmrr
2801          *     if context not allocated for bus
2802          *           allocate and init context
2803          *           set present in root table for this bus
2804          *     init context with domain, translation etc
2805          *    endfor
2806          * endfor
2807          */
2808         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2809         for_each_rmrr_units(rmrr) {
2810                 /* some BIOSes list non-existent devices in the DMAR table. */
2811                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2812                                           i, dev) {
2813                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
2814                         if (ret)
2815                                 printk(KERN_ERR
2816                                        "IOMMU: mapping reserved region failed\n");
2817                 }
2818         }
2819
2820         iommu_prepare_isa();
2821
2822         /*
2823          * for each drhd
2824          *   enable fault log
2825          *   global invalidate context cache
2826          *   global invalidate iotlb
2827          *   enable translation
2828          */
2829         for_each_iommu(iommu, drhd) {
2830                 if (drhd->ignored) {
2831                         /*
2832                          * we always have to disable PMRs or DMA may fail on
2833                          * this device
2834                          */
2835                         if (force_on)
2836                                 iommu_disable_protect_mem_regions(iommu);
2837                         continue;
2838                 }
2839
2840                 iommu_flush_write_buffer(iommu);
2841
2842                 ret = dmar_set_interrupt(iommu);
2843                 if (ret)
2844                         goto free_iommu;
2845
2846                 iommu_set_root_entry(iommu);
2847
2848                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2849                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2850
2851                 ret = iommu_enable_translation(iommu);
2852                 if (ret)
2853                         goto free_iommu;
2854
2855                 iommu_disable_protect_mem_regions(iommu);
2856         }
2857
2858         return 0;
2859
2860 free_iommu:
2861         for_each_active_iommu(iommu, drhd)
2862                 free_dmar_iommu(iommu);
2863         kfree(deferred_flush);
2864 free_g_iommus:
2865         kfree(g_iommus);
2866 error:
2867         return ret;
2868 }
2869
2870 /* This takes a number of _MM_ pages, not VTD pages */
2871 static struct iova *intel_alloc_iova(struct device *dev,
2872                                      struct dmar_domain *domain,
2873                                      unsigned long nrpages, uint64_t dma_mask)
2874 {
2875         struct iova *iova = NULL;
2876
2877         /* Restrict dma_mask to the width that the iommu can handle */
2878         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2879
2880         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2881                 /*
2882                  * First try to allocate an I/O virtual address in
2883                  * DMA_BIT_MASK(32); if that fails, then try allocating
2884                  * from the higher range
2885                  */
2886                 iova = alloc_iova(&domain->iovad, nrpages,
2887                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2888                 if (iova)
2889                         return iova;
2890         }
2891         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2892         if (unlikely(!iova)) {
2893                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2894                        nrpages, dev_name(dev));
2895                 return NULL;
2896         }
2897
2898         return iova;
2899 }
2900
2901 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
2902 {
2903         struct dmar_domain *domain;
2904         int ret;
2905
2906         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2907         if (!domain) {
2908                 printk(KERN_ERR "Allocating domain for %s failed\n",
2909                        dev_name(dev));
2910                 return NULL;
2911         }
2912
2913         /* make sure context mapping is ok */
2914         if (unlikely(!domain_context_mapped(dev))) {
2915                 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2916                 if (ret) {
2917                         printk(KERN_ERR "Domain context map for %s failed\n",
2918                                dev_name(dev));
2919                         return NULL;
2920                 }
2921         }
2922
2923         return domain;
2924 }
2925
2926 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
2927 {
2928         struct device_domain_info *info;
2929
2930         /* No lock here, assumes no domain exit in normal case */
2931         info = dev->archdata.iommu;
2932         if (likely(info))
2933                 return info->domain;
2934
2935         return __get_valid_domain_for_dev(dev);
2936 }
2937
2938 static int iommu_dummy(struct device *dev)
2939 {
2940         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2941 }
2942
2943 /* Check if the dev needs to go through the non-identity map/unmap process. */
2944 static int iommu_no_mapping(struct device *dev)
2945 {
2946         int found;
2947
2948         if (iommu_dummy(dev))
2949                 return 1;
2950
2951         if (!iommu_identity_mapping)
2952                 return 0;
2953
2954         found = identity_mapping(dev);
2955         if (found) {
2956                 if (iommu_should_identity_map(dev, 0))
2957                         return 1;
2958                 else {
2959                         /*
2960                          * The 32 bit DMA device is removed from si_domain and falls
2961                          * back to non-identity mapping.
2962                          */
2963                         domain_remove_one_dev_info(si_domain, dev);
2964                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2965                                dev_name(dev));
2966                         return 0;
2967                 }
2968         } else {
2969                 /*
2970                  * A 64 bit DMA device that was detached from a VM is put back
2971                  * into si_domain for identity mapping.
2972                  */
2973                 if (iommu_should_identity_map(dev, 0)) {
2974                         int ret;
2975                         ret = domain_add_dev_info(si_domain, dev,
2976                                                   hw_pass_through ?
2977                                                   CONTEXT_TT_PASS_THROUGH :
2978                                                   CONTEXT_TT_MULTI_LEVEL);
2979                         if (!ret) {
2980                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2981                                        dev_name(dev));
2982                                 return 1;
2983                         }
2984                 }
2985         }
2986
2987         return 0;
2988 }
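/*
 * Decision summary for iommu_no_mapping() above (informational sketch; the
 * exact eligibility test lives in iommu_should_identity_map()):
 *
 *	dummy device (quirked/ignored)                     -> 1, no translation
 *	identity mapping not built at all                  -> 0, DMA-API path
 *	in si_domain and still eligible for identity map   -> 1, keep it there
 *	in si_domain but no longer eligible (e.g. 32 bit)  -> 0, drop from si_domain
 *	not in si_domain but eligible                      -> 1, re-add to si_domain
 *	                                                      (if that succeeds)
 *	otherwise                                          -> 0
 */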
2989
2990 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
2991                                      size_t size, int dir, u64 dma_mask)
2992 {
2993         struct dmar_domain *domain;
2994         phys_addr_t start_paddr;
2995         struct iova *iova;
2996         int prot = 0;
2997         int ret;
2998         struct intel_iommu *iommu;
2999         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3000
3001         BUG_ON(dir == DMA_NONE);
3002
3003         if (iommu_no_mapping(dev))
3004                 return paddr;
3005
3006         domain = get_valid_domain_for_dev(dev);
3007         if (!domain)
3008                 return 0;
3009
3010         iommu = domain_get_iommu(domain);
3011         size = aligned_nrpages(paddr, size);
3012
3013         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3014         if (!iova)
3015                 goto error;
3016
3017         /*
3018          * Check if DMAR supports zero-length reads on write-only
3019          * mappings.
3020          */
3021         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3022                         !cap_zlr(iommu->cap))
3023                 prot |= DMA_PTE_READ;
3024         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3025                 prot |= DMA_PTE_WRITE;
3026         /*
3027          * The range paddr .. paddr + size might cover only part of a page, but
3028          * we should map the whole page.  Note: if two parts of one page are
3029          * mapped separately, we might end up with two guest addresses mapping
3030          * to the same host paddr, but this is not a big problem.
3031          */
3032         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3033                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3034         if (ret)
3035                 goto error;
3036
3037         /* it's a non-present to present mapping. Only flush if caching mode */
3038         if (cap_caching_mode(iommu->cap))
3039                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
3040         else
3041                 iommu_flush_write_buffer(iommu);
3042
3043         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3044         start_paddr += paddr & ~PAGE_MASK;
3045         return start_paddr;
3046
3047 error:
3048         if (iova)
3049                 __free_iova(&domain->iovad, iova);
3050         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
3051                 dev_name(dev), size, (unsigned long long)paddr, dir);
3052         return 0;
3053 }
3054
3055 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3056                                  unsigned long offset, size_t size,
3057                                  enum dma_data_direction dir,
3058                                  struct dma_attrs *attrs)
3059 {
3060         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3061                                   dir, *dev->dma_mask);
3062 }
3063
3064 static void flush_unmaps(void)
3065 {
3066         int i, j;
3067
3068         timer_on = 0;
3069
3070         /* just flush them all */
3071         for (i = 0; i < g_num_of_iommus; i++) {
3072                 struct intel_iommu *iommu = g_iommus[i];
3073                 if (!iommu)
3074                         continue;
3075
3076                 if (!deferred_flush[i].next)
3077                         continue;
3078
3079                 /* In caching mode, global flushes make emulation expensive */
3080                 if (!cap_caching_mode(iommu->cap))
3081                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3082                                          DMA_TLB_GLOBAL_FLUSH);
3083                 for (j = 0; j < deferred_flush[i].next; j++) {
3084                         unsigned long mask;
3085                         struct iova *iova = deferred_flush[i].iova[j];
3086                         struct dmar_domain *domain = deferred_flush[i].domain[j];
3087
3088                         /* On real hardware multiple invalidations are expensive */
3089                         if (cap_caching_mode(iommu->cap))
3090                                 iommu_flush_iotlb_psi(iommu, domain->id,
3091                                         iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1,
3092                                         !deferred_flush[i].freelist[j], 0);
3093                         else {
3094                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
3095                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3096                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3097                         }
3098                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3099                         if (deferred_flush[i].freelist[j])
3100                                 dma_free_pagelist(deferred_flush[i].freelist[j]);
3101                 }
3102                 deferred_flush[i].next = 0;
3103         }
3104
3105         list_size = 0;
3106 }
3107
3108 static void flush_unmaps_timeout(unsigned long data)
3109 {
3110         unsigned long flags;
3111
3112         spin_lock_irqsave(&async_umap_flush_lock, flags);
3113         flush_unmaps();
3114         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3115 }
3116
3117 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3118 {
3119         unsigned long flags;
3120         int next, iommu_id;
3121         struct intel_iommu *iommu;
3122
3123         spin_lock_irqsave(&async_umap_flush_lock, flags);
3124         if (list_size == HIGH_WATER_MARK)
3125                 flush_unmaps();
3126
3127         iommu = domain_get_iommu(dom);
3128         iommu_id = iommu->seq_id;
3129
3130         next = deferred_flush[iommu_id].next;
3131         deferred_flush[iommu_id].domain[next] = dom;
3132         deferred_flush[iommu_id].iova[next] = iova;
3133         deferred_flush[iommu_id].freelist[next] = freelist;
3134         deferred_flush[iommu_id].next++;
3135
3136         if (!timer_on) {
3137                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3138                 timer_on = 1;
3139         }
3140         list_size++;
3141         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3142 }
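/*
 * Rough timeline of the lazy-unmap machinery above (a sketch, not code that
 * is compiled anywhere):
 *
 *	intel_unmap_page()/intel_unmap_sg()
 *	    add_unmap(domain, iova, freelist)    queue the entry, arm 10ms timer
 *	    ...
 *	flush_unmaps_timeout()                   timer fires, or add_unmap() hits
 *	    flush_unmaps()                       HIGH_WATER_MARK and flushes inline
 *	        per-iommu IOTLB invalidation
 *	        __free_iova() + dma_free_pagelist() for each queued entry
 *
 * Batching the invalidations trades a short window in which stale
 * translations remain usable for a large reduction in flush overhead;
 * booting with intel_iommu=strict bypasses the queue entirely.
 */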
3143
3144 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3145                              size_t size, enum dma_data_direction dir,
3146                              struct dma_attrs *attrs)
3147 {
3148         struct dmar_domain *domain;
3149         unsigned long start_pfn, last_pfn;
3150         struct iova *iova;
3151         struct intel_iommu *iommu;
3152         struct page *freelist;
3153
3154         if (iommu_no_mapping(dev))
3155                 return;
3156
3157         domain = find_domain(dev);
3158         BUG_ON(!domain);
3159
3160         iommu = domain_get_iommu(domain);
3161
3162         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3163         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3164                       (unsigned long long)dev_addr))
3165                 return;
3166
3167         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3168         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3169
3170         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3171                  dev_name(dev), start_pfn, last_pfn);
3172
3173         freelist = domain_unmap(domain, start_pfn, last_pfn);
3174
3175         if (intel_iommu_strict) {
3176                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3177                                       last_pfn - start_pfn + 1, !freelist, 0);
3178                 /* free iova */
3179                 __free_iova(&domain->iovad, iova);
3180                 dma_free_pagelist(freelist);
3181         } else {
3182                 add_unmap(domain, iova, freelist);
3183                 /*
3184                  * queue up the release of the unmap to save the roughly 1/6th of
3185                  * the CPU time otherwise spent on the iotlb flush operation...
3186                  */
3187         }
3188 }
3189
3190 static void *intel_alloc_coherent(struct device *dev, size_t size,
3191                                   dma_addr_t *dma_handle, gfp_t flags,
3192                                   struct dma_attrs *attrs)
3193 {
3194         void *vaddr;
3195         int order;
3196
3197         size = PAGE_ALIGN(size);
3198         order = get_order(size);
3199
3200         if (!iommu_no_mapping(dev))
3201                 flags &= ~(GFP_DMA | GFP_DMA32);
3202         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3203                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3204                         flags |= GFP_DMA;
3205                 else
3206                         flags |= GFP_DMA32;
3207         }
3208
3209         vaddr = (void *)__get_free_pages(flags, order);
3210         if (!vaddr)
3211                 return NULL;
3212         memset(vaddr, 0, size);
3213
3214         *dma_handle = __intel_map_single(dev, virt_to_bus(vaddr), size,
3215                                          DMA_BIDIRECTIONAL,
3216                                          dev->coherent_dma_mask);
3217         if (*dma_handle)
3218                 return vaddr;
3219         free_pages((unsigned long)vaddr, order);
3220         return NULL;
3221 }
3222
3223 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3224                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3225 {
3226         int order;
3227
3228         size = PAGE_ALIGN(size);
3229         order = get_order(size);
3230
3231         intel_unmap_page(dev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3232         free_pages((unsigned long)vaddr, order);
3233 }
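/*
 * The pair above backs dma_alloc_coherent()/dma_free_coherent().  A
 * hypothetical driver-side use, shown only to illustrate the flow ("pdev"
 * and the ring buffer are made up):
 *
 *	dma_addr_t dma;
 *	void *ring = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, &dma, GFP_KERNEL);
 *	if (!ring)
 *		return -ENOMEM;
 *	...
 *	dma_free_coherent(&pdev->dev, PAGE_SIZE, ring, dma);
 *
 * Unless the device is in identity/pass-through mode, the handle returned
 * in "dma" is an IOVA produced by __intel_map_single(), not the CPU
 * physical address of the pages.
 */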
3234
3235 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3236                            int nelems, enum dma_data_direction dir,
3237                            struct dma_attrs *attrs)
3238 {
3239         struct dmar_domain *domain;
3240         unsigned long start_pfn, last_pfn;
3241         struct iova *iova;
3242         struct intel_iommu *iommu;
3243         struct page *freelist;
3244
3245         if (iommu_no_mapping(dev))
3246                 return;
3247
3248         domain = find_domain(dev);
3249         BUG_ON(!domain);
3250
3251         iommu = domain_get_iommu(domain);
3252
3253         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3254         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3255                       (unsigned long long)sglist[0].dma_address))
3256                 return;
3257
3258         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3259         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3260
3261         freelist = domain_unmap(domain, start_pfn, last_pfn);
3262
3263         if (intel_iommu_strict) {
3264                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3265                                       last_pfn - start_pfn + 1, !freelist, 0);
3266                 /* free iova */
3267                 __free_iova(&domain->iovad, iova);
3268                 dma_free_pagelist(freelist);
3269         } else {
3270                 add_unmap(domain, iova, freelist);
3271                 /*
3272                  * queue up the release of the unmap to save the roughly 1/6th of
3273                  * the CPU time otherwise spent on the iotlb flush operation...
3274                  */
3275         }
3276 }
3277
3278 static int intel_nontranslate_map_sg(struct device *hddev,
3279         struct scatterlist *sglist, int nelems, int dir)
3280 {
3281         int i;
3282         struct scatterlist *sg;
3283
3284         for_each_sg(sglist, sg, nelems, i) {
3285                 BUG_ON(!sg_page(sg));
3286                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3287                 sg->dma_length = sg->length;
3288         }
3289         return nelems;
3290 }
3291
3292 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3293                         enum dma_data_direction dir, struct dma_attrs *attrs)
3294 {
3295         int i;
3296         struct dmar_domain *domain;
3297         size_t size = 0;
3298         int prot = 0;
3299         struct iova *iova = NULL;
3300         int ret;
3301         struct scatterlist *sg;
3302         unsigned long start_vpfn;
3303         struct intel_iommu *iommu;
3304
3305         BUG_ON(dir == DMA_NONE);
3306         if (iommu_no_mapping(dev))
3307                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3308
3309         domain = get_valid_domain_for_dev(dev);
3310         if (!domain)
3311                 return 0;
3312
3313         iommu = domain_get_iommu(domain);
3314
3315         for_each_sg(sglist, sg, nelems, i)
3316                 size += aligned_nrpages(sg->offset, sg->length);
3317
3318         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3319                                 *dev->dma_mask);
3320         if (!iova) {
3321                 sglist->dma_length = 0;
3322                 return 0;
3323         }
3324
3325         /*
3326          * Check if DMAR supports zero-length reads on write-only
3327          * mappings.
3328          */
3329         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3330                         !cap_zlr(iommu->cap))
3331                 prot |= DMA_PTE_READ;
3332         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3333                 prot |= DMA_PTE_WRITE;
3334
3335         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3336
3337         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3338         if (unlikely(ret)) {
3339                 /*  clear the page */
3340                 dma_pte_clear_range(domain, start_vpfn,
3341                                     start_vpfn + size - 1);
3342                 /* free page tables */
3343                 dma_pte_free_pagetable(domain, start_vpfn,
3344                                        start_vpfn + size - 1);
3345                 /* free iova */
3346                 __free_iova(&domain->iovad, iova);
3347                 return 0;
3348         }
3349
3350         /* it's a non-present to present mapping. Only flush if caching mode */
3351         if (cap_caching_mode(iommu->cap))
3352                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3353         else
3354                 iommu_flush_write_buffer(iommu);
3355
3356         return nelems;
3357 }
3358
3359 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3360 {
3361         return !dma_addr;
3362 }
3363
3364 struct dma_map_ops intel_dma_ops = {
3365         .alloc = intel_alloc_coherent,
3366         .free = intel_free_coherent,
3367         .map_sg = intel_map_sg,
3368         .unmap_sg = intel_unmap_sg,
3369         .map_page = intel_map_page,
3370         .unmap_page = intel_unmap_page,
3371         .mapping_error = intel_mapping_error,
3372 };
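/*
 * Sketch of how a streaming DMA mapping reaches the ops above once
 * intel_iommu_init() has pointed dma_ops at intel_dma_ops (illustrative
 * only; "dev", "buf" and "len" are hypothetical):
 *
 *	dma_addr_t handle;
 *
 *	handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
 *	if (dma_mapping_error(dev, handle))        ends up in intel_mapping_error()
 *		return -ENOMEM;
 *	...
 *	dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
 *
 * dma_map_single() is implemented on top of ->map_page, so it lands in
 * intel_map_page(), which allocates an IOVA, installs the page table
 * entries and returns a bus address; dma_unmap_single() correspondingly
 * reaches intel_unmap_page().
 */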
3373
3374 static inline int iommu_domain_cache_init(void)
3375 {
3376         int ret = 0;
3377
3378         iommu_domain_cache = kmem_cache_create("iommu_domain",
3379                                          sizeof(struct dmar_domain),
3380                                          0,
3381                                          SLAB_HWCACHE_ALIGN,
3383                                          NULL);
3384         if (!iommu_domain_cache) {
3385                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3386                 ret = -ENOMEM;
3387         }
3388
3389         return ret;
3390 }
3391
3392 static inline int iommu_devinfo_cache_init(void)
3393 {
3394         int ret = 0;
3395
3396         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3397                                          sizeof(struct device_domain_info),
3398                                          0,
3399                                          SLAB_HWCACHE_ALIGN,
3400                                          NULL);
3401         if (!iommu_devinfo_cache) {
3402                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3403                 ret = -ENOMEM;
3404         }
3405
3406         return ret;
3407 }
3408
3409 static inline int iommu_iova_cache_init(void)
3410 {
3411         int ret = 0;
3412
3413         iommu_iova_cache = kmem_cache_create("iommu_iova",
3414                                          sizeof(struct iova),
3415                                          0,
3416                                          SLAB_HWCACHE_ALIGN,
3417                                          NULL);
3418         if (!iommu_iova_cache) {
3419                 printk(KERN_ERR "Couldn't create iova cache\n");
3420                 ret = -ENOMEM;
3421         }
3422
3423         return ret;
3424 }
3425
3426 static int __init iommu_init_mempool(void)
3427 {
3428         int ret;
3429         ret = iommu_iova_cache_init();
3430         if (ret)
3431                 return ret;
3432
3433         ret = iommu_domain_cache_init();
3434         if (ret)
3435                 goto domain_error;
3436
3437         ret = iommu_devinfo_cache_init();
3438         if (!ret)
3439                 return ret;
3440
3441         kmem_cache_destroy(iommu_domain_cache);
3442 domain_error:
3443         kmem_cache_destroy(iommu_iova_cache);
3444
3445         return -ENOMEM;
3446 }
3447
3448 static void __init iommu_exit_mempool(void)
3449 {
3450         kmem_cache_destroy(iommu_devinfo_cache);
3451         kmem_cache_destroy(iommu_domain_cache);
3452         kmem_cache_destroy(iommu_iova_cache);
3453
3454 }
3455
3456 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3457 {
3458         struct dmar_drhd_unit *drhd;
3459         u32 vtbar;
3460         int rc;
3461
3462         /* We know that this device on this chipset has its own IOMMU.
3463          * If we find it under a different IOMMU, then the BIOS is lying
3464          * to us. Hope that the IOMMU for this device is actually
3465          * disabled, and it needs no translation...
3466          */
3467         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3468         if (rc) {
3469                 /* "can't" happen */
3470                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3471                 return;
3472         }
3473         vtbar &= 0xffff0000;
3474
3475         /* we know that this iommu should be at offset 0xa000 from vtbar */
3476         drhd = dmar_find_matched_drhd_unit(pdev);
3477         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3478                             TAINT_FIRMWARE_WORKAROUND,
3479                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3480                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3481 }
3482 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3483
3484 static void __init init_no_remapping_devices(void)
3485 {
3486         struct dmar_drhd_unit *drhd;
3487         struct device *dev;
3488         int i;
3489
3490         for_each_drhd_unit(drhd) {
3491                 if (!drhd->include_all) {
3492                         for_each_active_dev_scope(drhd->devices,
3493                                                   drhd->devices_cnt, i, dev)
3494                                 break;
3495                         /* ignore DMAR unit if no devices exist */
3496                         if (i == drhd->devices_cnt)
3497                                 drhd->ignored = 1;
3498                 }
3499         }
3500
3501         for_each_active_drhd_unit(drhd) {
3502                 if (drhd->include_all)
3503                         continue;
3504
3505                 for_each_active_dev_scope(drhd->devices,
3506                                           drhd->devices_cnt, i, dev)
3507                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3508                                 break;
3509                 if (i < drhd->devices_cnt)
3510                         continue;
3511
3512                 /* This IOMMU has *only* gfx devices. Either bypass it or
3513                    set the gfx_mapped flag, as appropriate */
3514                 if (dmar_map_gfx) {
3515                         intel_iommu_gfx_mapped = 1;
3516                 } else {
3517                         drhd->ignored = 1;
3518                         for_each_active_dev_scope(drhd->devices,
3519                                                   drhd->devices_cnt, i, dev)
3520                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3521                 }
3522         }
3523 }
3524
3525 #ifdef CONFIG_SUSPEND
3526 static int init_iommu_hw(void)
3527 {
3528         struct dmar_drhd_unit *drhd;
3529         struct intel_iommu *iommu = NULL;
3530
3531         for_each_active_iommu(iommu, drhd)
3532                 if (iommu->qi)
3533                         dmar_reenable_qi(iommu);
3534
3535         for_each_iommu(iommu, drhd) {
3536                 if (drhd->ignored) {
3537                         /*
3538                          * we always have to disable PMRs or DMA may fail on
3539                          * this device
3540                          */
3541                         if (force_on)
3542                                 iommu_disable_protect_mem_regions(iommu);
3543                         continue;
3544                 }
3545
3546                 iommu_flush_write_buffer(iommu);
3547
3548                 iommu_set_root_entry(iommu);
3549
3550                 iommu->flush.flush_context(iommu, 0, 0, 0,
3551                                            DMA_CCMD_GLOBAL_INVL);
3552                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3553                                          DMA_TLB_GLOBAL_FLUSH);
3554                 if (iommu_enable_translation(iommu))
3555                         return 1;
3556                 iommu_disable_protect_mem_regions(iommu);
3557         }
3558
3559         return 0;
3560 }
3561
3562 static void iommu_flush_all(void)
3563 {
3564         struct dmar_drhd_unit *drhd;
3565         struct intel_iommu *iommu;
3566
3567         for_each_active_iommu(iommu, drhd) {
3568                 iommu->flush.flush_context(iommu, 0, 0, 0,
3569                                            DMA_CCMD_GLOBAL_INVL);
3570                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3571                                          DMA_TLB_GLOBAL_FLUSH);
3572         }
3573 }
3574
3575 static int iommu_suspend(void)
3576 {
3577         struct dmar_drhd_unit *drhd;
3578         struct intel_iommu *iommu = NULL;
3579         unsigned long flag;
3580
3581         for_each_active_iommu(iommu, drhd) {
3582                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3583                                                  GFP_ATOMIC);
3584                 if (!iommu->iommu_state)
3585                         goto nomem;
3586         }
3587
3588         iommu_flush_all();
3589
3590         for_each_active_iommu(iommu, drhd) {
3591                 iommu_disable_translation(iommu);
3592
3593                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3594
3595                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3596                         readl(iommu->reg + DMAR_FECTL_REG);
3597                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3598                         readl(iommu->reg + DMAR_FEDATA_REG);
3599                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3600                         readl(iommu->reg + DMAR_FEADDR_REG);
3601                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3602                         readl(iommu->reg + DMAR_FEUADDR_REG);
3603
3604                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3605         }
3606         return 0;
3607
3608 nomem:
3609         for_each_active_iommu(iommu, drhd)
3610                 kfree(iommu->iommu_state);
3611
3612         return -ENOMEM;
3613 }
3614
3615 static void iommu_resume(void)
3616 {
3617         struct dmar_drhd_unit *drhd;
3618         struct intel_iommu *iommu = NULL;
3619         unsigned long flag;
3620
3621         if (init_iommu_hw()) {
3622                 if (force_on)
3623                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3624                 else
3625                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3626                 return;
3627         }
3628
3629         for_each_active_iommu(iommu, drhd) {
3630
3631                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3632
3633                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3634                         iommu->reg + DMAR_FECTL_REG);
3635                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3636                         iommu->reg + DMAR_FEDATA_REG);
3637                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3638                         iommu->reg + DMAR_FEADDR_REG);
3639                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3640                         iommu->reg + DMAR_FEUADDR_REG);
3641
3642                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3643         }
3644
3645         for_each_active_iommu(iommu, drhd)
3646                 kfree(iommu->iommu_state);
3647 }
3648
3649 static struct syscore_ops iommu_syscore_ops = {
3650         .resume         = iommu_resume,
3651         .suspend        = iommu_suspend,
3652 };
3653
3654 static void __init init_iommu_pm_ops(void)
3655 {
3656         register_syscore_ops(&iommu_syscore_ops);
3657 }
3658
3659 #else
3660 static inline void init_iommu_pm_ops(void) {}
3661 #endif  /* CONFIG_SUSPEND */
3662
3663
3664 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3665 {
3666         struct acpi_dmar_reserved_memory *rmrr;
3667         struct dmar_rmrr_unit *rmrru;
3668
3669         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3670         if (!rmrru)
3671                 return -ENOMEM;
3672
3673         rmrru->hdr = header;
3674         rmrr = (struct acpi_dmar_reserved_memory *)header;
3675         rmrru->base_address = rmrr->base_address;
3676         rmrru->end_address = rmrr->end_address;
3677         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3678                                 ((void *)rmrr) + rmrr->header.length,
3679                                 &rmrru->devices_cnt);
3680         if (rmrru->devices_cnt && rmrru->devices == NULL) {
3681                 kfree(rmrru);
3682                 return -ENOMEM;
3683         }
3684
3685         list_add(&rmrru->list, &dmar_rmrr_units);
3686
3687         return 0;
3688 }
3689
3690 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3691 {
3692         struct acpi_dmar_atsr *atsr;
3693         struct dmar_atsr_unit *atsru;
3694
3695         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3696         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3697         if (!atsru)
3698                 return -ENOMEM;
3699
3700         atsru->hdr = hdr;
3701         atsru->include_all = atsr->flags & 0x1;
3702         if (!atsru->include_all) {
3703                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3704                                 (void *)atsr + atsr->header.length,
3705                                 &atsru->devices_cnt);
3706                 if (atsru->devices_cnt && atsru->devices == NULL) {
3707                         kfree(atsru);
3708                         return -ENOMEM;
3709                 }
3710         }
3711
3712         list_add_rcu(&atsru->list, &dmar_atsr_units);
3713
3714         return 0;
3715 }
3716
3717 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3718 {
3719         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3720         kfree(atsru);
3721 }
3722
3723 static void intel_iommu_free_dmars(void)
3724 {
3725         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3726         struct dmar_atsr_unit *atsru, *atsr_n;
3727
3728         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3729                 list_del(&rmrru->list);
3730                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3731                 kfree(rmrru);
3732         }
3733
3734         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3735                 list_del(&atsru->list);
3736                 intel_iommu_free_atsr(atsru);
3737         }
3738 }
3739
3740 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3741 {
3742         int i, ret = 1;
3743         struct pci_bus *bus;
3744         struct pci_dev *bridge = NULL;
3745         struct device *tmp;
3746         struct acpi_dmar_atsr *atsr;
3747         struct dmar_atsr_unit *atsru;
3748
3749         dev = pci_physfn(dev);
3750         for (bus = dev->bus; bus; bus = bus->parent) {
3751                 bridge = bus->self;
3752                 if (!bridge || !pci_is_pcie(bridge) ||
3753                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3754                         return 0;
3755                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3756                         break;
3757         }
3758         if (!bridge)
3759                 return 0;
3760
3761         rcu_read_lock();
3762         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3763                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3764                 if (atsr->segment != pci_domain_nr(dev->bus))
3765                         continue;
3766
3767                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3768                         if (tmp == &bridge->dev)
3769                                 goto out;
3770
3771                 if (atsru->include_all)
3772                         goto out;
3773         }
3774         ret = 0;
3775 out:
3776         rcu_read_unlock();
3777
3778         return ret;
3779 }
3780
3781 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3782 {
3783         int ret = 0;
3784         struct dmar_rmrr_unit *rmrru;
3785         struct dmar_atsr_unit *atsru;
3786         struct acpi_dmar_atsr *atsr;
3787         struct acpi_dmar_reserved_memory *rmrr;
3788
3789         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3790                 return 0;
3791
3792         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3793                 rmrr = container_of(rmrru->hdr,
3794                                     struct acpi_dmar_reserved_memory, header);
3795                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3796                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3797                                 ((void *)rmrr) + rmrr->header.length,
3798                                 rmrr->segment, rmrru->devices,
3799                                 rmrru->devices_cnt);
3800                         if (ret > 0)
3801                                 break;
3802                         else if (ret < 0)
3803                                 return ret;
3804                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3805                         if (dmar_remove_dev_scope(info, rmrr->segment,
3806                                 rmrru->devices, rmrru->devices_cnt))
3807                                 break;
3808                 }
3809         }
3810
3811         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3812                 if (atsru->include_all)
3813                         continue;
3814
3815                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3816                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3817                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3818                                         (void *)atsr + atsr->header.length,
3819                                         atsr->segment, atsru->devices,
3820                                         atsru->devices_cnt);
3821                         if (ret > 0)
3822                                 break;
3823                         else if (ret < 0)
3824                                 return ret;
3825                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3826                         if (dmar_remove_dev_scope(info, atsr->segment,
3827                                         atsru->devices, atsru->devices_cnt))
3828                                 break;
3829                 }
3830         }
3831
3832         return 0;
3833 }
3834
3835 /*
3836  * Here we only respond to the driver-unbind and device-removal actions.
3837  *
3838  * A newly added device is not attached to its DMAR domain here yet; that
3839  * happens when the device is first mapped to an iova.
3840  */
3841 static int device_notifier(struct notifier_block *nb,
3842                                   unsigned long action, void *data)
3843 {
3844         struct device *dev = data;
3845         struct dmar_domain *domain;
3846
3847         if (iommu_dummy(dev))
3848                 return 0;
3849
3850         if (action != BUS_NOTIFY_UNBOUND_DRIVER &&
3851             action != BUS_NOTIFY_DEL_DEVICE)
3852                 return 0;
3853
3854         domain = find_domain(dev);
3855         if (!domain)
3856                 return 0;
3857
3858         down_read(&dmar_global_lock);
3859         domain_remove_one_dev_info(domain, dev);
3860         if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3861             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3862             list_empty(&domain->devices))
3863                 domain_exit(domain);
3864         up_read(&dmar_global_lock);
3865
3866         return 0;
3867 }
3868
3869 static struct notifier_block device_nb = {
3870         .notifier_call = device_notifier,
3871 };
3872
3873 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3874                                        unsigned long val, void *v)
3875 {
3876         struct memory_notify *mhp = v;
3877         unsigned long long start, end;
3878         unsigned long start_vpfn, last_vpfn;
3879
3880         switch (val) {
3881         case MEM_GOING_ONLINE:
3882                 start = mhp->start_pfn << PAGE_SHIFT;
3883                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
3884                 if (iommu_domain_identity_map(si_domain, start, end)) {
3885                         pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
3886                                 start, end);
3887                         return NOTIFY_BAD;
3888                 }
3889                 break;
3890
3891         case MEM_OFFLINE:
3892         case MEM_CANCEL_ONLINE:
3893                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3894                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
3895                 while (start_vpfn <= last_vpfn) {
3896                         struct iova *iova;
3897                         struct dmar_drhd_unit *drhd;
3898                         struct intel_iommu *iommu;
3899                         struct page *freelist;
3900
3901                         iova = find_iova(&si_domain->iovad, start_vpfn);
3902                         if (iova == NULL) {
3903                                 pr_debug("dmar: failed to get IOVA for PFN %lx\n",
3904                                          start_vpfn);
3905                                 break;
3906                         }
3907
3908                         iova = split_and_remove_iova(&si_domain->iovad, iova,
3909                                                      start_vpfn, last_vpfn);
3910                         if (iova == NULL) {
3911                                 pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
3912                                         start_vpfn, last_vpfn);
3913                                 return NOTIFY_BAD;
3914                         }
3915
3916                         freelist = domain_unmap(si_domain, iova->pfn_lo,
3917                                                iova->pfn_hi);
3918
3919                         rcu_read_lock();
3920                         for_each_active_iommu(iommu, drhd)
3921                                 iommu_flush_iotlb_psi(iommu, si_domain->id,
3922                                         iova->pfn_lo,
3923                                         iova->pfn_hi - iova->pfn_lo + 1,
3924                                         !freelist, 0);
3925                         rcu_read_unlock();
3926                         dma_free_pagelist(freelist);
3927
3928                         start_vpfn = iova->pfn_hi + 1;
3929                         free_iova_mem(iova);
3930                 }
3931                 break;
3932         }
3933
3934         return NOTIFY_OK;
3935 }
3936
3937 static struct notifier_block intel_iommu_memory_nb = {
3938         .notifier_call = intel_iommu_memory_notifier,
3939         .priority = 0
3940 };
3941
3942 int __init intel_iommu_init(void)
3943 {
3944         int ret = -ENODEV;
3945         struct dmar_drhd_unit *drhd;
3946         struct intel_iommu *iommu;
3947
3948         /* VT-d is required for a TXT/tboot launch, so enforce that */
3949         force_on = tboot_force_iommu();
3950
3951         if (iommu_init_mempool()) {
3952                 if (force_on)
3953                         panic("tboot: Failed to initialize iommu memory\n");
3954                 return -ENOMEM;
3955         }
3956
3957         down_write(&dmar_global_lock);
3958         if (dmar_table_init()) {
3959                 if (force_on)
3960                         panic("tboot: Failed to initialize DMAR table\n");
3961                 goto out_free_dmar;
3962         }
3963
3964         /*
3965          * Disable translation if already enabled prior to OS handover.
3966          */
3967         for_each_active_iommu(iommu, drhd)
3968                 if (iommu->gcmd & DMA_GCMD_TE)
3969                         iommu_disable_translation(iommu);
3970
3971         if (dmar_dev_scope_init() < 0) {
3972                 if (force_on)
3973                         panic("tboot: Failed to initialize DMAR device scope\n");
3974                 goto out_free_dmar;
3975         }
3976
3977         if (no_iommu || dmar_disabled)
3978                 goto out_free_dmar;
3979
3980         if (list_empty(&dmar_rmrr_units))
3981                 printk(KERN_INFO "DMAR: No RMRR found\n");
3982
3983         if (list_empty(&dmar_atsr_units))
3984                 printk(KERN_INFO "DMAR: No ATSR found\n");
3985
3986         if (dmar_init_reserved_ranges()) {
3987                 if (force_on)
3988                         panic("tboot: Failed to reserve iommu ranges\n");
3989                 goto out_free_reserved_range;
3990         }
3991
3992         init_no_remapping_devices();
3993
3994         ret = init_dmars();
3995         if (ret) {
3996                 if (force_on)
3997                         panic("tboot: Failed to initialize DMARs\n");
3998                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3999                 goto out_free_reserved_range;
4000         }
4001         up_write(&dmar_global_lock);
4002         printk(KERN_INFO
4003         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
4004
4005         init_timer(&unmap_timer);
4006 #ifdef CONFIG_SWIOTLB
4007         swiotlb = 0;
4008 #endif
4009         dma_ops = &intel_dma_ops;
4010
4011         init_iommu_pm_ops();
4012
4013         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4014         bus_register_notifier(&pci_bus_type, &device_nb);
4015         if (si_domain && !hw_pass_through)
4016                 register_memory_notifier(&intel_iommu_memory_nb);
4017
4018         intel_iommu_enabled = 1;
4019
4020         return 0;
4021
4022 out_free_reserved_range:
4023         put_iova_domain(&reserved_iova_list);
4024 out_free_dmar:
4025         intel_iommu_free_dmars();
4026         up_write(&dmar_global_lock);
4027         iommu_exit_mempool();
4028         return ret;
4029 }
4030
4031 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
4032                                            struct device *dev)
4033 {
4034         struct pci_dev *tmp, *parent, *pdev;
4035
4036         if (!iommu || !dev || !dev_is_pci(dev))
4037                 return;
4038
4039         pdev = to_pci_dev(dev);
4040
4041         /* dependent device detach */
4042         tmp = pci_find_upstream_pcie_bridge(pdev);
4043         /* Secondary interface's bus number and devfn 0 */
4044         if (tmp) {
4045                 parent = pdev->bus->self;
4046                 while (parent != tmp) {
4047                         iommu_detach_dev(iommu, parent->bus->number,
4048                                          parent->devfn);
4049                         parent = parent->bus->self;
4050                 }
4051                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
4052                         iommu_detach_dev(iommu,
4053                                 tmp->subordinate->number, 0);
4054                 else /* this is a legacy PCI bridge */
4055                         iommu_detach_dev(iommu, tmp->bus->number,
4056                                          tmp->devfn);
4057         }
4058 }
4059
4060 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4061                                        struct device *dev)
4062 {
4063         struct device_domain_info *info, *tmp;
4064         struct intel_iommu *iommu;
4065         unsigned long flags;
4066         int found = 0;
4067         u8 bus, devfn;
4068
4069         iommu = device_to_iommu(dev, &bus, &devfn);
4070         if (!iommu)
4071                 return;
4072
4073         spin_lock_irqsave(&device_domain_lock, flags);
4074         list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4075                 if (info->iommu == iommu && info->bus == bus &&
4076                     info->devfn == devfn) {
4077                         unlink_domain_info(info);
4078                         spin_unlock_irqrestore(&device_domain_lock, flags);
4079
4080                         iommu_disable_dev_iotlb(info);
4081                         iommu_detach_dev(iommu, info->bus, info->devfn);
4082                         iommu_detach_dependent_devices(iommu, dev);
4083                         free_devinfo_mem(info);
4084
4085                         spin_lock_irqsave(&device_domain_lock, flags);
4086
4087                         if (found)
4088                                 break;
4089                         else
4090                                 continue;
4091                 }
4092
4093                 /* if there are no other devices under the same iommu
4094                  * owned by this domain, clear this iommu in iommu_bmp,
4095                  * and update the iommu count and coherency
4096                  */
4097                 if (info->iommu == iommu)
4098                         found = 1;
4099         }
4100
4101         spin_unlock_irqrestore(&device_domain_lock, flags);
4102
4103         if (found == 0) {
4104                 unsigned long tmp_flags;
4105                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
4106                 clear_bit(iommu->seq_id, domain->iommu_bmp);
4107                 domain->iommu_count--;
4108                 domain_update_iommu_cap(domain);
4109                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
4110
4111                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
4112                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
4113                         spin_lock_irqsave(&iommu->lock, tmp_flags);
4114                         clear_bit(domain->id, iommu->domain_ids);
4115                         iommu->domains[domain->id] = NULL;
4116                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
4117                 }
4118         }
4119 }
4120
4121 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4122 {
4123         int adjust_width;
4124
4125         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
4126         domain_reserve_special_ranges(domain);
4127
4128         /* calculate AGAW */
4129         domain->gaw = guest_width;
4130         adjust_width = guestwidth_to_adjustwidth(guest_width);
4131         domain->agaw = width_to_agaw(adjust_width);
4132
4133         domain->iommu_coherency = 0;
4134         domain->iommu_snooping = 0;
4135         domain->iommu_superpage = 0;
4136         domain->max_addr = 0;
4137         domain->nid = -1;
4138
4139         /* always allocate the top pgd */
4140         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4141         if (!domain->pgd)
4142                 return -ENOMEM;
4143         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4144         return 0;
4145 }
4146
4147 static int intel_iommu_domain_init(struct iommu_domain *domain)
4148 {
4149         struct dmar_domain *dmar_domain;
4150
4151         dmar_domain = alloc_domain(true);
4152         if (!dmar_domain) {
4153                 printk(KERN_ERR
4154                         "intel_iommu_domain_init: dmar_domain == NULL\n");
4155                 return -ENOMEM;
4156         }
4157         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4158                 printk(KERN_ERR
4159                         "intel_iommu_domain_init() failed\n");
4160                 domain_exit(dmar_domain);
4161                 return -ENOMEM;
4162         }
4163         domain_update_iommu_cap(dmar_domain);
4164         domain->priv = dmar_domain;
4165
4166         domain->geometry.aperture_start = 0;
4167         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4168         domain->geometry.force_aperture = true;
4169
4170         return 0;
4171 }
4172
4173 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
4174 {
4175         struct dmar_domain *dmar_domain = domain->priv;
4176
4177         domain->priv = NULL;
4178         domain_exit(dmar_domain);
4179 }
4180
4181 static int intel_iommu_attach_device(struct iommu_domain *domain,
4182                                      struct device *dev)
4183 {
4184         struct dmar_domain *dmar_domain = domain->priv;
4185         struct intel_iommu *iommu;
4186         int addr_width;
4187         u8 bus, devfn;
4188
4189         /* normally dev is not mapped */
4190         if (unlikely(domain_context_mapped(dev))) {
4191                 struct dmar_domain *old_domain;
4192
4193                 old_domain = find_domain(dev);
4194                 if (old_domain) {
4195                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4196                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4197                                 domain_remove_one_dev_info(old_domain, dev);
4198                         else
4199                                 domain_remove_dev_info(old_domain);
4200                 }
4201         }
4202
4203         iommu = device_to_iommu(dev, &bus, &devfn);
4204         if (!iommu)
4205                 return -ENODEV;
4206
4207         /* check if this iommu agaw is sufficient for max mapped address */
4208         addr_width = agaw_to_width(iommu->agaw);
4209         if (addr_width > cap_mgaw(iommu->cap))
4210                 addr_width = cap_mgaw(iommu->cap);
4211
4212         if (dmar_domain->max_addr > (1LL << addr_width)) {
4213                 printk(KERN_ERR "%s: iommu width (%d) is not "
4214                        "sufficient for the mapped address (%llx)\n",
4215                        __func__, addr_width, dmar_domain->max_addr);
4216                 return -EFAULT;
4217         }
4218         dmar_domain->gaw = addr_width;
4219
4220         /*
4221          * Knock out extra levels of page tables if necessary
4222          */
4223         while (iommu->agaw < dmar_domain->agaw) {
4224                 struct dma_pte *pte;
4225
4226                 pte = dmar_domain->pgd;
4227                 if (dma_pte_present(pte)) {
4228                         dmar_domain->pgd = (struct dma_pte *)
4229                                 phys_to_virt(dma_pte_addr(pte));
4230                         free_pgtable_page(pte);
4231                 }
4232                 dmar_domain->agaw--;
4233         }
4234
4235         return domain_add_dev_info(dmar_domain, dev, CONTEXT_TT_MULTI_LEVEL);
4236 }
4237
4238 static void intel_iommu_detach_device(struct iommu_domain *domain,
4239                                       struct device *dev)
4240 {
4241         struct dmar_domain *dmar_domain = domain->priv;
4242
4243         domain_remove_one_dev_info(dmar_domain, dev);
4244 }
4245
4246 static int intel_iommu_map(struct iommu_domain *domain,
4247                            unsigned long iova, phys_addr_t hpa,
4248                            size_t size, int iommu_prot)
4249 {
4250         struct dmar_domain *dmar_domain = domain->priv;
4251         u64 max_addr;
4252         int prot = 0;
4253         int ret;
4254
4255         if (iommu_prot & IOMMU_READ)
4256                 prot |= DMA_PTE_READ;
4257         if (iommu_prot & IOMMU_WRITE)
4258                 prot |= DMA_PTE_WRITE;
4259         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4260                 prot |= DMA_PTE_SNP;
4261
4262         max_addr = iova + size;
4263         if (dmar_domain->max_addr < max_addr) {
4264                 u64 end;
4265
4266                 /* check if minimum agaw is sufficient for mapped address */
4267                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4268                 if (end < max_addr) {
4269                         printk(KERN_ERR "%s: iommu width (%d) is not "
4270                                "sufficient for the mapped address (%llx)\n",
4271                                __func__, dmar_domain->gaw, max_addr);
4272                         return -EFAULT;
4273                 }
4274                 dmar_domain->max_addr = max_addr;
4275         }
4276         /* Round up size to next multiple of PAGE_SIZE, if it and
4277            the low bits of hpa would take us onto the next page */
4278         size = aligned_nrpages(hpa, size);
4279         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4280                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4281         return ret;
4282 }
4283
4284 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4285                                 unsigned long iova, size_t size)
4286 {
4287         struct dmar_domain *dmar_domain = domain->priv;
4288         struct page *freelist = NULL;
4289         struct intel_iommu *iommu;
4290         unsigned long start_pfn, last_pfn;
4291         unsigned int npages;
4292         int iommu_id, num, ndomains, level = 0;
4293
4294         /* Cope with horrid API which requires us to unmap more than the
4295            size argument if it happens to be a large-page mapping. */
4296         if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4297                 BUG();
4298
4299         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4300                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4301
4302         start_pfn = iova >> VTD_PAGE_SHIFT;
4303         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4304
4305         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4306
4307         npages = last_pfn - start_pfn + 1;
4308
4309         for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4310                iommu = g_iommus[iommu_id];
4311
4312                /*
4313                 * find bit position of dmar_domain
4314                 */
4315                ndomains = cap_ndoms(iommu->cap);
4316                for_each_set_bit(num, iommu->domain_ids, ndomains) {
4317                        if (iommu->domains[num] == dmar_domain)
4318                                iommu_flush_iotlb_psi(iommu, num, start_pfn,
4319                                                      npages, !freelist, 0);
4320                }
4321
4322         }
4323
4324         dma_free_pagelist(freelist);
4325
4326         if (dmar_domain->max_addr == iova + size)
4327                 dmar_domain->max_addr = iova;
4328
4329         return size;
4330 }
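/*
 * Worked example for the size fix-up at the top of intel_iommu_unmap():
 * if the IOVA is covered by a 2MiB superpage, pfn_to_dma_pte() reports
 * level 2, level_to_offset_bits(2) == 9, and
 *
 *	VTD_PAGE_SIZE << 9  ==  4KiB << 9  ==  2MiB
 *
 * so a 4KiB unmap request is widened to the whole 2MiB mapping, which is
 * the size this callback must return to the IOMMU core.
 */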
4331
4332 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4333                                             dma_addr_t iova)
4334 {
4335         struct dmar_domain *dmar_domain = domain->priv;
4336         struct dma_pte *pte;
4337         int level = 0;
4338         u64 phys = 0;
4339
4340         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4341         if (pte)
4342                 phys = dma_pte_addr(pte);
4343
4344         return phys;
4345 }
4346
4347 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4348                                       unsigned long cap)
4349 {
4350         struct dmar_domain *dmar_domain = domain->priv;
4351
4352         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4353                 return dmar_domain->iommu_snooping;
4354         if (cap == IOMMU_CAP_INTR_REMAP)
4355                 return irq_remapping_enabled;
4356
4357         return 0;
4358 }
4359
4360 #define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4361
4362 static int intel_iommu_add_device(struct device *dev)
4363 {
4364         struct pci_dev *pdev = to_pci_dev(dev);
4365         struct pci_dev *bridge, *dma_pdev = NULL;
4366         struct iommu_group *group;
4367         int ret;
4368         u8 bus, devfn;
4369
4370         if (!device_to_iommu(dev, &bus, &devfn))
4371                 return -ENODEV;
4372
4373         bridge = pci_find_upstream_pcie_bridge(pdev);
4374         if (bridge) {
4375                 if (pci_is_pcie(bridge))
4376                         dma_pdev = pci_get_domain_bus_and_slot(
4377                                                 pci_domain_nr(pdev->bus),
4378                                                 bridge->subordinate->number, 0);
4379                 if (!dma_pdev)
4380                         dma_pdev = pci_dev_get(bridge);
4381         } else
4382                 dma_pdev = pci_dev_get(pdev);
4383
4384         /* Account for quirked devices */
4385         swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4386
4387         /*
4388          * If it's a multifunction device that does not support our
4389          * required ACS flags, add it to the same group as the lowest numbered
4390          * function that also does not support the required ACS flags.
4391          */
4392         if (dma_pdev->multifunction &&
4393             !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) {
4394                 u8 i, slot = PCI_SLOT(dma_pdev->devfn);
4395
4396                 for (i = 0; i < 8; i++) {
4397                         struct pci_dev *tmp;
4398
4399                         tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i));
4400                         if (!tmp)
4401                                 continue;
4402
4403                         if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) {
4404                                 swap_pci_ref(&dma_pdev, tmp);
4405                                 break;
4406                         }
4407                         pci_dev_put(tmp);
4408                 }
4409         }
4410
4411         /*
4412          * Devices on the root bus go through the iommu.  If that's not us,
4413          * find the next upstream device and test ACS up to the root bus.
4414          * Finding the next device may require skipping virtual buses.
4415          */
4416         while (!pci_is_root_bus(dma_pdev->bus)) {
4417                 struct pci_bus *bus = dma_pdev->bus;
4418
4419                 while (!bus->self) {
4420                         if (!pci_is_root_bus(bus))
4421                                 bus = bus->parent;
4422                         else
4423                                 goto root_bus;
4424                 }
4425
4426                 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4427                         break;
4428
4429                 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4430         }
4431
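             /*
              * Group with whichever device the walk ended on: reuse its
              * existing iommu_group if it has one, otherwise allocate a
              * new group for it.
              */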
4432 root_bus:
4433         group = iommu_group_get(&dma_pdev->dev);
4434         pci_dev_put(dma_pdev);
4435         if (!group) {
4436                 group = iommu_group_alloc();
4437                 if (IS_ERR(group))
4438                         return PTR_ERR(group);
4439         }
4440
4441         ret = iommu_group_add_device(group, dev);
4442
4443         iommu_group_put(group);
4444         return ret;
4445 }
4446
4447 static void intel_iommu_remove_device(struct device *dev)
4448 {
4449         iommu_group_remove_device(dev);
4450 }
4451
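     /*
      * Callbacks exposed to the generic IOMMU core; registered for the
      * PCI bus via bus_set_iommu() in intel_iommu_init().
      */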
4452 static struct iommu_ops intel_iommu_ops = {
4453         .domain_init    = intel_iommu_domain_init,
4454         .domain_destroy = intel_iommu_domain_destroy,
4455         .attach_dev     = intel_iommu_attach_device,
4456         .detach_dev     = intel_iommu_detach_device,
4457         .map            = intel_iommu_map,
4458         .unmap          = intel_iommu_unmap,
4459         .iova_to_phys   = intel_iommu_iova_to_phys,
4460         .domain_has_cap = intel_iommu_domain_has_cap,
4461         .add_device     = intel_iommu_add_device,
4462         .remove_device  = intel_iommu_remove_device,
4463         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4464 };
4465
4466 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4467 {
4468         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4469         printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4470         dmar_map_gfx = 0;
4471 }
4472
4473 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4474 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4475 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4476 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4477 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4478 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4479 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4480
4481 static void quirk_iommu_rwbf(struct pci_dev *dev)
4482 {
4483         /*
4484          * Mobile 4 Series Chipset neglects to set RWBF capability,
4485          * but needs it. Same seems to hold for the desktop versions.
4486          */
4487         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4488         rwbf_quirk = 1;
4489 }
4490
4491 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4492 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4493 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4494 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4495 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4496 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4497 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4498
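     /*
      * GGC is the graphics control register in the integrated graphics
      * device's config space.  The field masked by GGC_MEMORY_SIZE_MASK
      * encodes how much space the BIOS set aside for the GTT and whether
      * it includes a VT-enabled (shadow) GTT.
      */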
4499 #define GGC 0x52
4500 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4501 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4502 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4503 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4504 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4505 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4506 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4507 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4508
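     /*
      * On Calpella/Ironlake integrated graphics, remapping only works if
      * the BIOS allocated a VT-enabled (shadow) GTT.  If it did not, leave
      * graphics untranslated; if it did, batched IOTLB flushing must still
      * be disabled.
      */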
4509 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4510 {
4511         unsigned short ggc;
4512
4513         if (pci_read_config_word(dev, GGC, &ggc))
4514                 return;
4515
4516         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4517                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4518                 dmar_map_gfx = 0;
4519         } else if (dmar_map_gfx) {
4520                 /* we have to ensure the gfx device is idle before we flush */
4521                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4522                 intel_iommu_strict = 1;
4523         }
4524 }
4525 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4526 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4527 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4528 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4529
4530 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4531    ISOCH DMAR unit for the Azalia sound device, but not give it any
4532    TLB entries, which causes it to deadlock. Check for that.  We do
4533    this in a function called from init_dmars(), instead of in a PCI
4534    quirk, because we don't want to print the obnoxious "BIOS broken"
4535    message if VT-d is actually disabled.
4536 */
4537 static void __init check_tylersburg_isoch(void)
4538 {
4539         struct pci_dev *pdev;
4540         uint32_t vtisochctrl;
4541
4542         /* If there's no Azalia in the system anyway, forget it. */
4543         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4544         if (!pdev)
4545                 return;
4546         pci_dev_put(pdev);
4547
4548         /* System Management Registers. Might be hidden, in which case
4549            we can't do the sanity check. But that's OK, because the
4550            known-broken BIOSes _don't_ actually hide it, so far. */
4551         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4552         if (!pdev)
4553                 return;
4554
4555         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4556                 pci_dev_put(pdev);
4557                 return;
4558         }
4559
4560         pci_dev_put(pdev);
4561
4562         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4563         if (vtisochctrl & 1)
4564                 return;
4565
4566         /* Drop all bits other than the number of TLB entries */
4567         vtisochctrl &= 0x1c;
4568
4569         /* If we have the recommended number of TLB entries (16), fine. */
4570         if (vtisochctrl == 0x10)
4571                 return;
4572
4573         /* Zero TLB entries? You get to ride the short bus to school. */
4574         if (!vtisochctrl) {
4575                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4576                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4577                      dmi_get_system_info(DMI_BIOS_VENDOR),
4578                      dmi_get_system_info(DMI_BIOS_VERSION),
4579                      dmi_get_system_info(DMI_PRODUCT_VERSION));
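                     /*
                      * Work around it by forcing an identity (1:1) mapping
                      * for the Azalia sound device.
                      */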
4580                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4581                 return;
4582         }
4583
4584         printk(KERN_WARNING "DMAR: Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4585                vtisochctrl);
4586 }