iommu/vt-d: Replace iommu_bmp with a refcount
[firefly-linux-kernel-4.4.55.git] drivers/iommu/intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <linux/dma-contiguous.h>
46 #include <linux/crash_dump.h>
47 #include <asm/irq_remapping.h>
48 #include <asm/cacheflush.h>
49 #include <asm/iommu.h>
50
51 #include "irq_remapping.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
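/*
 * Worked example: with the default gaw of 48 and VTD_PAGE_SHIFT == 12,
 * __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1.  That fits in an unsigned long
 * on 64-bit builds; on 32-bit builds the min_t() above clamps
 * DOMAIN_MAX_PFN to ~0UL, so PFN arithmetic stays within 'unsigned long'.
 */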
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
84 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
85
86 /* page table handling */
87 #define LEVEL_STRIDE            (9)
88 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
89
90 /*
91  * This bitmap is used to advertise the page sizes our hardware supports
92  * to the IOMMU core, which will then use this information to split
93  * physically contiguous memory regions it is mapping into page sizes
94  * that we support.
95  *
96  * Traditionally the IOMMU core just handed us the mappings directly,
97  * after making sure the size is a power-of-two multiple of a 4KiB page and that the
98  * mapping has natural alignment.
99  *
100  * To retain this behavior, we currently advertise that we support
101  * all page sizes that are a power-of-two multiple of 4KiB.
102  *
103  * If at some point we'd like to utilize the IOMMU core's new behavior,
104  * we could change this to advertise the real page sizes we support.
105  */
106 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
107
108 static inline int agaw_to_level(int agaw)
109 {
110         return agaw + 2;
111 }
112
113 static inline int agaw_to_width(int agaw)
114 {
115         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 }
117
118 static inline int width_to_agaw(int width)
119 {
120         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 }
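/*
 * Worked example for the agaw helpers above: a 48-bit address width gives
 * width_to_agaw(48) == DIV_ROUND_UP(48 - 30, 9) == 2, agaw_to_level(2) == 4
 * (a four-level page table), and agaw_to_width(2) == min(30 + 2 * 9, 64) == 48.
 */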
122
123 static inline unsigned int level_to_offset_bits(int level)
124 {
125         return (level - 1) * LEVEL_STRIDE;
126 }
127
128 static inline int pfn_level_offset(unsigned long pfn, int level)
129 {
130         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 }
132
133 static inline unsigned long level_mask(int level)
134 {
135         return -1UL << level_to_offset_bits(level);
136 }
137
138 static inline unsigned long level_size(int level)
139 {
140         return 1UL << level_to_offset_bits(level);
141 }
142
143 static inline unsigned long align_to_level(unsigned long pfn, int level)
144 {
145         return (pfn + level_size(level) - 1) & level_mask(level);
146 }
147
148 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
149 {
150         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
151 }
152
153 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
154    are never going to work. */
155 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
156 {
157         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 }
159
160 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
161 {
162         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
163 }
164 static inline unsigned long page_to_dma_pfn(struct page *pg)
165 {
166         return mm_to_dma_pfn(page_to_pfn(pg));
167 }
168 static inline unsigned long virt_to_dma_pfn(void *p)
169 {
170         return page_to_dma_pfn(virt_to_page(p));
171 }
172
173 /* global iommu list, set NULL for ignored DMAR units */
174 static struct intel_iommu **g_iommus;
175
176 static void __init check_tylersburg_isoch(void);
177 static int rwbf_quirk;
178
179 /*
180  * set to 1 to panic kernel if can't successfully enable VT-d
181  * (used when kernel is launched w/ TXT)
182  */
183 static int force_on = 0;
184
185 /*
186  * 0: Present
187  * 1-11: Reserved
188  * 12-63: Context Ptr (12 - (haw-1))
189  * 64-127: Reserved
190  */
191 struct root_entry {
192         u64     lo;
193         u64     hi;
194 };
195 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
196
197 /*
198  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
199  * if marked present.
200  */
201 static phys_addr_t root_entry_lctp(struct root_entry *re)
202 {
203         if (!(re->lo & 1))
204                 return 0;
205
206         return re->lo & VTD_PAGE_MASK;
207 }
208
209 /*
210  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
211  * if marked present.
212  */
213 static phys_addr_t root_entry_uctp(struct root_entry *re)
214 {
215         if (!(re->hi & 1))
216                 return 0;
217
218         return re->hi & VTD_PAGE_MASK;
219 }
220 /*
221  * low 64 bits:
222  * 0: present
223  * 1: fault processing disable
224  * 2-3: translation type
225  * 12-63: address space root
226  * high 64 bits:
227  * 0-2: address width
228  * 3-6: avail (available for software use)
229  * 8-23: domain id
230  */
231 struct context_entry {
232         u64 lo;
233         u64 hi;
234 };
235
236 static inline void context_clear_pasid_enable(struct context_entry *context)
237 {
238         context->lo &= ~(1ULL << 11);
239 }
240
241 static inline bool context_pasid_enabled(struct context_entry *context)
242 {
243         return !!(context->lo & (1ULL << 11));
244 }
245
246 static inline void context_set_copied(struct context_entry *context)
247 {
248         context->hi |= (1ull << 3);
249 }
250
251 static inline bool context_copied(struct context_entry *context)
252 {
253         return !!(context->hi & (1ULL << 3));
254 }
255
256 static inline bool __context_present(struct context_entry *context)
257 {
258         return (context->lo & 1);
259 }
260
261 static inline bool context_present(struct context_entry *context)
262 {
263         return context_pasid_enabled(context) ?
264              __context_present(context) :
265              __context_present(context) && !context_copied(context);
266 }
267
268 static inline void context_set_present(struct context_entry *context)
269 {
270         context->lo |= 1;
271 }
272
273 static inline void context_set_fault_enable(struct context_entry *context)
274 {
275         context->lo &= (((u64)-1) << 2) | 1;
276 }
277
278 static inline void context_set_translation_type(struct context_entry *context,
279                                                 unsigned long value)
280 {
281         context->lo &= (((u64)-1) << 4) | 3;
282         context->lo |= (value & 3) << 2;
283 }
284
285 static inline void context_set_address_root(struct context_entry *context,
286                                             unsigned long value)
287 {
288         context->lo &= ~VTD_PAGE_MASK;
289         context->lo |= value & VTD_PAGE_MASK;
290 }
291
292 static inline void context_set_address_width(struct context_entry *context,
293                                              unsigned long value)
294 {
295         context->hi |= value & 7;
296 }
297
298 static inline void context_set_domain_id(struct context_entry *context,
299                                          unsigned long value)
300 {
301         context->hi |= (value & ((1 << 16) - 1)) << 8;
302 }
303
304 static inline int context_domain_id(struct context_entry *c)
305 {
306         return((c->hi >> 8) & 0xffff);
307 }
308
309 static inline void context_clear_entry(struct context_entry *context)
310 {
311         context->lo = 0;
312         context->hi = 0;
313 }
314
315 /*
316  * 0: readable
317  * 1: writable
318  * 2-6: reserved
319  * 7: super page
320  * 8-10: available
321  * 11: snoop behavior
322  * 12-63: Host physical address
323  */
324 struct dma_pte {
325         u64 val;
326 };
327
328 static inline void dma_clear_pte(struct dma_pte *pte)
329 {
330         pte->val = 0;
331 }
332
333 static inline u64 dma_pte_addr(struct dma_pte *pte)
334 {
335 #ifdef CONFIG_64BIT
336         return pte->val & VTD_PAGE_MASK;
337 #else
338         /* Must have a full atomic 64-bit read */
339         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
340 #endif
341 }
342
343 static inline bool dma_pte_present(struct dma_pte *pte)
344 {
345         return (pte->val & 3) != 0;
346 }
347
348 static inline bool dma_pte_superpage(struct dma_pte *pte)
349 {
350         return (pte->val & DMA_PTE_LARGE_PAGE);
351 }
352
353 static inline int first_pte_in_page(struct dma_pte *pte)
354 {
355         return !((unsigned long)pte & ~VTD_PAGE_MASK);
356 }
357
358 /*
359  * This domain is a static identity mapping domain.
360  *      1. This domain creates a static 1:1 mapping to all usable memory.
361  *      2. It maps to each iommu if successful.
362  *      3. Each iommu maps to this domain if successful.
363  */
364 static struct dmar_domain *si_domain;
365 static int hw_pass_through = 1;
366
367 /*
368  * Domain represents a virtual machine; more than one device
369  * across iommus may be owned by one domain, e.g. a kvm guest.
370  */
371 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
372
373 /* si_domain contains multiple devices */
374 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
375
376 #define for_each_domain_iommu(idx, domain)                      \
377         for (idx = 0; idx < g_num_of_iommus; idx++)             \
378                 if (domain->iommu_refcnt[idx])
379
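/*
 * Per the patch title, the old per-domain iommu_bmp bitmap is replaced by
 * iommu_refcnt[] below: a per-IOMMU count of devices attached to the domain.
 * for_each_domain_iommu() therefore only visits units whose count is
 * non-zero; the attach/detach paths are expected to adjust the count as
 * devices come and go.
 */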
380 struct dmar_domain {
381         int     nid;                    /* node id */
382
383         unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];
384                                         /* Refcount of devices per iommu */
385
386
387         u16             iommu_did[DMAR_UNITS_SUPPORTED];
388                                         /* Domain ids per IOMMU. Use u16 since
389                                          * domain ids are 16 bit wide according
390                                          * to VT-d spec, section 9.3 */
391
392         struct list_head devices;       /* all devices' list */
393         struct iova_domain iovad;       /* iova's that belong to this domain */
394
395         struct dma_pte  *pgd;           /* virtual address */
396         int             gaw;            /* max guest address width */
397
398         /* adjusted guest address width, 0 is level 2 30-bit */
399         int             agaw;
400
401         int             flags;          /* flags to find out type of domain */
402
403         int             iommu_coherency;/* indicate coherency of iommu access */
404         int             iommu_snooping; /* indicate snooping control feature*/
405         int             iommu_count;    /* reference count of iommu */
406         int             iommu_superpage;/* Level of superpages supported:
407                                            0 == 4KiB (no superpages), 1 == 2MiB,
408                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
409         spinlock_t      iommu_lock;     /* protect iommu set in domain */
410         u64             max_addr;       /* maximum mapped address */
411
412         struct iommu_domain domain;     /* generic domain data structure for
413                                            iommu core */
414 };
415
416 /* PCI domain-device relationship */
417 struct device_domain_info {
418         struct list_head link;  /* link to domain siblings */
419         struct list_head global; /* link to global list */
420         u8 bus;                 /* PCI bus number */
421         u8 devfn;               /* PCI devfn number */
422         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
423         struct intel_iommu *iommu; /* IOMMU used by this device */
424         struct dmar_domain *domain; /* pointer to domain */
425 };
426
427 struct dmar_rmrr_unit {
428         struct list_head list;          /* list of rmrr units   */
429         struct acpi_dmar_header *hdr;   /* ACPI header          */
430         u64     base_address;           /* reserved base address*/
431         u64     end_address;            /* reserved end address */
432         struct dmar_dev_scope *devices; /* target devices */
433         int     devices_cnt;            /* target device count */
434 };
435
436 struct dmar_atsr_unit {
437         struct list_head list;          /* list of ATSR units */
438         struct acpi_dmar_header *hdr;   /* ACPI header */
439         struct dmar_dev_scope *devices; /* target devices */
440         int devices_cnt;                /* target device count */
441         u8 include_all:1;               /* include all ports */
442 };
443
444 static LIST_HEAD(dmar_atsr_units);
445 static LIST_HEAD(dmar_rmrr_units);
446
447 #define for_each_rmrr_units(rmrr) \
448         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
449
450 static void flush_unmaps_timeout(unsigned long data);
451
452 static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
453
454 #define HIGH_WATER_MARK 250
455 struct deferred_flush_tables {
456         int next;
457         struct iova *iova[HIGH_WATER_MARK];
458         struct dmar_domain *domain[HIGH_WATER_MARK];
459         struct page *freelist[HIGH_WATER_MARK];
460 };
461
462 static struct deferred_flush_tables *deferred_flush;
463
464 /* number of IOMMUs; used to size and index g_iommus */
465 static int g_num_of_iommus;
466
467 static DEFINE_SPINLOCK(async_umap_flush_lock);
468 static LIST_HEAD(unmaps_to_do);
469
470 static int timer_on;
471 static long list_size;
472
473 static void domain_exit(struct dmar_domain *domain);
474 static void domain_remove_dev_info(struct dmar_domain *domain);
475 static void domain_remove_one_dev_info(struct dmar_domain *domain,
476                                        struct device *dev);
477 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
478                                            struct device *dev);
479 static int domain_detach_iommu(struct dmar_domain *domain,
480                                struct intel_iommu *iommu);
481
482 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
483 int dmar_disabled = 0;
484 #else
485 int dmar_disabled = 1;
486 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
487
488 int intel_iommu_enabled = 0;
489 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
490
491 static int dmar_map_gfx = 1;
492 static int dmar_forcedac;
493 static int intel_iommu_strict;
494 static int intel_iommu_superpage = 1;
495 static int intel_iommu_ecs = 1;
496
497 /* We only actually use ECS when PASID support (on the new bit 40)
498  * is also advertised. Some early implementations — the ones with
499  * PASID support on bit 28 — have issues even when we *only* use
500  * extended root/context tables. */
501 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
502                             ecap_pasid(iommu->ecap))
503
504 int intel_iommu_gfx_mapped;
505 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
506
507 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
508 static DEFINE_SPINLOCK(device_domain_lock);
509 static LIST_HEAD(device_domain_list);
510
511 static const struct iommu_ops intel_iommu_ops;
512
513 static bool translation_pre_enabled(struct intel_iommu *iommu)
514 {
515         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
516 }
517
518 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
519 {
520         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
521 }
522
523 static void init_translation_status(struct intel_iommu *iommu)
524 {
525         u32 gsts;
526
527         gsts = readl(iommu->reg + DMAR_GSTS_REG);
528         if (gsts & DMA_GSTS_TES)
529                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
530 }
531
532 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
533 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
534 {
535         return container_of(dom, struct dmar_domain, domain);
536 }
537
538 static int __init intel_iommu_setup(char *str)
539 {
540         if (!str)
541                 return -EINVAL;
542         while (*str) {
543                 if (!strncmp(str, "on", 2)) {
544                         dmar_disabled = 0;
545                         pr_info("IOMMU enabled\n");
546                 } else if (!strncmp(str, "off", 3)) {
547                         dmar_disabled = 1;
548                         pr_info("IOMMU disabled\n");
549                 } else if (!strncmp(str, "igfx_off", 8)) {
550                         dmar_map_gfx = 0;
551                         pr_info("Disable GFX device mapping\n");
552                 } else if (!strncmp(str, "forcedac", 8)) {
553                         pr_info("Forcing DAC for PCI devices\n");
554                         dmar_forcedac = 1;
555                 } else if (!strncmp(str, "strict", 6)) {
556                         pr_info("Disable batched IOTLB flush\n");
557                         intel_iommu_strict = 1;
558                 } else if (!strncmp(str, "sp_off", 6)) {
559                         pr_info("Disable supported super page\n");
560                         intel_iommu_superpage = 0;
561                 } else if (!strncmp(str, "ecs_off", 7)) {
562                         printk(KERN_INFO
563                                 "Intel-IOMMU: disable extended context table support\n");
564                         intel_iommu_ecs = 0;
565                 }
566
567                 str += strcspn(str, ",");
568                 while (*str == ',')
569                         str++;
570         }
571         return 0;
572 }
573 __setup("intel_iommu=", intel_iommu_setup);
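/*
 * Example usage (illustrative): booting with
 *
 *     intel_iommu=on,strict,sp_off
 *
 * enables DMA remapping, disables batched IOTLB flushing and disables
 * superpage support.  Options are parsed as a comma-separated list by
 * intel_iommu_setup() above.
 */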
574
575 static struct kmem_cache *iommu_domain_cache;
576 static struct kmem_cache *iommu_devinfo_cache;
577
578 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
579 {
580         struct dmar_domain **domains;
581         int idx = did >> 8;
582
583         domains = iommu->domains[idx];
584         if (!domains)
585                 return NULL;
586
587         return domains[did & 0xff];
588 }
589
590 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
591                              struct dmar_domain *domain)
592 {
593         struct dmar_domain **domains;
594         int idx = did >> 8;
595
596         if (!iommu->domains[idx]) {
597                 size_t size = 256 * sizeof(struct dmar_domain *);
598                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
599         }
600
601         domains = iommu->domains[idx];
602         if (WARN_ON(!domains))
603                 return;
604         else
605                 domains[did & 0xff] = domain;
606 }
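/*
 * The domains array is a lazily populated two-level table: the upper 8 bits
 * of a domain-id pick one of the iommu->domains[] blocks (allocated on first
 * use here and in iommu_init_domains() below), and the low 8 bits index into
 * that 256-pointer block.
 */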
607
608 static inline void *alloc_pgtable_page(int node)
609 {
610         struct page *page;
611         void *vaddr = NULL;
612
613         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
614         if (page)
615                 vaddr = page_address(page);
616         return vaddr;
617 }
618
619 static inline void free_pgtable_page(void *vaddr)
620 {
621         free_page((unsigned long)vaddr);
622 }
623
624 static inline void *alloc_domain_mem(void)
625 {
626         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
627 }
628
629 static void free_domain_mem(void *vaddr)
630 {
631         kmem_cache_free(iommu_domain_cache, vaddr);
632 }
633
634 static inline void * alloc_devinfo_mem(void)
635 {
636         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
637 }
638
639 static inline void free_devinfo_mem(void *vaddr)
640 {
641         kmem_cache_free(iommu_devinfo_cache, vaddr);
642 }
643
644 static inline int domain_type_is_vm(struct dmar_domain *domain)
645 {
646         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
647 }
648
649 static inline int domain_type_is_si(struct dmar_domain *domain)
650 {
651         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
652 }
653
654 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
655 {
656         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
657                                 DOMAIN_FLAG_STATIC_IDENTITY);
658 }
659
660 static inline int domain_pfn_supported(struct dmar_domain *domain,
661                                        unsigned long pfn)
662 {
663         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
664
665         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
666 }
667
668 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
669 {
670         unsigned long sagaw;
671         int agaw = -1;
672
673         sagaw = cap_sagaw(iommu->cap);
674         for (agaw = width_to_agaw(max_gaw);
675              agaw >= 0; agaw--) {
676                 if (test_bit(agaw, &sagaw))
677                         break;
678         }
679
680         return agaw;
681 }
682
683 /*
684  * Calculate max SAGAW for each iommu.
685  */
686 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
687 {
688         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
689 }
690
691 /*
692  * Calculate agaw for each iommu.
693  * "SAGAW" may be different across iommus; use a default agaw, and
694  * fall back to a smaller supported agaw for iommus that don't support the default.
695  */
696 int iommu_calculate_agaw(struct intel_iommu *iommu)
697 {
698         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
699 }
700
701 /* This function only returns a single iommu in a domain */
702 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
703 {
704         int iommu_id;
705
706         /* si_domain and vm domain should not get here. */
707         BUG_ON(domain_type_is_vm_or_si(domain));
708         for_each_domain_iommu(iommu_id, domain)
709                 break;
710
711         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
712                 return NULL;
713
714         return g_iommus[iommu_id];
715 }
716
717 static void domain_update_iommu_coherency(struct dmar_domain *domain)
718 {
719         struct dmar_drhd_unit *drhd;
720         struct intel_iommu *iommu;
721         bool found = false;
722         int i;
723
724         domain->iommu_coherency = 1;
725
726         for_each_domain_iommu(i, domain) {
727                 found = true;
728                 if (!ecap_coherent(g_iommus[i]->ecap)) {
729                         domain->iommu_coherency = 0;
730                         break;
731                 }
732         }
733         if (found)
734                 return;
735
736         /* No hardware attached; use lowest common denominator */
737         rcu_read_lock();
738         for_each_active_iommu(iommu, drhd) {
739                 if (!ecap_coherent(iommu->ecap)) {
740                         domain->iommu_coherency = 0;
741                         break;
742                 }
743         }
744         rcu_read_unlock();
745 }
746
747 static int domain_update_iommu_snooping(struct intel_iommu *skip)
748 {
749         struct dmar_drhd_unit *drhd;
750         struct intel_iommu *iommu;
751         int ret = 1;
752
753         rcu_read_lock();
754         for_each_active_iommu(iommu, drhd) {
755                 if (iommu != skip) {
756                         if (!ecap_sc_support(iommu->ecap)) {
757                                 ret = 0;
758                                 break;
759                         }
760                 }
761         }
762         rcu_read_unlock();
763
764         return ret;
765 }
766
767 static int domain_update_iommu_superpage(struct intel_iommu *skip)
768 {
769         struct dmar_drhd_unit *drhd;
770         struct intel_iommu *iommu;
771         int mask = 0xf;
772
773         if (!intel_iommu_superpage) {
774                 return 0;
775         }
776
777         /* set iommu_superpage to the smallest common denominator */
778         rcu_read_lock();
779         for_each_active_iommu(iommu, drhd) {
780                 if (iommu != skip) {
781                         mask &= cap_super_page_val(iommu->cap);
782                         if (!mask)
783                                 break;
784                 }
785         }
786         rcu_read_unlock();
787
788         return fls(mask);
789 }
790
791 /* Some capabilities may be different across iommus */
792 static void domain_update_iommu_cap(struct dmar_domain *domain)
793 {
794         domain_update_iommu_coherency(domain);
795         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
796         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
797 }
798
799 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
800                                                        u8 bus, u8 devfn, int alloc)
801 {
802         struct root_entry *root = &iommu->root_entry[bus];
803         struct context_entry *context;
804         u64 *entry;
805
806         entry = &root->lo;
807         if (ecs_enabled(iommu)) {
808                 if (devfn >= 0x80) {
809                         devfn -= 0x80;
810                         entry = &root->hi;
811                 }
812                 devfn *= 2;
813         }
814         if (*entry & 1)
815                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
816         else {
817                 unsigned long phy_addr;
818                 if (!alloc)
819                         return NULL;
820
821                 context = alloc_pgtable_page(iommu->node);
822                 if (!context)
823                         return NULL;
824
825                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
826                 phy_addr = virt_to_phys((void *)context);
827                 *entry = phy_addr | 1;
828                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
829         }
830         return &context[devfn];
831 }
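/*
 * Layout note for iommu_context_addr(): in extended-context (ECS) mode the
 * root entry is split in two, with root->lo covering devfn 0x00-0x7f and
 * root->hi covering devfn 0x80-0xff.  Each function then occupies two
 * consecutive 128-bit context_entry slots (hence devfn *= 2), so a single
 * 4KiB context page still covers one half of the bus.
 */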
832
833 static int iommu_dummy(struct device *dev)
834 {
835         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
836 }
837
838 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
839 {
840         struct dmar_drhd_unit *drhd = NULL;
841         struct intel_iommu *iommu;
842         struct device *tmp;
843         struct pci_dev *ptmp, *pdev = NULL;
844         u16 segment = 0;
845         int i;
846
847         if (iommu_dummy(dev))
848                 return NULL;
849
850         if (dev_is_pci(dev)) {
851                 pdev = to_pci_dev(dev);
852                 segment = pci_domain_nr(pdev->bus);
853         } else if (has_acpi_companion(dev))
854                 dev = &ACPI_COMPANION(dev)->dev;
855
856         rcu_read_lock();
857         for_each_active_iommu(iommu, drhd) {
858                 if (pdev && segment != drhd->segment)
859                         continue;
860
861                 for_each_active_dev_scope(drhd->devices,
862                                           drhd->devices_cnt, i, tmp) {
863                         if (tmp == dev) {
864                                 *bus = drhd->devices[i].bus;
865                                 *devfn = drhd->devices[i].devfn;
866                                 goto out;
867                         }
868
869                         if (!pdev || !dev_is_pci(tmp))
870                                 continue;
871
872                         ptmp = to_pci_dev(tmp);
873                         if (ptmp->subordinate &&
874                             ptmp->subordinate->number <= pdev->bus->number &&
875                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
876                                 goto got_pdev;
877                 }
878
879                 if (pdev && drhd->include_all) {
880                 got_pdev:
881                         *bus = pdev->bus->number;
882                         *devfn = pdev->devfn;
883                         goto out;
884                 }
885         }
886         iommu = NULL;
887  out:
888         rcu_read_unlock();
889
890         return iommu;
891 }
892
893 static void domain_flush_cache(struct dmar_domain *domain,
894                                void *addr, int size)
895 {
896         if (!domain->iommu_coherency)
897                 clflush_cache_range(addr, size);
898 }
899
900 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
901 {
902         struct context_entry *context;
903         int ret = 0;
904         unsigned long flags;
905
906         spin_lock_irqsave(&iommu->lock, flags);
907         context = iommu_context_addr(iommu, bus, devfn, 0);
908         if (context)
909                 ret = context_present(context);
910         spin_unlock_irqrestore(&iommu->lock, flags);
911         return ret;
912 }
913
914 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
915 {
916         struct context_entry *context;
917         unsigned long flags;
918
919         spin_lock_irqsave(&iommu->lock, flags);
920         context = iommu_context_addr(iommu, bus, devfn, 0);
921         if (context) {
922                 context_clear_entry(context);
923                 __iommu_flush_cache(iommu, context, sizeof(*context));
924         }
925         spin_unlock_irqrestore(&iommu->lock, flags);
926 }
927
928 static void free_context_table(struct intel_iommu *iommu)
929 {
930         int i;
931         unsigned long flags;
932         struct context_entry *context;
933
934         spin_lock_irqsave(&iommu->lock, flags);
935         if (!iommu->root_entry) {
936                 goto out;
937         }
938         for (i = 0; i < ROOT_ENTRY_NR; i++) {
939                 context = iommu_context_addr(iommu, i, 0, 0);
940                 if (context)
941                         free_pgtable_page(context);
942
943                 if (!ecs_enabled(iommu))
944                         continue;
945
946                 context = iommu_context_addr(iommu, i, 0x80, 0);
947                 if (context)
948                         free_pgtable_page(context);
949
950         }
951         free_pgtable_page(iommu->root_entry);
952         iommu->root_entry = NULL;
953 out:
954         spin_unlock_irqrestore(&iommu->lock, flags);
955 }
956
957 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
958                                       unsigned long pfn, int *target_level)
959 {
960         struct dma_pte *parent, *pte = NULL;
961         int level = agaw_to_level(domain->agaw);
962         int offset;
963
964         BUG_ON(!domain->pgd);
965
966         if (!domain_pfn_supported(domain, pfn))
967                 /* Address beyond IOMMU's addressing capabilities. */
968                 return NULL;
969
970         parent = domain->pgd;
971
972         while (1) {
973                 void *tmp_page;
974
975                 offset = pfn_level_offset(pfn, level);
976                 pte = &parent[offset];
977                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
978                         break;
979                 if (level == *target_level)
980                         break;
981
982                 if (!dma_pte_present(pte)) {
983                         uint64_t pteval;
984
985                         tmp_page = alloc_pgtable_page(domain->nid);
986
987                         if (!tmp_page)
988                                 return NULL;
989
990                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
991                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
992                         if (cmpxchg64(&pte->val, 0ULL, pteval))
993                                 /* Someone else set it while we were thinking; use theirs. */
994                                 free_pgtable_page(tmp_page);
995                         else
996                                 domain_flush_cache(domain, pte, sizeof(*pte));
997                 }
998                 if (level == 1)
999                         break;
1000
1001                 parent = phys_to_virt(dma_pte_addr(pte));
1002                 level--;
1003         }
1004
1005         if (!*target_level)
1006                 *target_level = level;
1007
1008         return pte;
1009 }
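/*
 * Note on pfn_to_dma_pte(): a *target_level of 0 means "walk down to
 * whatever leaf exists" (stopping early at a superpage or non-present entry
 * and reporting the level reached back through *target_level), while a
 * non-zero *target_level stops at exactly that level, allocating missing
 * intermediate tables on the way.  New tables are installed with
 * cmpxchg64(), so concurrent walkers can race without holding a lock; the
 * loser simply frees its page and uses the winner's.
 */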
1010
1011
1012 /* return the pte for an address at a specific level */
1013 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1014                                          unsigned long pfn,
1015                                          int level, int *large_page)
1016 {
1017         struct dma_pte *parent, *pte = NULL;
1018         int total = agaw_to_level(domain->agaw);
1019         int offset;
1020
1021         parent = domain->pgd;
1022         while (level <= total) {
1023                 offset = pfn_level_offset(pfn, total);
1024                 pte = &parent[offset];
1025                 if (level == total)
1026                         return pte;
1027
1028                 if (!dma_pte_present(pte)) {
1029                         *large_page = total;
1030                         break;
1031                 }
1032
1033                 if (dma_pte_superpage(pte)) {
1034                         *large_page = total;
1035                         return pte;
1036                 }
1037
1038                 parent = phys_to_virt(dma_pte_addr(pte));
1039                 total--;
1040         }
1041         return NULL;
1042 }
1043
1044 /* clear last level pte; a tlb flush should follow */
1045 static void dma_pte_clear_range(struct dmar_domain *domain,
1046                                 unsigned long start_pfn,
1047                                 unsigned long last_pfn)
1048 {
1049         unsigned int large_page = 1;
1050         struct dma_pte *first_pte, *pte;
1051
1052         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1053         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1054         BUG_ON(start_pfn > last_pfn);
1055
1056         /* we don't need lock here; nobody else touches the iova range */
1057         do {
1058                 large_page = 1;
1059                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1060                 if (!pte) {
1061                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1062                         continue;
1063                 }
1064                 do {
1065                         dma_clear_pte(pte);
1066                         start_pfn += lvl_to_nr_pages(large_page);
1067                         pte++;
1068                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1069
1070                 domain_flush_cache(domain, first_pte,
1071                                    (void *)pte - (void *)first_pte);
1072
1073         } while (start_pfn && start_pfn <= last_pfn);
1074 }
1075
1076 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1077                                struct dma_pte *pte, unsigned long pfn,
1078                                unsigned long start_pfn, unsigned long last_pfn)
1079 {
1080         pfn = max(start_pfn, pfn);
1081         pte = &pte[pfn_level_offset(pfn, level)];
1082
1083         do {
1084                 unsigned long level_pfn;
1085                 struct dma_pte *level_pte;
1086
1087                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1088                         goto next;
1089
1090                 level_pfn = pfn & level_mask(level);
1091                 level_pte = phys_to_virt(dma_pte_addr(pte));
1092
1093                 if (level > 2)
1094                         dma_pte_free_level(domain, level - 1, level_pte,
1095                                            level_pfn, start_pfn, last_pfn);
1096
1097                 /* If range covers entire pagetable, free it */
1098                 if (!(start_pfn > level_pfn ||
1099                       last_pfn < level_pfn + level_size(level) - 1)) {
1100                         dma_clear_pte(pte);
1101                         domain_flush_cache(domain, pte, sizeof(*pte));
1102                         free_pgtable_page(level_pte);
1103                 }
1104 next:
1105                 pfn += level_size(level);
1106         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1107 }
1108
1109 /* free page table pages. last level pte should already be cleared */
1110 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1111                                    unsigned long start_pfn,
1112                                    unsigned long last_pfn)
1113 {
1114         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1115         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1116         BUG_ON(start_pfn > last_pfn);
1117
1118         dma_pte_clear_range(domain, start_pfn, last_pfn);
1119
1120         /* We don't need lock here; nobody else touches the iova range */
1121         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1122                            domain->pgd, 0, start_pfn, last_pfn);
1123
1124         /* free pgd */
1125         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1126                 free_pgtable_page(domain->pgd);
1127                 domain->pgd = NULL;
1128         }
1129 }
1130
1131 /* When a page at a given level is being unlinked from its parent, we don't
1132    need to *modify* it at all. All we need to do is make a list of all the
1133    pages which can be freed just as soon as we've flushed the IOTLB and we
1134    know the hardware page-walk will no longer touch them.
1135    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1136    be freed. */
1137 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1138                                             int level, struct dma_pte *pte,
1139                                             struct page *freelist)
1140 {
1141         struct page *pg;
1142
1143         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1144         pg->freelist = freelist;
1145         freelist = pg;
1146
1147         if (level == 1)
1148                 return freelist;
1149
1150         pte = page_address(pg);
1151         do {
1152                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1153                         freelist = dma_pte_list_pagetables(domain, level - 1,
1154                                                            pte, freelist);
1155                 pte++;
1156         } while (!first_pte_in_page(pte));
1157
1158         return freelist;
1159 }
1160
1161 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1162                                         struct dma_pte *pte, unsigned long pfn,
1163                                         unsigned long start_pfn,
1164                                         unsigned long last_pfn,
1165                                         struct page *freelist)
1166 {
1167         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1168
1169         pfn = max(start_pfn, pfn);
1170         pte = &pte[pfn_level_offset(pfn, level)];
1171
1172         do {
1173                 unsigned long level_pfn;
1174
1175                 if (!dma_pte_present(pte))
1176                         goto next;
1177
1178                 level_pfn = pfn & level_mask(level);
1179
1180                 /* If range covers entire pagetable, free it */
1181                 if (start_pfn <= level_pfn &&
1182                     last_pfn >= level_pfn + level_size(level) - 1) {
1183                         /* These subordinate page tables are going away entirely. Don't
1184                            bother to clear them; we're just going to *free* them. */
1185                         if (level > 1 && !dma_pte_superpage(pte))
1186                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1187
1188                         dma_clear_pte(pte);
1189                         if (!first_pte)
1190                                 first_pte = pte;
1191                         last_pte = pte;
1192                 } else if (level > 1) {
1193                         /* Recurse down into a level that isn't *entirely* obsolete */
1194                         freelist = dma_pte_clear_level(domain, level - 1,
1195                                                        phys_to_virt(dma_pte_addr(pte)),
1196                                                        level_pfn, start_pfn, last_pfn,
1197                                                        freelist);
1198                 }
1199 next:
1200                 pfn += level_size(level);
1201         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1202
1203         if (first_pte)
1204                 domain_flush_cache(domain, first_pte,
1205                                    (void *)++last_pte - (void *)first_pte);
1206
1207         return freelist;
1208 }
1209
1210 /* We can't just free the pages because the IOMMU may still be walking
1211    the page tables, and may have cached the intermediate levels. The
1212    pages can only be freed after the IOTLB flush has been done. */
1213 struct page *domain_unmap(struct dmar_domain *domain,
1214                           unsigned long start_pfn,
1215                           unsigned long last_pfn)
1216 {
1217         struct page *freelist = NULL;
1218
1219         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1220         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1221         BUG_ON(start_pfn > last_pfn);
1222
1223         /* we don't need lock here; nobody else touches the iova range */
1224         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1225                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1226
1227         /* free pgd */
1228         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1229                 struct page *pgd_page = virt_to_page(domain->pgd);
1230                 pgd_page->freelist = freelist;
1231                 freelist = pgd_page;
1232
1233                 domain->pgd = NULL;
1234         }
1235
1236         return freelist;
1237 }
1238
1239 void dma_free_pagelist(struct page *freelist)
1240 {
1241         struct page *pg;
1242
1243         while ((pg = freelist)) {
1244                 freelist = pg->freelist;
1245                 free_pgtable_page(page_address(pg));
1246         }
1247 }
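/*
 * The freelist built by domain_unmap() chains page-table pages through
 * their struct page's ->freelist pointer, so no extra allocation is needed
 * to remember them.  Callers are expected to flush the IOTLB first and only
 * then hand the list to dma_free_pagelist(), per the comment above
 * domain_unmap().
 */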
1248
1249 /* iommu handling */
1250 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1251 {
1252         struct root_entry *root;
1253         unsigned long flags;
1254
1255         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1256         if (!root) {
1257                 pr_err("Allocating root entry for %s failed\n",
1258                         iommu->name);
1259                 return -ENOMEM;
1260         }
1261
1262         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1263
1264         spin_lock_irqsave(&iommu->lock, flags);
1265         iommu->root_entry = root;
1266         spin_unlock_irqrestore(&iommu->lock, flags);
1267
1268         return 0;
1269 }
1270
1271 static void iommu_set_root_entry(struct intel_iommu *iommu)
1272 {
1273         u64 addr;
1274         u32 sts;
1275         unsigned long flag;
1276
1277         addr = virt_to_phys(iommu->root_entry);
1278         if (ecs_enabled(iommu))
1279                 addr |= DMA_RTADDR_RTT;
1280
1281         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1282         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1283
1284         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1285
1286         /* Make sure hardware completes it */
1287         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1288                       readl, (sts & DMA_GSTS_RTPS), sts);
1289
1290         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1291 }
1292
1293 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1294 {
1295         u32 val;
1296         unsigned long flag;
1297
1298         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1299                 return;
1300
1301         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1302         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1303
1304         /* Make sure hardware completes it */
1305         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1306                       readl, (!(val & DMA_GSTS_WBFS)), val);
1307
1308         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1309 }
1310
1311 /* return value determines if we need a write buffer flush */
1312 static void __iommu_flush_context(struct intel_iommu *iommu,
1313                                   u16 did, u16 source_id, u8 function_mask,
1314                                   u64 type)
1315 {
1316         u64 val = 0;
1317         unsigned long flag;
1318
1319         switch (type) {
1320         case DMA_CCMD_GLOBAL_INVL:
1321                 val = DMA_CCMD_GLOBAL_INVL;
1322                 break;
1323         case DMA_CCMD_DOMAIN_INVL:
1324                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1325                 break;
1326         case DMA_CCMD_DEVICE_INVL:
1327                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1328                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1329                 break;
1330         default:
1331                 BUG();
1332         }
1333         val |= DMA_CCMD_ICC;
1334
1335         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1336         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1337
1338         /* Make sure hardware completes it */
1339         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1340                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1341
1342         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1343 }
1344
1345 /* return value determines if we need a write buffer flush */
1346 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1347                                 u64 addr, unsigned int size_order, u64 type)
1348 {
1349         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1350         u64 val = 0, val_iva = 0;
1351         unsigned long flag;
1352
1353         switch (type) {
1354         case DMA_TLB_GLOBAL_FLUSH:
1355                 /* global flush doesn't need to set IVA_REG */
1356                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1357                 break;
1358         case DMA_TLB_DSI_FLUSH:
1359                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1360                 break;
1361         case DMA_TLB_PSI_FLUSH:
1362                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1363                 /* IH bit is passed in as part of address */
1364                 val_iva = size_order | addr;
1365                 break;
1366         default:
1367                 BUG();
1368         }
1369         /* Note: set drain read/write */
1370 #if 0
1371         /*
1372          * This is probably only needed to be extra safe.  It looks like we
1373          * can ignore it without any impact.
1374          */
1375         if (cap_read_drain(iommu->cap))
1376                 val |= DMA_TLB_READ_DRAIN;
1377 #endif
1378         if (cap_write_drain(iommu->cap))
1379                 val |= DMA_TLB_WRITE_DRAIN;
1380
1381         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1382         /* Note: Only uses first TLB reg currently */
1383         if (val_iva)
1384                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1385         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1386
1387         /* Make sure hardware completes it */
1388         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1389                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1390
1391         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1392
1393         /* check IOTLB invalidation granularity */
1394         if (DMA_TLB_IAIG(val) == 0)
1395                 pr_err("Flush IOTLB failed\n");
1396         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1397                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1398                         (unsigned long long)DMA_TLB_IIRG(type),
1399                         (unsigned long long)DMA_TLB_IAIG(val));
1400 }
1401
1402 static struct device_domain_info *
1403 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1404                          u8 bus, u8 devfn)
1405 {
1406         bool found = false;
1407         unsigned long flags;
1408         struct device_domain_info *info;
1409         struct pci_dev *pdev;
1410
1411         if (!ecap_dev_iotlb_support(iommu->ecap))
1412                 return NULL;
1413
1414         if (!iommu->qi)
1415                 return NULL;
1416
1417         spin_lock_irqsave(&device_domain_lock, flags);
1418         list_for_each_entry(info, &domain->devices, link)
1419                 if (info->iommu == iommu && info->bus == bus &&
1420                     info->devfn == devfn) {
1421                         found = true;
1422                         break;
1423                 }
1424         spin_unlock_irqrestore(&device_domain_lock, flags);
1425
1426         if (!found || !info->dev || !dev_is_pci(info->dev))
1427                 return NULL;
1428
1429         pdev = to_pci_dev(info->dev);
1430
1431         if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1432                 return NULL;
1433
1434         if (!dmar_find_matched_atsr_unit(pdev))
1435                 return NULL;
1436
1437         return info;
1438 }
1439
1440 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1441 {
1442         if (!info || !dev_is_pci(info->dev))
1443                 return;
1444
1445         pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1446 }
1447
1448 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1449 {
1450         if (!info->dev || !dev_is_pci(info->dev) ||
1451             !pci_ats_enabled(to_pci_dev(info->dev)))
1452                 return;
1453
1454         pci_disable_ats(to_pci_dev(info->dev));
1455 }
1456
1457 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1458                                   u64 addr, unsigned mask)
1459 {
1460         u16 sid, qdep;
1461         unsigned long flags;
1462         struct device_domain_info *info;
1463
1464         spin_lock_irqsave(&device_domain_lock, flags);
1465         list_for_each_entry(info, &domain->devices, link) {
1466                 struct pci_dev *pdev;
1467                 if (!info->dev || !dev_is_pci(info->dev))
1468                         continue;
1469
1470                 pdev = to_pci_dev(info->dev);
1471                 if (!pci_ats_enabled(pdev))
1472                         continue;
1473
1474                 sid = info->bus << 8 | info->devfn;
1475                 qdep = pci_ats_queue_depth(pdev);
1476                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1477         }
1478         spin_unlock_irqrestore(&device_domain_lock, flags);
1479 }
1480
1481 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1482                                   struct dmar_domain *domain,
1483                                   unsigned long pfn, unsigned int pages,
1484                                   int ih, int map)
1485 {
1486         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1487         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1488         u16 did = domain->iommu_did[iommu->seq_id];
1489
1490         BUG_ON(pages == 0);
1491
1492         if (ih)
1493                 ih = 1 << 6;
1494         /*
1495          * Fall back to a domain-selective flush if there is no PSI support
1496          * or the size is too big.
1497          * PSI requires the page size to be a power of two, and the base
1498          * address to be naturally aligned to that size.
1499          */
1500         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1501                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1502                                                 DMA_TLB_DSI_FLUSH);
1503         else
1504                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1505                                                 DMA_TLB_PSI_FLUSH);
1506
1507         /*
1508          * In caching mode, changes of pages from non-present to present require
1509          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1510          */
1511         if (!cap_caching_mode(iommu->cap) || !map)
1512                 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1513                                       addr, mask);
1514 }
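/*
 * In iommu_flush_iotlb_psi() above, 'mask' is the address-mask order for a
 * page-selective invalidation: roughly 2^mask pages starting at 'addr' are
 * invalidated, which is why 'pages' is first rounded up to a power of two.
 * If that order exceeds what the hardware advertises, the code falls back
 * to a domain-selective flush instead.
 */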
1515
1516 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1517 {
1518         u32 pmen;
1519         unsigned long flags;
1520
1521         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1522         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1523         pmen &= ~DMA_PMEN_EPM;
1524         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1525
1526         /* wait for the protected region status bit to clear */
1527         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1528                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1529
1530         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1531 }
1532
1533 static void iommu_enable_translation(struct intel_iommu *iommu)
1534 {
1535         u32 sts;
1536         unsigned long flags;
1537
1538         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1539         iommu->gcmd |= DMA_GCMD_TE;
1540         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1541
1542         /* Make sure hardware completes it */
1543         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1544                       readl, (sts & DMA_GSTS_TES), sts);
1545
1546         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1547 }
1548
1549 static void iommu_disable_translation(struct intel_iommu *iommu)
1550 {
1551         u32 sts;
1552         unsigned long flag;
1553
1554         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1555         iommu->gcmd &= ~DMA_GCMD_TE;
1556         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1557
1558         /* Make sure hardware completes it */
1559         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1560                       readl, (!(sts & DMA_GSTS_TES)), sts);
1561
1562         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1563 }
1564
1565
1566 static int iommu_init_domains(struct intel_iommu *iommu)
1567 {
1568         u32 ndomains, nlongs;
1569         size_t size;
1570
1571         ndomains = cap_ndoms(iommu->cap);
1572         pr_debug("%s: Number of Domains supported <%d>\n",
1573                  iommu->name, ndomains);
1574         nlongs = BITS_TO_LONGS(ndomains);
1575
1576         spin_lock_init(&iommu->lock);
1577
1578         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1579         if (!iommu->domain_ids) {
1580                 pr_err("%s: Allocating domain id array failed\n",
1581                        iommu->name);
1582                 return -ENOMEM;
1583         }
1584
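        /*
         * iommu->domains is a two-level array: the first level holds
         * pointers to 256-entry chunks, so domain-id 'did' resolves to
         * iommu->domains[did >> 8][did & 0xff].  Only the first chunk is
         * preallocated here; the remaining chunks are allocated on demand
         * when set_iommu_domain() installs a domain pointer.
         */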
1585         size = ((ndomains >> 8) + 1) * sizeof(struct dmar_domain **);
1586         iommu->domains = kzalloc(size, GFP_KERNEL);
1587
1588         if (iommu->domains) {
1589                 size = 256 * sizeof(struct dmar_domain *);
1590                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1591         }
1592
1593         if (!iommu->domains || !iommu->domains[0]) {
1594                 pr_err("%s: Allocating domain array failed\n",
1595                        iommu->name);
1596                 kfree(iommu->domain_ids);
1597                 kfree(iommu->domains);
1598                 iommu->domain_ids = NULL;
1599                 iommu->domains    = NULL;
1600                 return -ENOMEM;
1601         }
1602
1603
1604
1605         /*
1606          * If Caching mode is set, then invalid translations are tagged
1607          * with domain-id 0, hence we need to pre-allocate it. We also
1608          * use domain-id 0 as a marker for non-allocated domain-id, so
1609          * make sure it is not used for a real domain.
1610          */
1611         set_bit(0, iommu->domain_ids);
1612
1613         return 0;
1614 }
1615
1616 static void disable_dmar_iommu(struct intel_iommu *iommu)
1617 {
1618         struct device_domain_info *info, *tmp;
1619
1620         if (!iommu->domains || !iommu->domain_ids)
1621                 return;
1622
1623         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1624                 struct dmar_domain *domain;
1625
1626                 if (info->iommu != iommu)
1627                         continue;
1628
1629                 if (!info->dev || !info->domain)
1630                         continue;
1631
1632                 domain = info->domain;
1633
1634                 domain_remove_one_dev_info(domain, info->dev);
1635
1636                 if (!domain_type_is_vm_or_si(domain))
1637                         domain_exit(domain);
1638         }
1639
1640         if (iommu->gcmd & DMA_GCMD_TE)
1641                 iommu_disable_translation(iommu);
1642 }
1643
1644 static void free_dmar_iommu(struct intel_iommu *iommu)
1645 {
1646         if ((iommu->domains) && (iommu->domain_ids)) {
1647                 int elems = (cap_ndoms(iommu->cap) >> 8) + 1;
1648                 int i;
1649
1650                 for (i = 0; i < elems; i++)
1651                         kfree(iommu->domains[i]);
1652                 kfree(iommu->domains);
1653                 kfree(iommu->domain_ids);
1654                 iommu->domains = NULL;
1655                 iommu->domain_ids = NULL;
1656         }
1657
1658         g_iommus[iommu->seq_id] = NULL;
1659
1660         /* free context mapping */
1661         free_context_table(iommu);
1662 }
1663
1664 static struct dmar_domain *alloc_domain(int flags)
1665 {
1666         struct dmar_domain *domain;
1667
1668         domain = alloc_domain_mem();
1669         if (!domain)
1670                 return NULL;
1671
1672         memset(domain, 0, sizeof(*domain));
1673         domain->nid = -1;
1674         domain->flags = flags;
1675         spin_lock_init(&domain->iommu_lock);
1676         INIT_LIST_HEAD(&domain->devices);
1677
1678         return domain;
1679 }
1680
1681 static int __iommu_attach_domain(struct dmar_domain *domain,
1682                                  struct intel_iommu *iommu)
1683 {
1684         int num;
1685         unsigned long ndomains;
1686
1687         num = domain->iommu_did[iommu->seq_id];
1688         if (num)
1689                 return num;
1690
1691         ndomains = cap_ndoms(iommu->cap);
1692         num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1693
1694         if (num < ndomains) {
1695                 set_bit(num, iommu->domain_ids);
1696                 set_iommu_domain(iommu, num, domain);
1697                 domain->iommu_did[iommu->seq_id] = num;
1698         } else {
1699                 num = -ENOSPC;
1700         }
1701
1702         if (num < 0)
1703                 pr_err("%s: No free domain ids\n", iommu->name);
1704
1705         return num;
1706 }
1707
1708 static int iommu_attach_domain(struct dmar_domain *domain,
1709                                struct intel_iommu *iommu)
1710 {
1711         int num;
1712         unsigned long flags;
1713
1714         spin_lock_irqsave(&iommu->lock, flags);
1715         num = __iommu_attach_domain(domain, iommu);
1716         spin_unlock_irqrestore(&iommu->lock, flags);
1717
1718         return num;
1719 }
1720
1721 static void iommu_detach_domain(struct dmar_domain *domain,
1722                                 struct intel_iommu *iommu)
1723 {
1724         unsigned long flags;
1725         int num;
1726
1727         spin_lock_irqsave(&iommu->lock, flags);
1728
1729         num = domain->iommu_did[iommu->seq_id];
1730
1731         if (num == 0)
1732                 goto out_unlock;
1733
1734         clear_bit(num, iommu->domain_ids);
1735         set_iommu_domain(iommu, num, NULL);
1736 out_unlock:
1737         spin_unlock_irqrestore(&iommu->lock, flags);
1738 }
1739
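/*
 * Per-IOMMU refcount of device attachments for a domain: the first device
 * attached through a given IOMMU pulls in that IOMMU's NUMA node and
 * capabilities, and domain_detach_iommu() releases the cached domain-id
 * again once the last device behind that IOMMU is gone.
 */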
1740 static void domain_attach_iommu(struct dmar_domain *domain,
1741                                struct intel_iommu *iommu)
1742 {
1743         unsigned long flags;
1744
1745         spin_lock_irqsave(&domain->iommu_lock, flags);
1746         domain->iommu_refcnt[iommu->seq_id] += 1;
1747         domain->iommu_count += 1;
1748         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1749                 domain->nid = iommu->node;
1750                 domain_update_iommu_cap(domain);
1751         }
1752         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1753 }
1754
1755 static int domain_detach_iommu(struct dmar_domain *domain,
1756                                struct intel_iommu *iommu)
1757 {
1758         unsigned long flags;
1759         int count = INT_MAX;
1760
1761         spin_lock_irqsave(&domain->iommu_lock, flags);
1762         domain->iommu_refcnt[iommu->seq_id] -= 1;
1763         count = --domain->iommu_count;
1764         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1765                 domain_update_iommu_cap(domain);
1766                 domain->iommu_did[iommu->seq_id] = 0;
1767         }
1768         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1769
1770         return count;
1771 }
1772
1773 static struct iova_domain reserved_iova_list;
1774 static struct lock_class_key reserved_rbtree_key;
1775
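/*
 * Build the global list of IOVA ranges that must never be handed out to
 * devices: the IOAPIC MMIO window and every PCI MMIO resource (so that a
 * DMA address can never be routed peer-to-peer into another device's BAR).
 * Every new domain copies these reservations via
 * domain_reserve_special_ranges().
 */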
1776 static int dmar_init_reserved_ranges(void)
1777 {
1778         struct pci_dev *pdev = NULL;
1779         struct iova *iova;
1780         int i;
1781
1782         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1783                         DMA_32BIT_PFN);
1784
1785         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1786                 &reserved_rbtree_key);
1787
1788         /* IOAPIC ranges shouldn't be accessed by DMA */
1789         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1790                 IOVA_PFN(IOAPIC_RANGE_END));
1791         if (!iova) {
1792                 pr_err("Reserve IOAPIC range failed\n");
1793                 return -ENODEV;
1794         }
1795
1796         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1797         for_each_pci_dev(pdev) {
1798                 struct resource *r;
1799
1800                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1801                         r = &pdev->resource[i];
1802                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1803                                 continue;
1804                         iova = reserve_iova(&reserved_iova_list,
1805                                             IOVA_PFN(r->start),
1806                                             IOVA_PFN(r->end));
1807                         if (!iova) {
1808                                 pr_err("Reserve iova failed\n");
1809                                 return -ENODEV;
1810                         }
1811                 }
1812         }
1813         return 0;
1814 }
1815
1816 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1817 {
1818         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1819 }
1820
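/*
 * Round a guest address width up to the nearest adjusted guest address
 * width supported by the page-table layout: a multiple of 9 bits (one
 * page-table level) above the 12-bit page offset, capped at 64.  For
 * example, gaw = 40 gives r = (40 - 12) % 9 = 1 and agaw = 40 + 9 - 1 = 48.
 */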
1821 static inline int guestwidth_to_adjustwidth(int gaw)
1822 {
1823         int agaw;
1824         int r = (gaw - 12) % 9;
1825
1826         if (r == 0)
1827                 agaw = gaw;
1828         else
1829                 agaw = gaw + 9 - r;
1830         if (agaw > 64)
1831                 agaw = 64;
1832         return agaw;
1833 }
1834
1835 static int domain_init(struct dmar_domain *domain, int guest_width)
1836 {
1837         struct intel_iommu *iommu;
1838         int adjust_width, agaw;
1839         unsigned long sagaw;
1840
1841         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1842                         DMA_32BIT_PFN);
1843         domain_reserve_special_ranges(domain);
1844
1845         /* calculate AGAW */
1846         iommu = domain_get_iommu(domain);
1847         if (guest_width > cap_mgaw(iommu->cap))
1848                 guest_width = cap_mgaw(iommu->cap);
1849         domain->gaw = guest_width;
1850         adjust_width = guestwidth_to_adjustwidth(guest_width);
1851         agaw = width_to_agaw(adjust_width);
1852         sagaw = cap_sagaw(iommu->cap);
1853         if (!test_bit(agaw, &sagaw)) {
1854                 /* hardware doesn't support it, choose a bigger one */
1855                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1856                 agaw = find_next_bit(&sagaw, 5, agaw);
1857                 if (agaw >= 5)
1858                         return -ENODEV;
1859         }
1860         domain->agaw = agaw;
1861
1862         if (ecap_coherent(iommu->ecap))
1863                 domain->iommu_coherency = 1;
1864         else
1865                 domain->iommu_coherency = 0;
1866
1867         if (ecap_sc_support(iommu->ecap))
1868                 domain->iommu_snooping = 1;
1869         else
1870                 domain->iommu_snooping = 0;
1871
1872         if (intel_iommu_superpage)
1873                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1874         else
1875                 domain->iommu_superpage = 0;
1876
1877         domain->nid = iommu->node;
1878
1879         /* always allocate the top pgd */
1880         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1881         if (!domain->pgd)
1882                 return -ENOMEM;
1883         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1884         return 0;
1885 }
1886
1887 static void domain_exit(struct dmar_domain *domain)
1888 {
1889         struct page *freelist = NULL;
1890         int i;
1891
1892         /* Domain 0 is reserved, so don't process it */
1893         if (!domain)
1894                 return;
1895
1896         /* Flush any lazy unmaps that may reference this domain */
1897         if (!intel_iommu_strict)
1898                 flush_unmaps_timeout(0);
1899
1900         /* remove associated devices */
1901         domain_remove_dev_info(domain);
1902
1903         /* destroy iovas */
1904         put_iova_domain(&domain->iovad);
1905
1906         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1907
1908         /* clear attached or cached domains */
1909         rcu_read_lock();
1910         for_each_domain_iommu(i, domain)
1911                 iommu_detach_domain(domain, g_iommus[i]);
1912         rcu_read_unlock();
1913
1914         dma_free_pagelist(freelist);
1915
1916         free_domain_mem(domain);
1917 }
1918
1919 static int domain_context_mapping_one(struct dmar_domain *domain,
1920                                       struct intel_iommu *iommu,
1921                                       u8 bus, u8 devfn)
1922 {
1923         int translation = CONTEXT_TT_MULTI_LEVEL;
1924         struct device_domain_info *info = NULL;
1925         struct context_entry *context;
1926         unsigned long flags;
1927         struct dma_pte *pgd;
1928         int id;
1929         int agaw;
1930
1931         if (hw_pass_through && domain_type_is_si(domain))
1932                 translation = CONTEXT_TT_PASS_THROUGH;
1933
1934         pr_debug("Set context mapping for %02x:%02x.%d\n",
1935                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1936
1937         BUG_ON(!domain->pgd);
1938
1939         spin_lock_irqsave(&iommu->lock, flags);
1940         context = iommu_context_addr(iommu, bus, devfn, 1);
1941         spin_unlock_irqrestore(&iommu->lock, flags);
1942         if (!context)
1943                 return -ENOMEM;
1944         spin_lock_irqsave(&iommu->lock, flags);
1945         if (context_present(context)) {
1946                 spin_unlock_irqrestore(&iommu->lock, flags);
1947                 return 0;
1948         }
1949
1950         pgd = domain->pgd;
1951
1952         id = __iommu_attach_domain(domain, iommu);
1953         if (id < 0) {
1954                 spin_unlock_irqrestore(&iommu->lock, flags);
1955                 pr_err("%s: No free domain ids\n", iommu->name);
1956                 return -EFAULT;
1957         }
1958
1959         context_clear_entry(context);
1960         context_set_domain_id(context, id);
1961
1962         /*
1963          * Skip top levels of page tables for iommu which has less agaw
1964          * than default.  Unnecessary for PT mode.
1965          */
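        /*
         * For example, a domain using a 4-level table (48-bit agaw) that is
         * attached to an IOMMU supporting only 3 levels walks down one
         * level here and programs the context entry with that lower-level
         * table.
         */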
1966         if (translation != CONTEXT_TT_PASS_THROUGH) {
1967                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1968                         pgd = phys_to_virt(dma_pte_addr(pgd));
1969                         if (!dma_pte_present(pgd)) {
1970                                 spin_unlock_irqrestore(&iommu->lock, flags);
1971                                 return -ENOMEM;
1972                         }
1973                 }
1974
1975                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1976                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1977                                      CONTEXT_TT_MULTI_LEVEL;
1978
1979                 context_set_address_root(context, virt_to_phys(pgd));
1980                 context_set_address_width(context, iommu->agaw);
1981         } else {
1982                 /*
1983                  * In pass through mode, AW must be programmed to
1984                  * indicate the largest AGAW value supported by
1985                  * hardware. And ASR is ignored by hardware.
1986                  */
1987                 context_set_address_width(context, iommu->msagaw);
1988         }
1989
1990         context_set_translation_type(context, translation);
1991         context_set_fault_enable(context);
1992         context_set_present(context);
1993         domain_flush_cache(domain, context, sizeof(*context));
1994
1995         /*
1996          * It's a non-present to present mapping. If hardware doesn't cache
1997          * non-present entries, we only need to flush the write-buffer. If
1998          * it _does_ cache non-present entries, then it does so in the
1999          * special domain #0, which we have to flush:
2000          */
2001         if (cap_caching_mode(iommu->cap)) {
2002                 iommu->flush.flush_context(iommu, 0,
2003                                            (((u16)bus) << 8) | devfn,
2004                                            DMA_CCMD_MASK_NOBIT,
2005                                            DMA_CCMD_DEVICE_INVL);
2006                 iommu->flush.flush_iotlb(iommu, id, 0, 0, DMA_TLB_DSI_FLUSH);
2007         } else {
2008                 iommu_flush_write_buffer(iommu);
2009         }
2010         iommu_enable_dev_iotlb(info);
2011         spin_unlock_irqrestore(&iommu->lock, flags);
2012
2013         domain_attach_iommu(domain, iommu);
2014
2015         return 0;
2016 }
2017
2018 struct domain_context_mapping_data {
2019         struct dmar_domain *domain;
2020         struct intel_iommu *iommu;
2021 };
2022
2023 static int domain_context_mapping_cb(struct pci_dev *pdev,
2024                                      u16 alias, void *opaque)
2025 {
2026         struct domain_context_mapping_data *data = opaque;
2027
2028         return domain_context_mapping_one(data->domain, data->iommu,
2029                                           PCI_BUS_NUM(alias), alias & 0xff);
2030 }
2031
2032 static int
2033 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2034 {
2035         struct intel_iommu *iommu;
2036         u8 bus, devfn;
2037         struct domain_context_mapping_data data;
2038
2039         iommu = device_to_iommu(dev, &bus, &devfn);
2040         if (!iommu)
2041                 return -ENODEV;
2042
2043         if (!dev_is_pci(dev))
2044                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2045
2046         data.domain = domain;
2047         data.iommu = iommu;
2048
2049         return pci_for_each_dma_alias(to_pci_dev(dev),
2050                                       &domain_context_mapping_cb, &data);
2051 }
2052
2053 static int domain_context_mapped_cb(struct pci_dev *pdev,
2054                                     u16 alias, void *opaque)
2055 {
2056         struct intel_iommu *iommu = opaque;
2057
2058         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2059 }
2060
2061 static int domain_context_mapped(struct device *dev)
2062 {
2063         struct intel_iommu *iommu;
2064         u8 bus, devfn;
2065
2066         iommu = device_to_iommu(dev, &bus, &devfn);
2067         if (!iommu)
2068                 return -ENODEV;
2069
2070         if (!dev_is_pci(dev))
2071                 return device_context_mapped(iommu, bus, devfn);
2072
2073         return !pci_for_each_dma_alias(to_pci_dev(dev),
2074                                        domain_context_mapped_cb, iommu);
2075 }
2076
2077 /* Returns a number of VTD pages, but aligned to MM page size */
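/*
 * Example (assuming 4KiB pages): a buffer at page offset 0x800 with size
 * 0x1000 ends at offset 0x1800, which PAGE_ALIGN() rounds up to 0x2000,
 * i.e. two VTD pages.
 */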
2078 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2079                                             size_t size)
2080 {
2081         host_addr &= ~PAGE_MASK;
2082         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2083 }
2084
2085 /* Return largest possible superpage level for a given mapping */
2086 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2087                                           unsigned long iov_pfn,
2088                                           unsigned long phy_pfn,
2089                                           unsigned long pages)
2090 {
2091         int support, level = 1;
2092         unsigned long pfnmerge;
2093
2094         support = domain->iommu_superpage;
2095
2096         /* To use a large page, the virtual *and* physical addresses
2097            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2098            of them will mean we have to use smaller pages. So just
2099            merge them and check both at once. */
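        /*
         * E.g. if both PFNs are 512-page (2MiB) aligned and at least 512
         * pages remain, level 2 can be used, provided the hardware
         * advertises superpage support for that level.
         */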
2100         pfnmerge = iov_pfn | phy_pfn;
2101
2102         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2103                 pages >>= VTD_STRIDE_SHIFT;
2104                 if (!pages)
2105                         break;
2106                 pfnmerge >>= VTD_STRIDE_SHIFT;
2107                 level++;
2108                 support--;
2109         }
2110         return level;
2111 }
2112
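/*
 * Map nr_pages of IOVA space starting at iov_pfn either to the chunks of a
 * scatterlist (sg != NULL) or to a contiguous physical range starting at
 * phys_pfn, using superpages where alignment and remaining length allow,
 * and flushing the newly written PTEs out of the CPU cache as needed.
 */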
2113 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2114                             struct scatterlist *sg, unsigned long phys_pfn,
2115                             unsigned long nr_pages, int prot)
2116 {
2117         struct dma_pte *first_pte = NULL, *pte = NULL;
2118         phys_addr_t uninitialized_var(pteval);
2119         unsigned long sg_res = 0;
2120         unsigned int largepage_lvl = 0;
2121         unsigned long lvl_pages = 0;
2122
2123         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2124
2125         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2126                 return -EINVAL;
2127
2128         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2129
2130         if (!sg) {
2131                 sg_res = nr_pages;
2132                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2133         }
2134
2135         while (nr_pages > 0) {
2136                 uint64_t tmp;
2137
2138                 if (!sg_res) {
2139                         sg_res = aligned_nrpages(sg->offset, sg->length);
2140                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2141                         sg->dma_length = sg->length;
2142                         pteval = page_to_phys(sg_page(sg)) | prot;
2143                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2144                 }
2145
2146                 if (!pte) {
2147                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2148
2149                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2150                         if (!pte)
2151                                 return -ENOMEM;
2152                         /* It is a large page */
2153                         if (largepage_lvl > 1) {
2154                                 pteval |= DMA_PTE_LARGE_PAGE;
2155                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2156                                 /*
2157                                  * Ensure that old small page tables are
2158                                  * removed to make room for superpage,
2159                                  * if they exist.
2160                                  */
2161                                 dma_pte_free_pagetable(domain, iov_pfn,
2162                                                        iov_pfn + lvl_pages - 1);
2163                         } else {
2164                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2165                         }
2166
2167                 }
2168                 /* We don't need a lock here; nobody else
2169                  * touches this iova range.
2170                  */
2171                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2172                 if (tmp) {
2173                         static int dumps = 5;
2174                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2175                                 iov_pfn, tmp, (unsigned long long)pteval);
2176                         if (dumps) {
2177                                 dumps--;
2178                                 debug_dma_dump_mappings(NULL);
2179                         }
2180                         WARN_ON(1);
2181                 }
2182
2183                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2184
2185                 BUG_ON(nr_pages < lvl_pages);
2186                 BUG_ON(sg_res < lvl_pages);
2187
2188                 nr_pages -= lvl_pages;
2189                 iov_pfn += lvl_pages;
2190                 phys_pfn += lvl_pages;
2191                 pteval += lvl_pages * VTD_PAGE_SIZE;
2192                 sg_res -= lvl_pages;
2193
2194                 /* If the next PTE would be the first in a new page, then we
2195                    need to flush the cache on the entries we've just written.
2196                    And then we'll need to recalculate 'pte', so clear it and
2197                    let it get set again in the if (!pte) block above.
2198
2199                    If we're done (!nr_pages) we need to flush the cache too.
2200
2201                    Also if we've been setting superpages, we may need to
2202                    recalculate 'pte' and switch back to smaller pages for the
2203                    end of the mapping, if the trailing size is not enough to
2204                    use another superpage (i.e. sg_res < lvl_pages). */
2205                 pte++;
2206                 if (!nr_pages || first_pte_in_page(pte) ||
2207                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2208                         domain_flush_cache(domain, first_pte,
2209                                            (void *)pte - (void *)first_pte);
2210                         pte = NULL;
2211                 }
2212
2213                 if (!sg_res && nr_pages)
2214                         sg = sg_next(sg);
2215         }
2216         return 0;
2217 }
2218
2219 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2220                                     struct scatterlist *sg, unsigned long nr_pages,
2221                                     int prot)
2222 {
2223         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2224 }
2225
2226 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2227                                      unsigned long phys_pfn, unsigned long nr_pages,
2228                                      int prot)
2229 {
2230         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2231 }
2232
2233 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2234 {
2235         if (!iommu)
2236                 return;
2237
2238         clear_context_table(iommu, bus, devfn);
2239         iommu->flush.flush_context(iommu, 0, 0, 0,
2240                                            DMA_CCMD_GLOBAL_INVL);
2241         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2242 }
2243
2244 static inline void unlink_domain_info(struct device_domain_info *info)
2245 {
2246         assert_spin_locked(&device_domain_lock);
2247         list_del(&info->link);
2248         list_del(&info->global);
2249         if (info->dev)
2250                 info->dev->archdata.iommu = NULL;
2251 }
2252
2253 static void domain_remove_dev_info(struct dmar_domain *domain)
2254 {
2255         struct device_domain_info *info, *tmp;
2256         unsigned long flags;
2257
2258         spin_lock_irqsave(&device_domain_lock, flags);
2259         list_for_each_entry_safe(info, tmp, &domain->devices, link) {
2260                 unlink_domain_info(info);
2261                 spin_unlock_irqrestore(&device_domain_lock, flags);
2262
2263                 iommu_disable_dev_iotlb(info);
2264                 iommu_detach_dev(info->iommu, info->bus, info->devfn);
2265
2266                 if (domain_type_is_vm(domain)) {
2267                         iommu_detach_dependent_devices(info->iommu, info->dev);
2268                         domain_detach_iommu(domain, info->iommu);
2269                 }
2270
2271                 free_devinfo_mem(info);
2272                 spin_lock_irqsave(&device_domain_lock, flags);
2273         }
2274         spin_unlock_irqrestore(&device_domain_lock, flags);
2275 }
2276
2277 /*
2278  * find_domain
2279  * Note: we use struct device->archdata.iommu to store the info
2280  */
2281 static struct dmar_domain *find_domain(struct device *dev)
2282 {
2283         struct device_domain_info *info;
2284
2285         /* No lock here, assumes no domain exit in normal case */
2286         info = dev->archdata.iommu;
2287         if (info)
2288                 return info->domain;
2289         return NULL;
2290 }
2291
2292 static inline struct device_domain_info *
2293 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2294 {
2295         struct device_domain_info *info;
2296
2297         list_for_each_entry(info, &device_domain_list, global)
2298                 if (info->iommu->segment == segment && info->bus == bus &&
2299                     info->devfn == devfn)
2300                         return info;
2301
2302         return NULL;
2303 }
2304
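/*
 * Register a device (or a PCI DMA alias when dev == NULL) with 'domain'.
 * If somebody else attached a domain to the device in the meantime, the
 * freshly allocated info is dropped and the existing domain is returned;
 * the caller must then free the domain it passed in.
 */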
2305 static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
2306                                                 int bus, int devfn,
2307                                                 struct device *dev,
2308                                                 struct dmar_domain *domain)
2309 {
2310         struct dmar_domain *found = NULL;
2311         struct device_domain_info *info;
2312         unsigned long flags;
2313
2314         info = alloc_devinfo_mem();
2315         if (!info)
2316                 return NULL;
2317
2318         info->bus = bus;
2319         info->devfn = devfn;
2320         info->dev = dev;
2321         info->domain = domain;
2322         info->iommu = iommu;
2323
2324         spin_lock_irqsave(&device_domain_lock, flags);
2325         if (dev)
2326                 found = find_domain(dev);
2327         else {
2328                 struct device_domain_info *info2;
2329                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2330                 if (info2)
2331                         found = info2->domain;
2332         }
2333         if (found) {
2334                 spin_unlock_irqrestore(&device_domain_lock, flags);
2335                 free_devinfo_mem(info);
2336                 /* Caller must free the original domain */
2337                 return found;
2338         }
2339
2340         list_add(&info->link, &domain->devices);
2341         list_add(&info->global, &device_domain_list);
2342         if (dev)
2343                 dev->archdata.iommu = info;
2344         spin_unlock_irqrestore(&device_domain_lock, flags);
2345
2346         return domain;
2347 }
2348
2349 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2350 {
2351         *(u16 *)opaque = alias;
2352         return 0;
2353 }
2354
2355 /* domain is initialized */
2356 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2357 {
2358         struct dmar_domain *domain, *tmp;
2359         struct intel_iommu *iommu;
2360         struct device_domain_info *info;
2361         u16 dma_alias;
2362         unsigned long flags;
2363         u8 bus, devfn;
2364
2365         domain = find_domain(dev);
2366         if (domain)
2367                 return domain;
2368
2369         iommu = device_to_iommu(dev, &bus, &devfn);
2370         if (!iommu)
2371                 return NULL;
2372
2373         if (dev_is_pci(dev)) {
2374                 struct pci_dev *pdev = to_pci_dev(dev);
2375
2376                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2377
2378                 spin_lock_irqsave(&device_domain_lock, flags);
2379                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2380                                                       PCI_BUS_NUM(dma_alias),
2381                                                       dma_alias & 0xff);
2382                 if (info) {
2383                         iommu = info->iommu;
2384                         domain = info->domain;
2385                 }
2386                 spin_unlock_irqrestore(&device_domain_lock, flags);
2387
2388                 /* DMA alias already has a domain, use it */
2389                 if (info)
2390                         goto found_domain;
2391         }
2392
2393         /* Allocate and initialize new domain for the device */
2394         domain = alloc_domain(0);
2395         if (!domain)
2396                 return NULL;
2397         if (iommu_attach_domain(domain, iommu) < 0) {
2398                 free_domain_mem(domain);
2399                 return NULL;
2400         }
2401         domain_attach_iommu(domain, iommu);
2402         if (domain_init(domain, gaw)) {
2403                 domain_exit(domain);
2404                 return NULL;
2405         }
2406
2407         /* register PCI DMA alias device */
2408         if (dev_is_pci(dev)) {
2409                 tmp = dmar_insert_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2410                                            dma_alias & 0xff, NULL, domain);
2411
2412                 if (!tmp || tmp != domain) {
2413                         domain_exit(domain);
2414                         domain = tmp;
2415                 }
2416
2417                 if (!domain)
2418                         return NULL;
2419         }
2420
2421 found_domain:
2422         tmp = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2423
2424         if (!tmp || tmp != domain) {
2425                 domain_exit(domain);
2426                 domain = tmp;
2427         }
2428
2429         return domain;
2430 }
2431
2432 static int iommu_identity_mapping;
2433 #define IDENTMAP_ALL            1
2434 #define IDENTMAP_GFX            2
2435 #define IDENTMAP_AZALIA         4
2436
2437 static int iommu_domain_identity_map(struct dmar_domain *domain,
2438                                      unsigned long long start,
2439                                      unsigned long long end)
2440 {
2441         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2442         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2443
2444         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2445                           dma_to_mm_pfn(last_vpfn))) {
2446                 pr_err("Reserving iova failed\n");
2447                 return -ENOMEM;
2448         }
2449
2450         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2451         /*
2452          * RMRR range might have overlap with physical memory range,
2453          * clear it first
2454          */
2455         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2456
2457         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2458                                   last_vpfn - first_vpfn + 1,
2459                                   DMA_PTE_READ|DMA_PTE_WRITE);
2460 }
2461
2462 static int iommu_prepare_identity_map(struct device *dev,
2463                                       unsigned long long start,
2464                                       unsigned long long end)
2465 {
2466         struct dmar_domain *domain;
2467         int ret;
2468
2469         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2470         if (!domain)
2471                 return -ENOMEM;
2472
2473         /* For _hardware_ passthrough, don't bother. But for software
2474            passthrough, we do it anyway -- it may indicate a memory
2475            range which is reserved in E820 and therefore didn't get set
2476            up to start with in the si_domain */
2477         if (domain == si_domain && hw_pass_through) {
2478                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2479                         dev_name(dev), start, end);
2480                 return 0;
2481         }
2482
2483         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2484                 dev_name(dev), start, end);
2485
2486         if (end < start) {
2487                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2488                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2489                         dmi_get_system_info(DMI_BIOS_VENDOR),
2490                         dmi_get_system_info(DMI_BIOS_VERSION),
2491                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2492                 ret = -EIO;
2493                 goto error;
2494         }
2495
2496         if (end >> agaw_to_width(domain->agaw)) {
2497                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2498                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2499                      agaw_to_width(domain->agaw),
2500                      dmi_get_system_info(DMI_BIOS_VENDOR),
2501                      dmi_get_system_info(DMI_BIOS_VERSION),
2502                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2503                 ret = -EIO;
2504                 goto error;
2505         }
2506
2507         ret = iommu_domain_identity_map(domain, start, end);
2508         if (ret)
2509                 goto error;
2510
2511         /* context entry init */
2512         ret = domain_context_mapping(domain, dev);
2513         if (ret)
2514                 goto error;
2515
2516         return 0;
2517
2518  error:
2519         domain_exit(domain);
2520         return ret;
2521 }
2522
2523 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2524                                          struct device *dev)
2525 {
2526         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2527                 return 0;
2528         return iommu_prepare_identity_map(dev, rmrr->base_address,
2529                                           rmrr->end_address);
2530 }
2531
2532 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2533 static inline void iommu_prepare_isa(void)
2534 {
2535         struct pci_dev *pdev;
2536         int ret;
2537
2538         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2539         if (!pdev)
2540                 return;
2541
2542         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2543         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2544
2545         if (ret)
2546                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2547
2548         pci_dev_put(pdev);
2549 }
2550 #else
2551 static inline void iommu_prepare_isa(void)
2552 {
2553         return;
2554 }
2555 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
2556
2557 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2558
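/*
 * Set up the static identity (si) domain.  With hardware pass-through the
 * page tables are never consulted, so only the software pass-through case
 * needs the 1:1 mappings of every usable memory range created below.
 */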
2559 static int __init si_domain_init(int hw)
2560 {
2561         int nid, ret = 0;
2562
2563         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2564         if (!si_domain)
2565                 return -EFAULT;
2566
2567         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2568                 domain_exit(si_domain);
2569                 return -EFAULT;
2570         }
2571
2572         pr_debug("Identity mapping domain allocated\n");
2573
2574         if (hw)
2575                 return 0;
2576
2577         for_each_online_node(nid) {
2578                 unsigned long start_pfn, end_pfn;
2579                 int i;
2580
2581                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2582                         ret = iommu_domain_identity_map(si_domain,
2583                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2584                         if (ret)
2585                                 return ret;
2586                 }
2587         }
2588
2589         return 0;
2590 }
2591
2592 static int identity_mapping(struct device *dev)
2593 {
2594         struct device_domain_info *info;
2595
2596         if (likely(!iommu_identity_mapping))
2597                 return 0;
2598
2599         info = dev->archdata.iommu;
2600         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2601                 return (info->domain == si_domain);
2602
2603         return 0;
2604 }
2605
2606 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2607 {
2608         struct dmar_domain *ndomain;
2609         struct intel_iommu *iommu;
2610         u8 bus, devfn;
2611         int ret;
2612
2613         iommu = device_to_iommu(dev, &bus, &devfn);
2614         if (!iommu)
2615                 return -ENODEV;
2616
2617         ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2618         if (ndomain != domain)
2619                 return -EBUSY;
2620
2621         ret = domain_context_mapping(domain, dev);
2622         if (ret) {
2623                 domain_remove_one_dev_info(domain, dev);
2624                 return ret;
2625         }
2626
2627         return 0;
2628 }
2629
2630 static bool device_has_rmrr(struct device *dev)
2631 {
2632         struct dmar_rmrr_unit *rmrr;
2633         struct device *tmp;
2634         int i;
2635
2636         rcu_read_lock();
2637         for_each_rmrr_units(rmrr) {
2638                 /*
2639                  * Return TRUE if this RMRR contains the device that
2640                  * is passed in.
2641                  */
2642                 for_each_active_dev_scope(rmrr->devices,
2643                                           rmrr->devices_cnt, i, tmp)
2644                         if (tmp == dev) {
2645                                 rcu_read_unlock();
2646                                 return true;
2647                         }
2648         }
2649         rcu_read_unlock();
2650         return false;
2651 }
2652
2653 /*
2654  * There are a couple cases where we need to restrict the functionality of
2655  * devices associated with RMRRs.  The first is when evaluating a device for
2656  * identity mapping because problems exist when devices are moved in and out
2657  * of domains and their respective RMRR information is lost.  This means that
2658  * a device with associated RMRRs will never be in a "passthrough" domain.
2659  * The second is use of the device through the IOMMU API.  This interface
2660  * expects to have full control of the IOVA space for the device.  We cannot
2661  * satisfy both the requirement that RMRR access is maintained and have an
2662  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2663  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2664  * We therefore prevent devices associated with an RMRR from participating in
2665  * the IOMMU API, which eliminates them from device assignment.
2666  *
2667  * In both cases we assume that PCI USB devices with RMRRs have them largely
2668  * for historical reasons and that the RMRR space is not actively used post
2669  * boot.  This exclusion may change if vendors begin to abuse it.
2670  *
2671  * The same exception is made for graphics devices, with the requirement that
2672  * any use of the RMRR regions will be torn down before assigning the device
2673  * to a guest.
2674  */
2675 static bool device_is_rmrr_locked(struct device *dev)
2676 {
2677         if (!device_has_rmrr(dev))
2678                 return false;
2679
2680         if (dev_is_pci(dev)) {
2681                 struct pci_dev *pdev = to_pci_dev(dev);
2682
2683                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2684                         return false;
2685         }
2686
2687         return true;
2688 }
2689
2690 static int iommu_should_identity_map(struct device *dev, int startup)
2691 {
2692
2693         if (dev_is_pci(dev)) {
2694                 struct pci_dev *pdev = to_pci_dev(dev);
2695
2696                 if (device_is_rmrr_locked(dev))
2697                         return 0;
2698
2699                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2700                         return 1;
2701
2702                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2703                         return 1;
2704
2705                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2706                         return 0;
2707
2708                 /*
2709                  * We want to start off with all devices in the 1:1 domain, and
2710                  * take them out later if we find they can't access all of memory.
2711                  *
2712                  * However, we can't do this for PCI devices behind bridges,
2713                  * because all PCI devices behind the same bridge will end up
2714                  * with the same source-id on their transactions.
2715                  *
2716                  * Practically speaking, we can't change things around for these
2717                  * devices at run-time, because we can't be sure there'll be no
2718                  * DMA transactions in flight for any of their siblings.
2719                  *
2720                  * So PCI devices (unless they're on the root bus) as well as
2721                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2722                  * the 1:1 domain, just in _case_ one of their siblings turns out
2723                  * not to be able to map all of memory.
2724                  */
2725                 if (!pci_is_pcie(pdev)) {
2726                         if (!pci_is_root_bus(pdev->bus))
2727                                 return 0;
2728                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2729                                 return 0;
2730                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2731                         return 0;
2732         } else {
2733                 if (device_has_rmrr(dev))
2734                         return 0;
2735         }
2736
2737         /*
2738          * At boot time, we don't yet know if devices will be 64-bit capable.
2739          * Assume that they will — if they turn out not to be, then we can
2740          * take them out of the 1:1 domain later.
2741          */
2742         if (!startup) {
2743                 /*
2744                  * If the device's dma_mask is less than the system's memory
2745                  * size then this is not a candidate for identity mapping.
2746                  */
2747                 u64 dma_mask = *dev->dma_mask;
2748
2749                 if (dev->coherent_dma_mask &&
2750                     dev->coherent_dma_mask < dma_mask)
2751                         dma_mask = dev->coherent_dma_mask;
2752
2753                 return dma_mask >= dma_get_required_mask(dev);
2754         }
2755
2756         return 1;
2757 }
2758
2759 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2760 {
2761         int ret;
2762
2763         if (!iommu_should_identity_map(dev, 1))
2764                 return 0;
2765
2766         ret = domain_add_dev_info(si_domain, dev);
2767         if (!ret)
2768                 pr_info("%s identity mapping for device %s\n",
2769                         hw ? "Hardware" : "Software", dev_name(dev));
2770         else if (ret == -ENODEV)
2771                 /* device not associated with an iommu */
2772                 ret = 0;
2773
2774         return ret;
2775 }
2776
2777
2778 static int __init iommu_prepare_static_identity_mapping(int hw)
2779 {
2780         struct pci_dev *pdev = NULL;
2781         struct dmar_drhd_unit *drhd;
2782         struct intel_iommu *iommu;
2783         struct device *dev;
2784         int i;
2785         int ret = 0;
2786
2787         for_each_pci_dev(pdev) {
2788                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2789                 if (ret)
2790                         return ret;
2791         }
2792
2793         for_each_active_iommu(iommu, drhd)
2794                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2795                         struct acpi_device_physical_node *pn;
2796                         struct acpi_device *adev;
2797
2798                         if (dev->bus != &acpi_bus_type)
2799                                 continue;
2800
2801                         adev = to_acpi_device(dev);
2802                         mutex_lock(&adev->physical_node_lock);
2803                         list_for_each_entry(pn, &adev->physical_node_list, node) {
2804                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2805                                 if (ret)
2806                                         break;
2807                         }
2808                         mutex_unlock(&adev->physical_node_lock);
2809                         if (ret)
2810                                 return ret;
2811                 }
2812
2813         return 0;
2814 }
2815
2816 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2817 {
2818         /*
2819          * Start from a sane IOMMU hardware state.
2820          * If queued invalidation was already initialized by us (for
2821          * example, while enabling interrupt remapping), then things
2822          * are already rolling from a sane state.
2823          */
2824         if (!iommu->qi) {
2825                 /*
2826                  * Clear any previous faults.
2827                  */
2828                 dmar_fault(-1, iommu);
2829                 /*
2830                  * Disable queued invalidation if supported and already enabled
2831                  * before OS handover.
2832                  */
2833                 dmar_disable_qi(iommu);
2834         }
2835
2836         if (dmar_enable_qi(iommu)) {
2837                 /*
2838                  * Queued Invalidate not enabled, use Register Based Invalidate
2839                  */
2840                 iommu->flush.flush_context = __iommu_flush_context;
2841                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2842                 pr_info("%s: Using Register based invalidation\n",
2843                         iommu->name);
2844         } else {
2845                 iommu->flush.flush_context = qi_flush_context;
2846                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2847                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2848         }
2849 }
2850
2851 static int copy_context_table(struct intel_iommu *iommu,
2852                               struct root_entry *old_re,
2853                               struct context_entry **tbl,
2854                               int bus, bool ext)
2855 {
2856         struct context_entry *old_ce = NULL, *new_ce = NULL, ce;
2857         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2858         phys_addr_t old_ce_phys;
2859
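        /*
         * With the extended root-entry format each bus has two context
         * tables (a lower one for devfn 0-127 and an upper one for devfn
         * 128-255), so the destination table index is doubled.
         */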
2860         tbl_idx = ext ? bus * 2 : bus;
2861
2862         for (devfn = 0; devfn < 256; devfn++) {
2863                 /* First calculate the correct index */
2864                 idx = (ext ? devfn * 2 : devfn) % 256;
2865
2866                 if (idx == 0) {
2867                         /* First save what we may have and clean up */
2868                         if (new_ce) {
2869                                 tbl[tbl_idx] = new_ce;
2870                                 __iommu_flush_cache(iommu, new_ce,
2871                                                     VTD_PAGE_SIZE);
2872                                 pos = 1;
2873                         }
2874
2875                         if (old_ce)
2876                                 iounmap(old_ce);
2877
2878                         ret = 0;
2879                         if (devfn < 0x80)
2880                                 old_ce_phys = root_entry_lctp(old_re);
2881                         else
2882                                 old_ce_phys = root_entry_uctp(old_re);
2883
2884                         if (!old_ce_phys) {
2885                                 if (ext && devfn == 0) {
2886                                         /* No LCTP, try UCTP */
2887                                         devfn = 0x7f;
2888                                         continue;
2889                                 } else {
2890                                         goto out;
2891                                 }
2892                         }
2893
2894                         ret = -ENOMEM;
2895                         old_ce = ioremap_cache(old_ce_phys, PAGE_SIZE);
2896                         if (!old_ce)
2897                                 goto out;
2898
2899                         new_ce = alloc_pgtable_page(iommu->node);
2900                         if (!new_ce)
2901                                 goto out_unmap;
2902
2903                         ret = 0;
2904                 }
2905
2906                 /* Now copy the context entry */
2907                 ce = old_ce[idx];
2908
2909                 if (!__context_present(&ce))
2910                         continue;
2911
2912                 did = context_domain_id(&ce);
2913                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2914                         set_bit(did, iommu->domain_ids);
2915
2916                 /*
2917                  * We need a marker for copied context entries. This
2918                  * marker needs to work for the old format as well as
2919                  * for extended context entries.
2920                  *
2921                  * Bit 67 of the context entry is used. In the old
2922                  * format this bit is available to software, in the
2923                  * extended format it is the PGE bit, but PGE is ignored
2924                  * by HW if PASIDs are disabled (and thus still
2925                  * available).
2926                  *
2927                  * So disable PASIDs first and then mark the entry
2928                  * copied. This means that we don't copy PASID
2929                  * translations from the old kernel, but this is fine as
2930                  * faults there are not fatal.
2931                  */
2932                 context_clear_pasid_enable(&ce);
2933                 context_set_copied(&ce);
2934
2935                 new_ce[idx] = ce;
2936         }
2937
2938         tbl[tbl_idx + pos] = new_ce;
2939
2940         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2941
2942 out_unmap:
2943         iounmap(old_ce);
2944
2945 out:
2946         return ret;
2947 }
2948
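/*
 * Called when this IOMMU was left with translation enabled by a previous
 * kernel (e.g. when booting into a kdump kernel): copy the old kernel's
 * context tables into this kernel's root/context structures so that DMA
 * set up by the old kernel keeps translating while this kernel takes over.
 */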
2949 static int copy_translation_tables(struct intel_iommu *iommu)
2950 {
2951         struct context_entry **ctxt_tbls;
2952         struct root_entry *old_rt;
2953         phys_addr_t old_rt_phys;
2954         int ctxt_table_entries;
2955         unsigned long flags;
2956         u64 rtaddr_reg;
2957         int bus, ret;
2958         bool new_ext, ext;
2959
2960         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2961         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2962         new_ext    = !!ecap_ecs(iommu->ecap);
2963
2964         /*
2965          * The RTT bit can only be changed when translation is disabled,
2966          * but disabling translation means to open a window for data
2967          * corruption. So bail out and don't copy anything if we would
2968          * have to change the bit.
2969          */
2970         if (new_ext != ext)
2971                 return -EINVAL;
2972
2973         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2974         if (!old_rt_phys)
2975                 return -EINVAL;
2976
2977         old_rt = ioremap_cache(old_rt_phys, PAGE_SIZE);
2978         if (!old_rt)
2979                 return -ENOMEM;
2980
2981         /* This is too big for the stack - allocate it from slab */
2982         ctxt_table_entries = ext ? 512 : 256;
2983         ret = -ENOMEM;
2984         ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
2985         if (!ctxt_tbls)
2986                 goto out_unmap;
2987
2988         for (bus = 0; bus < 256; bus++) {
2989                 ret = copy_context_table(iommu, &old_rt[bus],
2990                                          ctxt_tbls, bus, ext);
2991                 if (ret) {
2992                         pr_err("%s: Failed to copy context table for bus %d\n",
2993                                 iommu->name, bus);
2994                         continue;
2995                 }
2996         }
2997
2998         spin_lock_irqsave(&iommu->lock, flags);
2999
3000         /* Context tables are copied, now write them to the root_entry table */
3001         for (bus = 0; bus < 256; bus++) {
3002                 int idx = ext ? bus * 2 : bus;
3003                 u64 val;
3004
3005                 if (ctxt_tbls[idx]) {
3006                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3007                         iommu->root_entry[bus].lo = val;
3008                 }
3009
3010                 if (!ext || !ctxt_tbls[idx + 1])
3011                         continue;
3012
3013                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3014                 iommu->root_entry[bus].hi = val;
3015         }
3016
3017         spin_unlock_irqrestore(&iommu->lock, flags);
3018
3019         kfree(ctxt_tbls);
3020
3021         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3022
3023         ret = 0;
3024
3025 out_unmap:
3026         iounmap(old_rt);
3027
3028         return ret;
3029 }
3030
3031 static int __init init_dmars(void)
3032 {
3033         struct dmar_drhd_unit *drhd;
3034         struct dmar_rmrr_unit *rmrr;
3035         bool copied_tables = false;
3036         struct device *dev;
3037         struct intel_iommu *iommu;
3038         int i, ret;
3039
3040         /*
3041          * for each drhd
3042          *    allocate root
3043          *    initialize and program root entry to not present
3044          * endfor
3045          */
3046         for_each_drhd_unit(drhd) {
3047                 /*
3048                  * No lock needed: this is only incremented in the single-
3049                  * threaded kernel __init code path; all other accesses are
3050                  * read-only.
3051                  */
3052                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3053                         g_num_of_iommus++;
3054                         continue;
3055                 }
3056                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3057         }
3058
3059         /* Preallocate enough resources for IOMMU hot-addition */
3060         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3061                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3062
3063         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3064                         GFP_KERNEL);
3065         if (!g_iommus) {
3066                 pr_err("Allocating global iommu array failed\n");
3067                 ret = -ENOMEM;
3068                 goto error;
3069         }
3070
3071         deferred_flush = kzalloc(g_num_of_iommus *
3072                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
3073         if (!deferred_flush) {
3074                 ret = -ENOMEM;
3075                 goto free_g_iommus;
3076         }
3077
3078         for_each_active_iommu(iommu, drhd) {
3079                 g_iommus[iommu->seq_id] = iommu;
3080
3081                 intel_iommu_init_qi(iommu);
3082
3083                 ret = iommu_init_domains(iommu);
3084                 if (ret)
3085                         goto free_iommu;
3086
3087                 init_translation_status(iommu);
3088
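                     /*
                      * Translation left enabled outside of a kdump kernel is
                      * unexpected: disable it and start from clean tables
                      * rather than trusting whatever was left behind.
                      */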
3089                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3090                         iommu_disable_translation(iommu);
3091                         clear_translation_pre_enabled(iommu);
3092                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3093                                 iommu->name);
3094                 }
3095
3096                 /*
3097                  * TBD:
3098                  * we could share the same root & context tables
3099                  * among all IOMMUs. Need to split it later.
3100                  */
3101                 ret = iommu_alloc_root_entry(iommu);
3102                 if (ret)
3103                         goto free_iommu;
3104
3105                 if (translation_pre_enabled(iommu)) {
3106                         pr_info("Translation already enabled - trying to copy translation structures\n");
3107
3108                         ret = copy_translation_tables(iommu);
3109                         if (ret) {
3110                                 /*
3111                                  * We found the IOMMU with translation
3112                                  * enabled - but failed to copy over the
3113                                  * old root-entry table. Try to proceed
3114                                  * by disabling translation now and
3115                                  * allocating a clean root-entry table.
3116                                  * This might cause DMAR faults, but
3117                                  * probably the dump will still succeed.
3118                                  */
3119                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3120                                        iommu->name);
3121                                 iommu_disable_translation(iommu);
3122                                 clear_translation_pre_enabled(iommu);
3123                         } else {
3124                                 pr_info("Copied translation tables from previous kernel for %s\n",
3125                                         iommu->name);
3126                                 copied_tables = true;
3127                         }
3128                 }
3129
3130                 iommu_flush_write_buffer(iommu);
3131                 iommu_set_root_entry(iommu);
3132                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3133                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3134
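                     /* Hardware pass-through is only usable if every IOMMU supports it. */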
3135                 if (!ecap_pass_through(iommu->ecap))
3136                         hw_pass_through = 0;
3137         }
3138
3139         if (iommu_pass_through)
3140                 iommu_identity_mapping |= IDENTMAP_ALL;
3141
3142 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3143         iommu_identity_mapping |= IDENTMAP_GFX;
3144 #endif
3145
3146         if (iommu_identity_mapping) {
3147                 ret = si_domain_init(hw_pass_through);
3148                 if (ret)
3149                         goto free_iommu;
3150         }
3151
3152         check_tylersburg_isoch();
3153
3154         /*
3155          * If we copied translations from a previous kernel in the kdump
3156          * case, we can not assign the devices to domains now, as that
3157          * would eliminate the old mappings. So skip this part and defer
3158          * the assignment to device driver initialization time.
3159          */
3160         if (copied_tables)
3161                 goto domains_done;
3162
3163         /*
3164          * If iommu_identity_mapping is set (for all devices or just gfx),
3165          * prepare the static identity mapping now, using hardware
3166          * pass-through where available; RMRR and ISA regions follow below.
3167          */
3168         if (iommu_identity_mapping) {
3169                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3170                 if (ret) {
3171                         pr_crit("Failed to setup IOMMU pass-through\n");
3172                         goto free_iommu;
3173                 }
3174         }
3175         /*
3176          * For each rmrr
3177          *   for each dev attached to rmrr
3178          *   do
3179          *     locate drhd for dev, alloc domain for dev
3180          *     allocate free domain
3181          *     allocate page table entries for rmrr
3182          *     if context not allocated for bus
3183          *           allocate and init context
3184          *           set present in root table for this bus
3185          *     init context with domain, translation etc
3186          *    endfor
3187          * endfor
3188          */
3189         pr_info("Setting RMRR:\n");
3190         for_each_rmrr_units(rmrr) {
3191                 /* Some BIOSes list non-existent devices in the DMAR table. */
3192                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3193                                           i, dev) {
3194                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3195                         if (ret)
3196                                 pr_err("Mapping reserved region failed\n");
3197                 }
3198         }
3199
3200         iommu_prepare_isa();
3201
3202 domains_done:
3203
3204         /*
3205          * for each drhd
3206          *   enable fault log
3207          *   global invalidate context cache
3208          *   global invalidate iotlb
3209          *   enable translation
3210          */
3211         for_each_iommu(iommu, drhd) {
3212                 if (drhd->ignored) {
3213                         /*
3214                          * we always have to disable PMRs or DMA may fail on
3215                          * this device
3216                          */
3217                         if (force_on)
3218                                 iommu_disable_protect_mem_regions(iommu);
3219                         continue;
3220                 }
3221
3222                 iommu_flush_write_buffer(iommu);
3223
3224                 ret = dmar_set_interrupt(iommu);
3225                 if (ret)
3226                         goto free_iommu;
3227
3228                 if (!translation_pre_enabled(iommu))
3229                         iommu_enable_translation(iommu);
3230
3231                 iommu_disable_protect_mem_regions(iommu);
3232         }
3233
3234         return 0;
3235
3236 free_iommu:
3237         for_each_active_iommu(iommu, drhd) {
3238                 disable_dmar_iommu(iommu);
3239                 free_dmar_iommu(iommu);
3240         }
3241         kfree(deferred_flush);
3242 free_g_iommus:
3243         kfree(g_iommus);
3244 error:
3245         return ret;
3246 }
3247
3248 /* This takes a number of _MM_ pages, not VTD pages */
3249 static struct iova *intel_alloc_iova(struct device *dev,
3250                                      struct dmar_domain *domain,
3251                                      unsigned long nrpages, uint64_t dma_mask)
3252 {
3253         struct iova *iova = NULL;
3254
3255         /* Restrict dma_mask to the width that the iommu can handle */
3256         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3257
3258         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3259                 /*
3260                  * First try to allocate an io virtual address in
3261                  * DMA_BIT_MASK(32) and if that fails then try allocating
3262                  * from higher range
3263                  */
3264                 iova = alloc_iova(&domain->iovad, nrpages,
3265                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
3266                 if (iova)
3267                         return iova;
3268         }
3269         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
3270         if (unlikely(!iova)) {
3271                 pr_err("Allocating %ld-page iova for %s failed\n",
3272                        nrpages, dev_name(dev));
3273                 return NULL;
3274         }
3275
3276         return iova;
3277 }
3278
3279 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
3280 {
3281         struct dmar_domain *domain;
3282         int ret;
3283
3284         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3285         if (!domain) {
3286                 pr_err("Allocating domain for %s failed\n",
3287                        dev_name(dev));
3288                 return NULL;
3289         }
3290
3291         /* make sure context mapping is ok */
3292         if (unlikely(!domain_context_mapped(dev))) {
3293                 ret = domain_context_mapping(domain, dev);
3294                 if (ret) {
3295                         pr_err("Domain context map for %s failed\n",
3296                                dev_name(dev));
3297                         return NULL;
3298                 }
3299         }
3300
3301         return domain;
3302 }
3303
3304 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3305 {
3306         struct device_domain_info *info;
3307
3308         /* No lock here, assumes no domain exit in normal case */
3309         info = dev->archdata.iommu;
3310         if (likely(info))
3311                 return info->domain;
3312
3313         return __get_valid_domain_for_dev(dev);
3314 }
3315
3316 /* Check if the dev needs to go through the non-identity map and unmap process. */
3317 static int iommu_no_mapping(struct device *dev)
3318 {
3319         int found;
3320
3321         if (iommu_dummy(dev))
3322                 return 1;
3323
3324         if (!iommu_identity_mapping)
3325                 return 0;
3326
3327         found = identity_mapping(dev);
3328         if (found) {
3329                 if (iommu_should_identity_map(dev, 0))
3330                         return 1;
3331                 else {
3332                         /*
3333                          * The 32 bit DMA device is removed from si_domain
3334                          * and falls back to non-identity mapping.
3335                          */
3336                         domain_remove_one_dev_info(si_domain, dev);
3337                         pr_info("32bit %s uses non-identity mapping\n",
3338                                 dev_name(dev));
3339                         return 0;
3340                 }
3341         } else {
3342                 /*
3343                  * When a 64 bit DMA device is detached from a VM, it is
3344                  * put back into si_domain for identity mapping.
3345                  */
3346                 if (iommu_should_identity_map(dev, 0)) {
3347                         int ret;
3348                         ret = domain_add_dev_info(si_domain, dev);
3349                         if (!ret) {
3350                                 pr_info("64bit %s uses identity mapping\n",
3351                                         dev_name(dev));
3352                                 return 1;
3353                         }
3354                 }
3355         }
3356
3357         return 0;
3358 }
3359
3360 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3361                                      size_t size, int dir, u64 dma_mask)
3362 {
3363         struct dmar_domain *domain;
3364         phys_addr_t start_paddr;
3365         struct iova *iova;
3366         int prot = 0;
3367         int ret;
3368         struct intel_iommu *iommu;
3369         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3370
3371         BUG_ON(dir == DMA_NONE);
3372
3373         if (iommu_no_mapping(dev))
3374                 return paddr;
3375
3376         domain = get_valid_domain_for_dev(dev);
3377         if (!domain)
3378                 return 0;
3379
3380         iommu = domain_get_iommu(domain);
3381         size = aligned_nrpages(paddr, size);
3382
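             /*
              * 'size' is now a count of VT-d pages; intel_alloc_iova() takes
              * MM pages, hence the dma_to_mm_pfn() conversion.
              */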
3383         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3384         if (!iova)
3385                 goto error;
3386
3387         /*
3388          * Check if DMAR supports zero-length reads on write only
3389          * mappings.
3390          */
3391         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3392                         !cap_zlr(iommu->cap))
3393                 prot |= DMA_PTE_READ;
3394         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3395                 prot |= DMA_PTE_WRITE;
3396         /*
3397          * The range paddr .. paddr + size may cover partial pages, so map
3398          * whole pages.  Note: if two parts of one page are mapped
3399          * separately, we may end up with two guest addresses mapping to
3400          * the same host paddr, but this is not a big problem.
3401          */
3402         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3403                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3404         if (ret)
3405                 goto error;
3406
3407         /* it's a non-present to present mapping. Only flush if caching mode */
3408         if (cap_caching_mode(iommu->cap))
3409                 iommu_flush_iotlb_psi(iommu, domain,
3410                                       mm_to_dma_pfn(iova->pfn_lo),
3411                                       size, 0, 1);
3412         else
3413                 iommu_flush_write_buffer(iommu);
3414
3415         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3416         start_paddr += paddr & ~PAGE_MASK;
3417         return start_paddr;
3418
3419 error:
3420         if (iova)
3421                 __free_iova(&domain->iovad, iova);
3422         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3423                 dev_name(dev), size, (unsigned long long)paddr, dir);
3424         return 0;
3425 }
3426
3427 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3428                                  unsigned long offset, size_t size,
3429                                  enum dma_data_direction dir,
3430                                  struct dma_attrs *attrs)
3431 {
3432         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3433                                   dir, *dev->dma_mask);
3434 }
3435
3436 static void flush_unmaps(void)
3437 {
3438         int i, j;
3439
3440         timer_on = 0;
3441
3442         /* just flush them all */
3443         for (i = 0; i < g_num_of_iommus; i++) {
3444                 struct intel_iommu *iommu = g_iommus[i];
3445                 if (!iommu)
3446                         continue;
3447
3448                 if (!deferred_flush[i].next)
3449                         continue;
3450
3451                 /* In caching mode, global flushes make emulation expensive */
3452                 if (!cap_caching_mode(iommu->cap))
3453                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3454                                          DMA_TLB_GLOBAL_FLUSH);
3455                 for (j = 0; j < deferred_flush[i].next; j++) {
3456                         unsigned long mask;
3457                         struct iova *iova = deferred_flush[i].iova[j];
3458                         struct dmar_domain *domain = deferred_flush[i].domain[j];
3459
3460                         /* On real hardware multiple invalidations are expensive */
3461                         if (cap_caching_mode(iommu->cap))
3462                                 iommu_flush_iotlb_psi(iommu, domain,
3463                                         iova->pfn_lo, iova_size(iova),
3464                                         !deferred_flush[i].freelist[j], 0);
3465                         else {
3466                                 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3467                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3468                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3469                         }
3470                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3471                         if (deferred_flush[i].freelist[j])
3472                                 dma_free_pagelist(deferred_flush[i].freelist[j]);
3473                 }
3474                 deferred_flush[i].next = 0;
3475         }
3476
3477         list_size = 0;
3478 }
3479
3480 static void flush_unmaps_timeout(unsigned long data)
3481 {
3482         unsigned long flags;
3483
3484         spin_lock_irqsave(&async_umap_flush_lock, flags);
3485         flush_unmaps();
3486         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3487 }
3488
3489 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3490 {
3491         unsigned long flags;
3492         int next, iommu_id;
3493         struct intel_iommu *iommu;
3494
3495         spin_lock_irqsave(&async_umap_flush_lock, flags);
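             /* If the deferred list is full, flush it synchronously before queueing more. */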
3496         if (list_size == HIGH_WATER_MARK)
3497                 flush_unmaps();
3498
3499         iommu = domain_get_iommu(dom);
3500         iommu_id = iommu->seq_id;
3501
3502         next = deferred_flush[iommu_id].next;
3503         deferred_flush[iommu_id].domain[next] = dom;
3504         deferred_flush[iommu_id].iova[next] = iova;
3505         deferred_flush[iommu_id].freelist[next] = freelist;
3506         deferred_flush[iommu_id].next++;
3507
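             /*
              * Arm a 10ms timer so queued unmaps are still flushed even if
              * the high-water mark is never reached.
              */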
3508         if (!timer_on) {
3509                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3510                 timer_on = 1;
3511         }
3512         list_size++;
3513         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3514 }
3515
3516 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3517 {
3518         struct dmar_domain *domain;
3519         unsigned long start_pfn, last_pfn;
3520         struct iova *iova;
3521         struct intel_iommu *iommu;
3522         struct page *freelist;
3523
3524         if (iommu_no_mapping(dev))
3525                 return;
3526
3527         domain = find_domain(dev);
3528         BUG_ON(!domain);
3529
3530         iommu = domain_get_iommu(domain);
3531
3532         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3533         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3534                       (unsigned long long)dev_addr))
3535                 return;
3536
3537         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3538         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3539
3540         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3541                  dev_name(dev), start_pfn, last_pfn);
3542
3543         freelist = domain_unmap(domain, start_pfn, last_pfn);
3544
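             /*
              * In strict mode flush the IOTLB and free the IOVA immediately;
              * otherwise defer both to the batched flush path.
              */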
3545         if (intel_iommu_strict) {
3546                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3547                                       last_pfn - start_pfn + 1, !freelist, 0);
3548                 /* free iova */
3549                 __free_iova(&domain->iovad, iova);
3550                 dma_free_pagelist(freelist);
3551         } else {
3552                 add_unmap(domain, iova, freelist);
3553                 /*
3554                  * queue up the release of the unmap to save the ~1/6th of
3555                  * the CPU time used up by the IOTLB flush operation...
3556                  */
3557         }
3558 }
3559
3560 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3561                              size_t size, enum dma_data_direction dir,
3562                              struct dma_attrs *attrs)
3563 {
3564         intel_unmap(dev, dev_addr);
3565 }
3566
3567 static void *intel_alloc_coherent(struct device *dev, size_t size,
3568                                   dma_addr_t *dma_handle, gfp_t flags,
3569                                   struct dma_attrs *attrs)
3570 {
3571         struct page *page = NULL;
3572         int order;
3573
3574         size = PAGE_ALIGN(size);
3575         order = get_order(size);
3576
3577         if (!iommu_no_mapping(dev))
3578                 flags &= ~(GFP_DMA | GFP_DMA32);
3579         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3580                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3581                         flags |= GFP_DMA;
3582                 else
3583                         flags |= GFP_DMA32;
3584         }
3585
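             /*
              * For blocking allocations try the contiguous (CMA) allocator
              * first; release the pages again if they land above the
              * coherent DMA mask of an untranslated device.
              */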
3586         if (flags & __GFP_WAIT) {
3587                 unsigned int count = size >> PAGE_SHIFT;
3588
3589                 page = dma_alloc_from_contiguous(dev, count, order);
3590                 if (page && iommu_no_mapping(dev) &&
3591                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3592                         dma_release_from_contiguous(dev, page, count);
3593                         page = NULL;
3594                 }
3595         }
3596
3597         if (!page)
3598                 page = alloc_pages(flags, order);
3599         if (!page)
3600                 return NULL;
3601         memset(page_address(page), 0, size);
3602
3603         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3604                                          DMA_BIDIRECTIONAL,
3605                                          dev->coherent_dma_mask);
3606         if (*dma_handle)
3607                 return page_address(page);
3608         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3609                 __free_pages(page, order);
3610
3611         return NULL;
3612 }
3613
3614 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3615                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3616 {
3617         int order;
3618         struct page *page = virt_to_page(vaddr);
3619
3620         size = PAGE_ALIGN(size);
3621         order = get_order(size);
3622
3623         intel_unmap(dev, dma_handle);
3624         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3625                 __free_pages(page, order);
3626 }
3627
3628 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3629                            int nelems, enum dma_data_direction dir,
3630                            struct dma_attrs *attrs)
3631 {
3632         intel_unmap(dev, sglist[0].dma_address);
3633 }
3634
3635 static int intel_nontranslate_map_sg(struct device *hddev,
3636         struct scatterlist *sglist, int nelems, int dir)
3637 {
3638         int i;
3639         struct scatterlist *sg;
3640
3641         for_each_sg(sglist, sg, nelems, i) {
3642                 BUG_ON(!sg_page(sg));
3643                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3644                 sg->dma_length = sg->length;
3645         }
3646         return nelems;
3647 }
3648
3649 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3650                         enum dma_data_direction dir, struct dma_attrs *attrs)
3651 {
3652         int i;
3653         struct dmar_domain *domain;
3654         size_t size = 0;
3655         int prot = 0;
3656         struct iova *iova = NULL;
3657         int ret;
3658         struct scatterlist *sg;
3659         unsigned long start_vpfn;
3660         struct intel_iommu *iommu;
3661
3662         BUG_ON(dir == DMA_NONE);
3663         if (iommu_no_mapping(dev))
3664                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3665
3666         domain = get_valid_domain_for_dev(dev);
3667         if (!domain)
3668                 return 0;
3669
3670         iommu = domain_get_iommu(domain);
3671
3672         for_each_sg(sglist, sg, nelems, i)
3673                 size += aligned_nrpages(sg->offset, sg->length);
3674
3675         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3676                                 *dev->dma_mask);
3677         if (!iova) {
3678                 sglist->dma_length = 0;
3679                 return 0;
3680         }
3681
3682         /*
3683          * Check if DMAR supports zero-length reads on write only
3684          * mappings.
3685          */
3686         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3687                         !cap_zlr(iommu->cap))
3688                 prot |= DMA_PTE_READ;
3689         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3690                 prot |= DMA_PTE_WRITE;
3691
3692         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3693
3694         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3695         if (unlikely(ret)) {
3696                 dma_pte_free_pagetable(domain, start_vpfn,
3697                                        start_vpfn + size - 1);
3698                 __free_iova(&domain->iovad, iova);
3699                 return 0;
3700         }
3701
3702         /* it's a non-present to present mapping. Only flush if caching mode */
3703         if (cap_caching_mode(iommu->cap))
3704                 iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3705         else
3706                 iommu_flush_write_buffer(iommu);
3707
3708         return nelems;
3709 }
3710
3711 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3712 {
3713         return !dma_addr;
3714 }
3715
3716 struct dma_map_ops intel_dma_ops = {
3717         .alloc = intel_alloc_coherent,
3718         .free = intel_free_coherent,
3719         .map_sg = intel_map_sg,
3720         .unmap_sg = intel_unmap_sg,
3721         .map_page = intel_map_page,
3722         .unmap_page = intel_unmap_page,
3723         .mapping_error = intel_mapping_error,
3724 };
3725
3726 static inline int iommu_domain_cache_init(void)
3727 {
3728         int ret = 0;
3729
3730         iommu_domain_cache = kmem_cache_create("iommu_domain",
3731                                          sizeof(struct dmar_domain),
3732                                          0,
3733                                          SLAB_HWCACHE_ALIGN,
3735                                          NULL);
3736         if (!iommu_domain_cache) {
3737                 pr_err("Couldn't create iommu_domain cache\n");
3738                 ret = -ENOMEM;
3739         }
3740
3741         return ret;
3742 }
3743
3744 static inline int iommu_devinfo_cache_init(void)
3745 {
3746         int ret = 0;
3747
3748         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3749                                          sizeof(struct device_domain_info),
3750                                          0,
3751                                          SLAB_HWCACHE_ALIGN,
3752                                          NULL);
3753         if (!iommu_devinfo_cache) {
3754                 pr_err("Couldn't create devinfo cache\n");
3755                 ret = -ENOMEM;
3756         }
3757
3758         return ret;
3759 }
3760
3761 static int __init iommu_init_mempool(void)
3762 {
3763         int ret;
3764         ret = iommu_iova_cache_init();
3765         if (ret)
3766                 return ret;
3767
3768         ret = iommu_domain_cache_init();
3769         if (ret)
3770                 goto domain_error;
3771
3772         ret = iommu_devinfo_cache_init();
3773         if (!ret)
3774                 return ret;
3775
3776         kmem_cache_destroy(iommu_domain_cache);
3777 domain_error:
3778         iommu_iova_cache_destroy();
3779
3780         return -ENOMEM;
3781 }
3782
3783 static void __init iommu_exit_mempool(void)
3784 {
3785         kmem_cache_destroy(iommu_devinfo_cache);
3786         kmem_cache_destroy(iommu_domain_cache);
3787         iommu_iova_cache_destroy();
3788 }
3789
3790 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3791 {
3792         struct dmar_drhd_unit *drhd;
3793         u32 vtbar;
3794         int rc;
3795
3796         /* We know that this device on this chipset has its own IOMMU.
3797          * If we find it under a different IOMMU, then the BIOS is lying
3798          * to us. Hope that the IOMMU for this device is actually
3799          * disabled, and it needs no translation...
3800          */
3801         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3802         if (rc) {
3803                 /* "can't" happen */
3804                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3805                 return;
3806         }
3807         vtbar &= 0xffff0000;
3808
3809         /* we know that this iommu should be at offset 0xa000 from vtbar */
3810         drhd = dmar_find_matched_drhd_unit(pdev);
3811         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3812                             TAINT_FIRMWARE_WORKAROUND,
3813                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3814                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3815 }
3816 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3817
3818 static void __init init_no_remapping_devices(void)
3819 {
3820         struct dmar_drhd_unit *drhd;
3821         struct device *dev;
3822         int i;
3823
3824         for_each_drhd_unit(drhd) {
3825                 if (!drhd->include_all) {
3826                         for_each_active_dev_scope(drhd->devices,
3827                                                   drhd->devices_cnt, i, dev)
3828                                 break;
3829                         /* ignore DMAR unit if no devices exist */
3830                         if (i == drhd->devices_cnt)
3831                                 drhd->ignored = 1;
3832                 }
3833         }
3834
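             /*
              * Second pass: DRHD units that cover only graphics devices are
              * either kept (advertising intel_iommu_gfx_mapped) or bypassed
              * entirely, depending on dmar_map_gfx.
              */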
3835         for_each_active_drhd_unit(drhd) {
3836                 if (drhd->include_all)
3837                         continue;
3838
3839                 for_each_active_dev_scope(drhd->devices,
3840                                           drhd->devices_cnt, i, dev)
3841                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3842                                 break;
3843                 if (i < drhd->devices_cnt)
3844                         continue;
3845
3846                 /* This IOMMU has *only* gfx devices. Either bypass it or
3847                    set the gfx_mapped flag, as appropriate */
3848                 if (dmar_map_gfx) {
3849                         intel_iommu_gfx_mapped = 1;
3850                 } else {
3851                         drhd->ignored = 1;
3852                         for_each_active_dev_scope(drhd->devices,
3853                                                   drhd->devices_cnt, i, dev)
3854                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3855                 }
3856         }
3857 }
3858
3859 #ifdef CONFIG_SUSPEND
3860 static int init_iommu_hw(void)
3861 {
3862         struct dmar_drhd_unit *drhd;
3863         struct intel_iommu *iommu = NULL;
3864
3865         for_each_active_iommu(iommu, drhd)
3866                 if (iommu->qi)
3867                         dmar_reenable_qi(iommu);
3868
3869         for_each_iommu(iommu, drhd) {
3870                 if (drhd->ignored) {
3871                         /*
3872                          * we always have to disable PMRs or DMA may fail on
3873                          * this device
3874                          */
3875                         if (force_on)
3876                                 iommu_disable_protect_mem_regions(iommu);
3877                         continue;
3878                 }
3879
3880                 iommu_flush_write_buffer(iommu);
3881
3882                 iommu_set_root_entry(iommu);
3883
3884                 iommu->flush.flush_context(iommu, 0, 0, 0,
3885                                            DMA_CCMD_GLOBAL_INVL);
3886                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3887                 iommu_enable_translation(iommu);
3888                 iommu_disable_protect_mem_regions(iommu);
3889         }
3890
3891         return 0;
3892 }
3893
3894 static void iommu_flush_all(void)
3895 {
3896         struct dmar_drhd_unit *drhd;
3897         struct intel_iommu *iommu;
3898
3899         for_each_active_iommu(iommu, drhd) {
3900                 iommu->flush.flush_context(iommu, 0, 0, 0,
3901                                            DMA_CCMD_GLOBAL_INVL);
3902                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3903                                          DMA_TLB_GLOBAL_FLUSH);
3904         }
3905 }
3906
3907 static int iommu_suspend(void)
3908 {
3909         struct dmar_drhd_unit *drhd;
3910         struct intel_iommu *iommu = NULL;
3911         unsigned long flag;
3912
3913         for_each_active_iommu(iommu, drhd) {
3914                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3915                                                  GFP_ATOMIC);
3916                 if (!iommu->iommu_state)
3917                         goto nomem;
3918         }
3919
3920         iommu_flush_all();
3921
3922         for_each_active_iommu(iommu, drhd) {
3923                 iommu_disable_translation(iommu);
3924
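                     /* Save the fault-event registers so they can be restored on resume. */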
3925                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3926
3927                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3928                         readl(iommu->reg + DMAR_FECTL_REG);
3929                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3930                         readl(iommu->reg + DMAR_FEDATA_REG);
3931                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3932                         readl(iommu->reg + DMAR_FEADDR_REG);
3933                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3934                         readl(iommu->reg + DMAR_FEUADDR_REG);
3935
3936                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3937         }
3938         return 0;
3939
3940 nomem:
3941         for_each_active_iommu(iommu, drhd)
3942                 kfree(iommu->iommu_state);
3943
3944         return -ENOMEM;
3945 }
3946
3947 static void iommu_resume(void)
3948 {
3949         struct dmar_drhd_unit *drhd;
3950         struct intel_iommu *iommu = NULL;
3951         unsigned long flag;
3952
3953         if (init_iommu_hw()) {
3954                 if (force_on)
3955                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3956                 else
3957                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3958                 return;
3959         }
3960
3961         for_each_active_iommu(iommu, drhd) {
3962
3963                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3964
3965                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3966                         iommu->reg + DMAR_FECTL_REG);
3967                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3968                         iommu->reg + DMAR_FEDATA_REG);
3969                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3970                         iommu->reg + DMAR_FEADDR_REG);
3971                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3972                         iommu->reg + DMAR_FEUADDR_REG);
3973
3974                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3975         }
3976
3977         for_each_active_iommu(iommu, drhd)
3978                 kfree(iommu->iommu_state);
3979 }
3980
3981 static struct syscore_ops iommu_syscore_ops = {
3982         .resume         = iommu_resume,
3983         .suspend        = iommu_suspend,
3984 };
3985
3986 static void __init init_iommu_pm_ops(void)
3987 {
3988         register_syscore_ops(&iommu_syscore_ops);
3989 }
3990
3991 #else
3992 static inline void init_iommu_pm_ops(void) {}
3993 #endif  /* CONFIG_SUSPEND */
3994
3995
3996 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3997 {
3998         struct acpi_dmar_reserved_memory *rmrr;
3999         struct dmar_rmrr_unit *rmrru;
4000
4001         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4002         if (!rmrru)
4003                 return -ENOMEM;
4004
4005         rmrru->hdr = header;
4006         rmrr = (struct acpi_dmar_reserved_memory *)header;
4007         rmrru->base_address = rmrr->base_address;
4008         rmrru->end_address = rmrr->end_address;
4009         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4010                                 ((void *)rmrr) + rmrr->header.length,
4011                                 &rmrru->devices_cnt);
4012         if (rmrru->devices_cnt && rmrru->devices == NULL) {
4013                 kfree(rmrru);
4014                 return -ENOMEM;
4015         }
4016
4017         list_add(&rmrru->list, &dmar_rmrr_units);
4018
4019         return 0;
4020 }
4021
4022 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4023 {
4024         struct dmar_atsr_unit *atsru;
4025         struct acpi_dmar_atsr *tmp;
4026
4027         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4028                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4029                 if (atsr->segment != tmp->segment)
4030                         continue;
4031                 if (atsr->header.length != tmp->header.length)
4032                         continue;
4033                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4034                         return atsru;
4035         }
4036
4037         return NULL;
4038 }
4039
4040 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4041 {
4042         struct acpi_dmar_atsr *atsr;
4043         struct dmar_atsr_unit *atsru;
4044
4045         if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
4046                 return 0;
4047
4048         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4049         atsru = dmar_find_atsr(atsr);
4050         if (atsru)
4051                 return 0;
4052
4053         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4054         if (!atsru)
4055                 return -ENOMEM;
4056
4057         /*
4058          * If memory is allocated from slab by ACPI _DSM method, we need to
4059          * copy the memory content because the memory buffer will be freed
4060          * on return.
4061          */
4062         atsru->hdr = (void *)(atsru + 1);
4063         memcpy(atsru->hdr, hdr, hdr->length);
4064         atsru->include_all = atsr->flags & 0x1;
4065         if (!atsru->include_all) {
4066                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4067                                 (void *)atsr + atsr->header.length,
4068                                 &atsru->devices_cnt);
4069                 if (atsru->devices_cnt && atsru->devices == NULL) {
4070                         kfree(atsru);
4071                         return -ENOMEM;
4072                 }
4073         }
4074
4075         list_add_rcu(&atsru->list, &dmar_atsr_units);
4076
4077         return 0;
4078 }
4079
4080 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4081 {
4082         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4083         kfree(atsru);
4084 }
4085
4086 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4087 {
4088         struct acpi_dmar_atsr *atsr;
4089         struct dmar_atsr_unit *atsru;
4090
4091         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4092         atsru = dmar_find_atsr(atsr);
4093         if (atsru) {
4094                 list_del_rcu(&atsru->list);
4095                 synchronize_rcu();
4096                 intel_iommu_free_atsr(atsru);
4097         }
4098
4099         return 0;
4100 }
4101
4102 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4103 {
4104         int i;
4105         struct device *dev;
4106         struct acpi_dmar_atsr *atsr;
4107         struct dmar_atsr_unit *atsru;
4108
4109         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4110         atsru = dmar_find_atsr(atsr);
4111         if (!atsru)
4112                 return 0;
4113
4114         if (!atsru->include_all && atsru->devices && atsru->devices_cnt)
4115                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4116                                           i, dev)
4117                         return -EBUSY;
4118
4119         return 0;
4120 }
4121
4122 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4123 {
4124         int sp, ret = 0;
4125         struct intel_iommu *iommu = dmaru->iommu;
4126
4127         if (g_iommus[iommu->seq_id])
4128                 return 0;
4129
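             /*
              * A hot-added IOMMU must not weaken capabilities the running
              * system already depends on: pass-through, snoop control and
              * the supported superpage sizes are all checked first.
              */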
4130         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4131                 pr_warn("%s: Doesn't support hardware pass through.\n",
4132                         iommu->name);
4133                 return -ENXIO;
4134         }
4135         if (!ecap_sc_support(iommu->ecap) &&
4136             domain_update_iommu_snooping(iommu)) {
4137                 pr_warn("%s: Doesn't support snooping.\n",
4138                         iommu->name);
4139                 return -ENXIO;
4140         }
4141         sp = domain_update_iommu_superpage(iommu) - 1;
4142         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4143                 pr_warn("%s: Doesn't support large page.\n",
4144                         iommu->name);
4145                 return -ENXIO;
4146         }
4147
4148         /*
4149          * Disable translation if already enabled prior to OS handover.
4150          */
4151         if (iommu->gcmd & DMA_GCMD_TE)
4152                 iommu_disable_translation(iommu);
4153
4154         g_iommus[iommu->seq_id] = iommu;
4155         ret = iommu_init_domains(iommu);
4156         if (ret == 0)
4157                 ret = iommu_alloc_root_entry(iommu);
4158         if (ret)
4159                 goto out;
4160
4161         if (dmaru->ignored) {
4162                 /*
4163                  * we always have to disable PMRs or DMA may fail on this device
4164                  */
4165                 if (force_on)
4166                         iommu_disable_protect_mem_regions(iommu);
4167                 return 0;
4168         }
4169
4170         intel_iommu_init_qi(iommu);
4171         iommu_flush_write_buffer(iommu);
4172         ret = dmar_set_interrupt(iommu);
4173         if (ret)
4174                 goto disable_iommu;
4175
4176         iommu_set_root_entry(iommu);
4177         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4178         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4179         iommu_enable_translation(iommu);
4180
4181         iommu_disable_protect_mem_regions(iommu);
4182         return 0;
4183
4184 disable_iommu:
4185         disable_dmar_iommu(iommu);
4186 out:
4187         free_dmar_iommu(iommu);
4188         return ret;
4189 }
4190
4191 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4192 {
4193         int ret = 0;
4194         struct intel_iommu *iommu = dmaru->iommu;
4195
4196         if (!intel_iommu_enabled)
4197                 return 0;
4198         if (iommu == NULL)
4199                 return -EINVAL;
4200
4201         if (insert) {
4202                 ret = intel_iommu_add(dmaru);
4203         } else {
4204                 disable_dmar_iommu(iommu);
4205                 free_dmar_iommu(iommu);
4206         }
4207
4208         return ret;
4209 }
4210
4211 static void intel_iommu_free_dmars(void)
4212 {
4213         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4214         struct dmar_atsr_unit *atsru, *atsr_n;
4215
4216         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4217                 list_del(&rmrru->list);
4218                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4219                 kfree(rmrru);
4220         }
4221
4222         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4223                 list_del(&atsru->list);
4224                 intel_iommu_free_atsr(atsru);
4225         }
4226 }
4227
4228 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4229 {
4230         int i, ret = 1;
4231         struct pci_bus *bus;
4232         struct pci_dev *bridge = NULL;
4233         struct device *tmp;
4234         struct acpi_dmar_atsr *atsr;
4235         struct dmar_atsr_unit *atsru;
4236
4237         dev = pci_physfn(dev);
4238         for (bus = dev->bus; bus; bus = bus->parent) {
4239                 bridge = bus->self;
4240                 if (!bridge || !pci_is_pcie(bridge) ||
4241                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4242                         return 0;
4243                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4244                         break;
4245         }
4246         if (!bridge)
4247                 return 0;
4248
4249         rcu_read_lock();
4250         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4251                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4252                 if (atsr->segment != pci_domain_nr(dev->bus))
4253                         continue;
4254
4255                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4256                         if (tmp == &bridge->dev)
4257                                 goto out;
4258
4259                 if (atsru->include_all)
4260                         goto out;
4261         }
4262         ret = 0;
4263 out:
4264         rcu_read_unlock();
4265
4266         return ret;
4267 }
4268
4269 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4270 {
4271         int ret = 0;
4272         struct dmar_rmrr_unit *rmrru;
4273         struct dmar_atsr_unit *atsru;
4274         struct acpi_dmar_atsr *atsr;
4275         struct acpi_dmar_reserved_memory *rmrr;
4276
4277         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
4278                 return 0;
4279
4280         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4281                 rmrr = container_of(rmrru->hdr,
4282                                     struct acpi_dmar_reserved_memory, header);
4283                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4284                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4285                                 ((void *)rmrr) + rmrr->header.length,
4286                                 rmrr->segment, rmrru->devices,
4287                                 rmrru->devices_cnt);
4288                         if (ret < 0)
4289                                 return ret;
4290                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4291                         dmar_remove_dev_scope(info, rmrr->segment,
4292                                 rmrru->devices, rmrru->devices_cnt);
4293                 }
4294         }
4295
4296         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4297                 if (atsru->include_all)
4298                         continue;
4299
4300                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4301                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4302                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4303                                         (void *)atsr + atsr->header.length,
4304                                         atsr->segment, atsru->devices,
4305                                         atsru->devices_cnt);
4306                         if (ret > 0)
4307                                 break;
4308                         else if (ret < 0)
4309                                 return ret;
4310                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4311                         if (dmar_remove_dev_scope(info, atsr->segment,
4312                                         atsru->devices, atsru->devices_cnt))
4313                                 break;
4314                 }
4315         }
4316
4317         return 0;
4318 }
4319
4320 /*
4321  * Here we only respond to a device being unbound from its driver.
4322  *
4323  * A newly added device is not attached to its DMAR domain here yet; that
4324  * happens when the device is first mapped to an iova.
4325  */
4326 static int device_notifier(struct notifier_block *nb,
4327                                   unsigned long action, void *data)
4328 {
4329         struct device *dev = data;
4330         struct dmar_domain *domain;
4331
4332         if (iommu_dummy(dev))
4333                 return 0;
4334
4335         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4336                 return 0;
4337
4338         domain = find_domain(dev);
4339         if (!domain)
4340                 return 0;
4341
4342         down_read(&dmar_global_lock);
4343         domain_remove_one_dev_info(domain, dev);
4344         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4345                 domain_exit(domain);
4346         up_read(&dmar_global_lock);
4347
4348         return 0;
4349 }
4350
4351 static struct notifier_block device_nb = {
4352         .notifier_call = device_notifier,
4353 };
4354
4355 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4356                                        unsigned long val, void *v)
4357 {
4358         struct memory_notify *mhp = v;
4359         unsigned long long start, end;
4360         unsigned long start_vpfn, last_vpfn;
4361
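             /*
              * Keep the static identity map (si_domain) in sync with memory
              * hotplug: ranges going online are identity-mapped, ranges
              * going offline are unmapped and flushed from every IOMMU.
              */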
4362         switch (val) {
4363         case MEM_GOING_ONLINE:
4364                 start = mhp->start_pfn << PAGE_SHIFT;
4365                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4366                 if (iommu_domain_identity_map(si_domain, start, end)) {
4367                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4368                                 start, end);
4369                         return NOTIFY_BAD;
4370                 }
4371                 break;
4372
4373         case MEM_OFFLINE:
4374         case MEM_CANCEL_ONLINE:
4375                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4376                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4377                 while (start_vpfn <= last_vpfn) {
4378                         struct iova *iova;
4379                         struct dmar_drhd_unit *drhd;
4380                         struct intel_iommu *iommu;
4381                         struct page *freelist;
4382
4383                         iova = find_iova(&si_domain->iovad, start_vpfn);
4384                         if (iova == NULL) {
4385                                 pr_debug("Failed get IOVA for PFN %lx\n",
4386                                          start_vpfn);
4387                                 break;
4388                         }
4389
4390                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4391                                                      start_vpfn, last_vpfn);
4392                         if (iova == NULL) {
4393                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4394                                         start_vpfn, last_vpfn);
4395                                 return NOTIFY_BAD;
4396                         }
4397
4398                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4399                                                iova->pfn_hi);
4400
4401                         rcu_read_lock();
4402                         for_each_active_iommu(iommu, drhd)
4403                                 iommu_flush_iotlb_psi(iommu, si_domain,
4404                                         iova->pfn_lo, iova_size(iova),
4405                                         !freelist, 0);
4406                         rcu_read_unlock();
4407                         dma_free_pagelist(freelist);
4408
4409                         start_vpfn = iova->pfn_hi + 1;
4410                         free_iova_mem(iova);
4411                 }
4412                 break;
4413         }
4414
4415         return NOTIFY_OK;
4416 }
4417
4418 static struct notifier_block intel_iommu_memory_nb = {
4419         .notifier_call = intel_iommu_memory_notifier,
4420         .priority = 0
4421 };
4422
4423
4424 static ssize_t intel_iommu_show_version(struct device *dev,
4425                                         struct device_attribute *attr,
4426                                         char *buf)
4427 {
4428         struct intel_iommu *iommu = dev_get_drvdata(dev);
4429         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4430         return sprintf(buf, "%d:%d\n",
4431                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4432 }
4433 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4434
4435 static ssize_t intel_iommu_show_address(struct device *dev,
4436                                         struct device_attribute *attr,
4437                                         char *buf)
4438 {
4439         struct intel_iommu *iommu = dev_get_drvdata(dev);
4440         return sprintf(buf, "%llx\n", iommu->reg_phys);
4441 }
4442 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4443
4444 static ssize_t intel_iommu_show_cap(struct device *dev,
4445                                     struct device_attribute *attr,
4446                                     char *buf)
4447 {
4448         struct intel_iommu *iommu = dev_get_drvdata(dev);
4449         return sprintf(buf, "%llx\n", iommu->cap);
4450 }
4451 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4452
4453 static ssize_t intel_iommu_show_ecap(struct device *dev,
4454                                     struct device_attribute *attr,
4455                                     char *buf)
4456 {
4457         struct intel_iommu *iommu = dev_get_drvdata(dev);
4458         return sprintf(buf, "%llx\n", iommu->ecap);
4459 }
4460 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4461
4462 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4463                                       struct device_attribute *attr,
4464                                       char *buf)
4465 {
4466         struct intel_iommu *iommu = dev_get_drvdata(dev);
4467         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4468 }
4469 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4470
4471 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4472                                            struct device_attribute *attr,
4473                                            char *buf)
4474 {
4475         struct intel_iommu *iommu = dev_get_drvdata(dev);
4476         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4477                                                   cap_ndoms(iommu->cap)));
4478 }
4479 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4480
4481 static struct attribute *intel_iommu_attrs[] = {
4482         &dev_attr_version.attr,
4483         &dev_attr_address.attr,
4484         &dev_attr_cap.attr,
4485         &dev_attr_ecap.attr,
4486         &dev_attr_domains_supported.attr,
4487         &dev_attr_domains_used.attr,
4488         NULL,
4489 };
4490
4491 static struct attribute_group intel_iommu_group = {
4492         .name = "intel-iommu",
4493         .attrs = intel_iommu_attrs,
4494 };
4495
4496 const struct attribute_group *intel_iommu_groups[] = {
4497         &intel_iommu_group,
4498         NULL,
4499 };
4500
4501 int __init intel_iommu_init(void)
4502 {
4503         int ret = -ENODEV;
4504         struct dmar_drhd_unit *drhd;
4505         struct intel_iommu *iommu;
4506
4507         /* VT-d is required for a TXT/tboot launch, so enforce that */
4508         force_on = tboot_force_iommu();
4509
4510         if (iommu_init_mempool()) {
4511                 if (force_on)
4512                         panic("tboot: Failed to initialize iommu memory\n");
4513                 return -ENOMEM;
4514         }
4515
4516         down_write(&dmar_global_lock);
4517         if (dmar_table_init()) {
4518                 if (force_on)
4519                         panic("tboot: Failed to initialize DMAR table\n");
4520                 goto out_free_dmar;
4521         }
4522
4523         if (dmar_dev_scope_init() < 0) {
4524                 if (force_on)
4525                         panic("tboot: Failed to initialize DMAR device scope\n");
4526                 goto out_free_dmar;
4527         }
4528
4529         if (no_iommu || dmar_disabled)
4530                 goto out_free_dmar;
4531
4532         if (list_empty(&dmar_rmrr_units))
4533                 pr_info("No RMRR found\n");
4534
4535         if (list_empty(&dmar_atsr_units))
4536                 pr_info("No ATSR found\n");
4537
4538         if (dmar_init_reserved_ranges()) {
4539                 if (force_on)
4540                         panic("tboot: Failed to reserve iommu ranges\n");
4541                 goto out_free_reserved_range;
4542         }
4543
4544         init_no_remapping_devices();
4545
4546         ret = init_dmars();
4547         if (ret) {
4548                 if (force_on)
4549                         panic("tboot: Failed to initialize DMARs\n");
4550                 pr_err("Initialization failed\n");
4551                 goto out_free_reserved_range;
4552         }
4553         up_write(&dmar_global_lock);
4554         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4555
4556         init_timer(&unmap_timer);
4557 #ifdef CONFIG_SWIOTLB
4558         swiotlb = 0;
4559 #endif
4560         dma_ops = &intel_dma_ops;
4561
4562         init_iommu_pm_ops();
4563
4564         for_each_active_iommu(iommu, drhd)
4565                 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4566                                                        intel_iommu_groups,
4567                                                        "%s", iommu->name);
4568
4569         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4570         bus_register_notifier(&pci_bus_type, &device_nb);
4571         if (si_domain && !hw_pass_through)
4572                 register_memory_notifier(&intel_iommu_memory_nb);
4573
4574         intel_iommu_enabled = 1;
4575
4576         return 0;
4577
4578 out_free_reserved_range:
4579         put_iova_domain(&reserved_iova_list);
4580 out_free_dmar:
4581         intel_iommu_free_dmars();
4582         up_write(&dmar_global_lock);
4583         iommu_exit_mempool();
4584         return ret;
4585 }
4586
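/* pci_for_each_dma_alias() callback: clear the context entry for each alias. */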
4587 static int iommu_detach_dev_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4588 {
4589         struct intel_iommu *iommu = opaque;
4590
4591         iommu_detach_dev(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4592         return 0;
4593 }
4594
4595 /*
4596  * NB - intel-iommu lacks any sort of reference counting for the users of
4597  * dependent devices.  If multiple endpoints have intersecting dependent
4598  * devices, unbinding the driver from any one of them may leave
4599  * the others unable to operate.
4600  */
4601 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
4602                                            struct device *dev)
4603 {
4604         if (!iommu || !dev || !dev_is_pci(dev))
4605                 return;
4606
4607         pci_for_each_dma_alias(to_pci_dev(dev), &iommu_detach_dev_cb, iommu);
4608 }
4609
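/*
 * Unbind @dev from @domain: tear down its context entry and dev-IOTLB and,
 * if it was the last device on its iommu in this domain, drop the domain's
 * reference on that iommu.
 */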
4610 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4611                                        struct device *dev)
4612 {
4613         struct device_domain_info *info, *tmp;
4614         struct intel_iommu *iommu;
4615         unsigned long flags;
4616         bool found = false;
4617         u8 bus, devfn;
4618
4619         iommu = device_to_iommu(dev, &bus, &devfn);
4620         if (!iommu)
4621                 return;
4622
4623         spin_lock_irqsave(&device_domain_lock, flags);
4624         list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4625                 if (info->iommu == iommu && info->bus == bus &&
4626                     info->devfn == devfn) {
4627                         unlink_domain_info(info);
4628                         spin_unlock_irqrestore(&device_domain_lock, flags);
4629
4630                         iommu_disable_dev_iotlb(info);
4631                         iommu_detach_dev(iommu, info->bus, info->devfn);
4632                         iommu_detach_dependent_devices(iommu, dev);
4633                         free_devinfo_mem(info);
4634
4635                         spin_lock_irqsave(&device_domain_lock, flags);
4636
4637                         if (found)
4638                                 break;
4639                         else
4640                                 continue;
4641                 }
4642
4643                 /*
4644                  * If no other device under the same iommu is owned by this
4645                  * domain, drop the domain's reference on this iommu
4646                  * (iommu_refcnt) and update the iommu count and coherency.
4647                  */
4648                 if (info->iommu == iommu)
4649                         found = true;
4650         }
4651
4652         spin_unlock_irqrestore(&device_domain_lock, flags);
4653
4654         if (!found) {
4655                 domain_detach_iommu(domain, iommu);
4656                 if (!domain_type_is_vm_or_si(domain))
4657                         iommu_detach_domain(domain, iommu);
4658         }
4659 }
4660
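/*
 * Initialize the IOVA allocator, address widths and top-level page table
 * for a domain allocated through the IOMMU API.
 */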
4661 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4662 {
4663         int adjust_width;
4664
4665         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4666                         DMA_32BIT_PFN);
4667         domain_reserve_special_ranges(domain);
4668
4669         /* calculate AGAW */
4670         domain->gaw = guest_width;
4671         adjust_width = guestwidth_to_adjustwidth(guest_width);
4672         domain->agaw = width_to_agaw(adjust_width);
4673
4674         domain->iommu_coherency = 0;
4675         domain->iommu_snooping = 0;
4676         domain->iommu_superpage = 0;
4677         domain->max_addr = 0;
4678
4679         /* always allocate the top pgd */
4680         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4681         if (!domain->pgd)
4682                 return -ENOMEM;
4683         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4684         return 0;
4685 }
4686
4687 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4688 {
4689         struct dmar_domain *dmar_domain;
4690         struct iommu_domain *domain;
4691
4692         if (type != IOMMU_DOMAIN_UNMANAGED)
4693                 return NULL;
4694
4695         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4696         if (!dmar_domain) {
4697                 pr_err("Can't allocate dmar_domain\n");
4698                 return NULL;
4699         }
4700         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4701                 pr_err("Domain initialization failed\n");
4702                 domain_exit(dmar_domain);
4703                 return NULL;
4704         }
4705         domain_update_iommu_cap(dmar_domain);
4706
4707         domain = &dmar_domain->domain;
4708         domain->geometry.aperture_start = 0;
4709         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4710         domain->geometry.force_aperture = true;
4711
4712         return domain;
4713 }
4714
4715 static void intel_iommu_domain_free(struct iommu_domain *domain)
4716 {
4717         domain_exit(to_dmar_domain(domain));
4718 }
4719
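/*
 * IOMMU API attach: detach @dev from any domain it is already mapped in,
 * check that the iommu address width covers the domain's mapped range
 * (trimming extra page-table levels if needed), then add the device.
 */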
4720 static int intel_iommu_attach_device(struct iommu_domain *domain,
4721                                      struct device *dev)
4722 {
4723         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4724         struct intel_iommu *iommu;
4725         int addr_width;
4726         u8 bus, devfn;
4727
4728         if (device_is_rmrr_locked(dev)) {
4729                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4730                 return -EPERM;
4731         }
4732
4733         /* normally dev is not mapped */
4734         if (unlikely(domain_context_mapped(dev))) {
4735                 struct dmar_domain *old_domain;
4736
4737                 old_domain = find_domain(dev);
4738                 if (old_domain) {
4739                         if (domain_type_is_vm_or_si(dmar_domain))
4740                                 domain_remove_one_dev_info(old_domain, dev);
4741                         else
4742                                 domain_remove_dev_info(old_domain);
4743
4744                         if (!domain_type_is_vm_or_si(old_domain) &&
4745                              list_empty(&old_domain->devices))
4746                                 domain_exit(old_domain);
4747                 }
4748         }
4749
4750         iommu = device_to_iommu(dev, &bus, &devfn);
4751         if (!iommu)
4752                 return -ENODEV;
4753
4754         /* check if this iommu agaw is sufficient for max mapped address */
4755         addr_width = agaw_to_width(iommu->agaw);
4756         if (addr_width > cap_mgaw(iommu->cap))
4757                 addr_width = cap_mgaw(iommu->cap);
4758
4759         if (dmar_domain->max_addr > (1LL << addr_width)) {
4760                 pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4761                        __func__, addr_width,
4762                        dmar_domain->max_addr);
4763                 return -EFAULT;
4764         }
4765         dmar_domain->gaw = addr_width;
4766
4767         /*
4768          * Knock out extra levels of page tables if necessary
4769          */
4770         while (iommu->agaw < dmar_domain->agaw) {
4771                 struct dma_pte *pte;
4772
4773                 pte = dmar_domain->pgd;
4774                 if (dma_pte_present(pte)) {
4775                         dmar_domain->pgd = (struct dma_pte *)
4776                                 phys_to_virt(dma_pte_addr(pte));
4777                         free_pgtable_page(pte);
4778                 }
4779                 dmar_domain->agaw--;
4780         }
4781
4782         return domain_add_dev_info(dmar_domain, dev);
4783 }
4784
4785 static void intel_iommu_detach_device(struct iommu_domain *domain,
4786                                       struct device *dev)
4787 {
4788         domain_remove_one_dev_info(to_dmar_domain(domain), dev);
4789 }
4790
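/*
 * IOMMU API map: translate IOMMU_READ/WRITE/CACHE into DMA PTE bits, grow
 * the domain's max_addr if necessary and install the page-table entries.
 */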
4791 static int intel_iommu_map(struct iommu_domain *domain,
4792                            unsigned long iova, phys_addr_t hpa,
4793                            size_t size, int iommu_prot)
4794 {
4795         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4796         u64 max_addr;
4797         int prot = 0;
4798         int ret;
4799
4800         if (iommu_prot & IOMMU_READ)
4801                 prot |= DMA_PTE_READ;
4802         if (iommu_prot & IOMMU_WRITE)
4803                 prot |= DMA_PTE_WRITE;
4804         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4805                 prot |= DMA_PTE_SNP;
4806
4807         max_addr = iova + size;
4808         if (dmar_domain->max_addr < max_addr) {
4809                 u64 end;
4810
4811                 /* check if minimum agaw is sufficient for mapped address */
4812                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4813                 if (end < max_addr) {
4814                         pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4815                                __func__, dmar_domain->gaw,
4816                                max_addr);
4817                         return -EFAULT;
4818                 }
4819                 dmar_domain->max_addr = max_addr;
4820         }
4821         /* Round up size to next multiple of PAGE_SIZE, if it and
4822            the low bits of hpa would take us onto the next page */
4823         size = aligned_nrpages(hpa, size);
4824         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4825                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4826         return ret;
4827 }
4828
4829 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4830                                 unsigned long iova, size_t size)
4831 {
4832         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4833         struct page *freelist = NULL;
4834         struct intel_iommu *iommu;
4835         unsigned long start_pfn, last_pfn;
4836         unsigned int npages;
4837         int iommu_id, num, ndomains, level = 0;
4838
4839         /* Cope with horrid API which requires us to unmap more than the
4840            size argument if it happens to be a large-page mapping. */
4841         if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4842                 BUG();
4843
4844         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4845                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4846
4847         start_pfn = iova >> VTD_PAGE_SHIFT;
4848         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4849
4850         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4851
4852         npages = last_pfn - start_pfn + 1;
4853
4854         for_each_domain_iommu(iommu_id, dmar_domain) {
4855                 iommu = g_iommus[iommu_id];
4856
4857                 /*
4858                  * Find the domain id(s) dmar_domain holds on this iommu
4859                  */
4860                 ndomains = cap_ndoms(iommu->cap);
4861                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4862                         if (get_iommu_domain(iommu, num) == dmar_domain)
4863                                 iommu_flush_iotlb_psi(iommu, dmar_domain,
4864                                                       start_pfn, npages,
4865                                                       !freelist, 0);
4866                 }
4867
4868         }
4869
4870         dma_free_pagelist(freelist);
4871
4872         if (dmar_domain->max_addr == iova + size)
4873                 dmar_domain->max_addr = iova;
4874
4875         return size;
4876 }
4877
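/* IOMMU API: return the physical address recorded in the PTE for @iova. */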
4878 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4879                                             dma_addr_t iova)
4880 {
4881         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4882         struct dma_pte *pte;
4883         int level = 0;
4884         u64 phys = 0;
4885
4886         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4887         if (pte)
4888                 phys = dma_pte_addr(pte);
4889
4890         return phys;
4891 }
4892
4893 static bool intel_iommu_capable(enum iommu_cap cap)
4894 {
4895         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4896                 return domain_update_iommu_snooping(NULL) == 1;
4897         if (cap == IOMMU_CAP_INTR_REMAP)
4898                 return irq_remapping_enabled == 1;
4899
4900         return false;
4901 }
4902
4903 static int intel_iommu_add_device(struct device *dev)
4904 {
4905         struct intel_iommu *iommu;
4906         struct iommu_group *group;
4907         u8 bus, devfn;
4908
4909         iommu = device_to_iommu(dev, &bus, &devfn);
4910         if (!iommu)
4911                 return -ENODEV;
4912
4913         iommu_device_link(iommu->iommu_dev, dev);
4914
4915         group = iommu_group_get_for_dev(dev);
4916
4917         if (IS_ERR(group))
4918                 return PTR_ERR(group);
4919
4920         iommu_group_put(group);
4921         return 0;
4922 }
4923
4924 static void intel_iommu_remove_device(struct device *dev)
4925 {
4926         struct intel_iommu *iommu;
4927         u8 bus, devfn;
4928
4929         iommu = device_to_iommu(dev, &bus, &devfn);
4930         if (!iommu)
4931                 return;
4932
4933         iommu_group_remove_device(dev);
4934
4935         iommu_device_unlink(iommu->iommu_dev, dev);
4936 }
4937
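/* IOMMU API callbacks, registered via bus_set_iommu() in intel_iommu_init(). */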
4938 static const struct iommu_ops intel_iommu_ops = {
4939         .capable        = intel_iommu_capable,
4940         .domain_alloc   = intel_iommu_domain_alloc,
4941         .domain_free    = intel_iommu_domain_free,
4942         .attach_dev     = intel_iommu_attach_device,
4943         .detach_dev     = intel_iommu_detach_device,
4944         .map            = intel_iommu_map,
4945         .unmap          = intel_iommu_unmap,
4946         .map_sg         = default_iommu_map_sg,
4947         .iova_to_phys   = intel_iommu_iova_to_phys,
4948         .add_device     = intel_iommu_add_device,
4949         .remove_device  = intel_iommu_remove_device,
4950         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4951 };
4952
4953 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4954 {
4955         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4956         pr_info("Disabling IOMMU for graphics on this chipset\n");
4957         dmar_map_gfx = 0;
4958 }
4959
4960 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4961 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4962 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4963 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4964 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4965 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4966 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4967
4968 static void quirk_iommu_rwbf(struct pci_dev *dev)
4969 {
4970         /*
4971          * Mobile 4 Series Chipset neglects to set RWBF capability,
4972          * but needs it. Same seems to hold for the desktop versions.
4973          */
4974         pr_info("Forcing write-buffer flush capability\n");
4975         rwbf_quirk = 1;
4976 }
4977
4978 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4979 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4980 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4981 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4982 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4983 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4984 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4985
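/*
 * GMCH Graphics Control register used by the Calpella/Ironlake quirk below;
 * the field at bits 8-11 reports the GTT size and whether the BIOS set any
 * of it aside for VT (shadow GTT).
 */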
4986 #define GGC 0x52
4987 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4988 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4989 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4990 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4991 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4992 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4993 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4994 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4995
4996 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4997 {
4998         unsigned short ggc;
4999
5000         if (pci_read_config_word(dev, GGC, &ggc))
5001                 return;
5002
5003         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5004                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5005                 dmar_map_gfx = 0;
5006         } else if (dmar_map_gfx) {
5007                 /* we have to ensure the gfx device is idle before we flush */
5008                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5009                 intel_iommu_strict = 1;
5010         }
5011 }
5012 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5013 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5014 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5015 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5016
5017 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5018    ISOCH DMAR unit for the Azalia sound device, but not give it any
5019    TLB entries, which causes it to deadlock. Check for that.  We do
5020    this in a function called from init_dmars(), instead of in a PCI
5021    quirk, because we don't want to print the obnoxious "BIOS broken"
5022    message if VT-d is actually disabled.
5023 */
5024 static void __init check_tylersburg_isoch(void)
5025 {
5026         struct pci_dev *pdev;
5027         uint32_t vtisochctrl;
5028
5029         /* If there's no Azalia in the system anyway, forget it. */
5030         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5031         if (!pdev)
5032                 return;
5033         pci_dev_put(pdev);
5034
5035         /* System Management Registers. Might be hidden, in which case
5036            we can't do the sanity check. But that's OK, because the
5037            known-broken BIOSes _don't_ actually hide it, so far. */
5038         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5039         if (!pdev)
5040                 return;
5041
5042         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5043                 pci_dev_put(pdev);
5044                 return;
5045         }
5046
5047         pci_dev_put(pdev);
5048
5049         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5050         if (vtisochctrl & 1)
5051                 return;
5052
5053         /* Drop all bits other than the number of TLB entries */
5054         vtisochctrl &= 0x1c;
5055
5056         /* If we have the recommended number of TLB entries (16), fine. */
5057         if (vtisochctrl == 0x10)
5058                 return;
5059
5060         /* Zero TLB entries? You get to ride the short bus to school. */
5061         if (!vtisochctrl) {
5062                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5063                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5064                      dmi_get_system_info(DMI_BIOS_VENDOR),
5065                      dmi_get_system_info(DMI_BIOS_VERSION),
5066                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5067                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5068                 return;
5069         }
5070
5071         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5072                vtisochctrl);
5073 }