iommu/vt-d: Don't pre-allocate domain ids for si_domain
[firefly-linux-kernel-4.4.55.git] drivers/iommu/intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <linux/dma-contiguous.h>
46 #include <linux/crash_dump.h>
47 #include <asm/irq_remapping.h>
48 #include <asm/cacheflush.h>
49 #include <asm/iommu.h>
50
51 #include "irq_remapping.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
84 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
85
86 /* page table handling */
87 #define LEVEL_STRIDE            (9)
88 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
89
90 /*
91  * This bitmap is used to advertise the page sizes our hardware supports
92  * to the IOMMU core, which will then use this information to split
93  * physically contiguous memory regions it is mapping into page sizes
94  * that we support.
95  *
96  * Traditionally the IOMMU core just handed us the mappings directly,
97  * after making sure the size is an order of a 4KiB page and that the
98  * mapping has natural alignment.
99  *
100  * To retain this behavior, we currently advertise that we support
101  * all page sizes that are an order of 4KiB.
102  *
103  * If at some point we'd like to utilize the IOMMU core's new behavior,
104  * we could change this to advertise the real page sizes we support.
105  */
106 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
107
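/*
 * AGAW (adjusted guest address width) helpers: agaw 0 corresponds to a
 * two-level, 30-bit page table; each increment adds one page-table level
 * and LEVEL_STRIDE (9) bits of address width, capped at MAX_AGAW_WIDTH.
 */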
108 static inline int agaw_to_level(int agaw)
109 {
110         return agaw + 2;
111 }
112
113 static inline int agaw_to_width(int agaw)
114 {
115         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 }
117
118 static inline int width_to_agaw(int width)
119 {
120         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 }
122
123 static inline unsigned int level_to_offset_bits(int level)
124 {
125         return (level - 1) * LEVEL_STRIDE;
126 }
127
128 static inline int pfn_level_offset(unsigned long pfn, int level)
129 {
130         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 }
132
133 static inline unsigned long level_mask(int level)
134 {
135         return -1UL << level_to_offset_bits(level);
136 }
137
138 static inline unsigned long level_size(int level)
139 {
140         return 1UL << level_to_offset_bits(level);
141 }
142
143 static inline unsigned long align_to_level(unsigned long pfn, int level)
144 {
145         return (pfn + level_size(level) - 1) & level_mask(level);
146 }
147
148 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
149 {
150         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
151 }
152
153 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
154    are never going to work. */
155 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
156 {
157         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 }
159
160 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
161 {
162         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
163 }
164 static inline unsigned long page_to_dma_pfn(struct page *pg)
165 {
166         return mm_to_dma_pfn(page_to_pfn(pg));
167 }
168 static inline unsigned long virt_to_dma_pfn(void *p)
169 {
170         return page_to_dma_pfn(virt_to_page(p));
171 }
172
173 /* global iommu list, set NULL for ignored DMAR units */
174 static struct intel_iommu **g_iommus;
175
176 static void __init check_tylersburg_isoch(void);
177 static int rwbf_quirk;
178
179 /*
180  * set to 1 to panic kernel if can't successfully enable VT-d
181  * (used when kernel is launched w/ TXT)
182  */
183 static int force_on = 0;
184
185 /*
186  * 0: Present
187  * 1-11: Reserved
188  * 12-63: Context Ptr (12 - (haw-1))
189  * 64-127: Reserved
190  */
191 struct root_entry {
192         u64     lo;
193         u64     hi;
194 };
195 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
196
197 /*
198  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
199  * if marked present.
200  */
201 static phys_addr_t root_entry_lctp(struct root_entry *re)
202 {
203         if (!(re->lo & 1))
204                 return 0;
205
206         return re->lo & VTD_PAGE_MASK;
207 }
208
209 /*
210  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
211  * if marked present.
212  */
213 static phys_addr_t root_entry_uctp(struct root_entry *re)
214 {
215         if (!(re->hi & 1))
216                 return 0;
217
218         return re->hi & VTD_PAGE_MASK;
219 }
220 /*
221  * low 64 bits:
222  * 0: present
223  * 1: fault processing disable
224  * 2-3: translation type
225  * 12-63: address space root
226  * high 64 bits:
227  * 0-2: address width
228  * 3-6: aval
229  * 8-23: domain id
230  */
231 struct context_entry {
232         u64 lo;
233         u64 hi;
234 };
235
236 static inline void context_clear_pasid_enable(struct context_entry *context)
237 {
238         context->lo &= ~(1ULL << 11);
239 }
240
241 static inline bool context_pasid_enabled(struct context_entry *context)
242 {
243         return !!(context->lo & (1ULL << 11));
244 }
245
246 static inline void context_set_copied(struct context_entry *context)
247 {
248         context->hi |= (1ull << 3);
249 }
250
251 static inline bool context_copied(struct context_entry *context)
252 {
253         return !!(context->hi & (1ULL << 3));
254 }
255
256 static inline bool __context_present(struct context_entry *context)
257 {
258         return (context->lo & 1);
259 }
260
261 static inline bool context_present(struct context_entry *context)
262 {
263         return context_pasid_enabled(context) ?
264              __context_present(context) :
265              __context_present(context) && !context_copied(context);
266 }
267
268 static inline void context_set_present(struct context_entry *context)
269 {
270         context->lo |= 1;
271 }
272
273 static inline void context_set_fault_enable(struct context_entry *context)
274 {
275         context->lo &= (((u64)-1) << 2) | 1;
276 }
277
278 static inline void context_set_translation_type(struct context_entry *context,
279                                                 unsigned long value)
280 {
281         context->lo &= (((u64)-1) << 4) | 3;
282         context->lo |= (value & 3) << 2;
283 }
284
285 static inline void context_set_address_root(struct context_entry *context,
286                                             unsigned long value)
287 {
288         context->lo &= ~VTD_PAGE_MASK;
289         context->lo |= value & VTD_PAGE_MASK;
290 }
291
292 static inline void context_set_address_width(struct context_entry *context,
293                                              unsigned long value)
294 {
295         context->hi |= value & 7;
296 }
297
298 static inline void context_set_domain_id(struct context_entry *context,
299                                          unsigned long value)
300 {
301         context->hi |= (value & ((1 << 16) - 1)) << 8;
302 }
303
304 static inline int context_domain_id(struct context_entry *c)
305 {
306         return((c->hi >> 8) & 0xffff);
307 }
308
309 static inline void context_clear_entry(struct context_entry *context)
310 {
311         context->lo = 0;
312         context->hi = 0;
313 }
314
315 /*
316  * 0: readable
317  * 1: writable
318  * 2-6: reserved
319  * 7: super page
320  * 8-10: available
321  * 11: snoop behavior
322  * 12-63: Host physical address
323  */
324 struct dma_pte {
325         u64 val;
326 };
327
328 static inline void dma_clear_pte(struct dma_pte *pte)
329 {
330         pte->val = 0;
331 }
332
333 static inline u64 dma_pte_addr(struct dma_pte *pte)
334 {
335 #ifdef CONFIG_64BIT
336         return pte->val & VTD_PAGE_MASK;
337 #else
338         /* Must have a full atomic 64-bit read */
339         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
340 #endif
341 }
342
343 static inline bool dma_pte_present(struct dma_pte *pte)
344 {
345         return (pte->val & 3) != 0;
346 }
347
348 static inline bool dma_pte_superpage(struct dma_pte *pte)
349 {
350         return (pte->val & DMA_PTE_LARGE_PAGE);
351 }
352
353 static inline int first_pte_in_page(struct dma_pte *pte)
354 {
355         return !((unsigned long)pte & ~VTD_PAGE_MASK);
356 }
357
358 /*
359  * This domain is a static identity mapping domain.
360  *      1. This domain creates a static 1:1 mapping to all usable memory.
361  *      2. It maps to each iommu if successful.
362  *      3. Each iommu maps to this domain if successful.
363  */
364 static struct dmar_domain *si_domain;
365 static int hw_pass_through = 1;
366
367 /*
368  * Domain represents a virtual machine; more than one device
369  * across iommus may be owned by one domain, e.g. a kvm guest.
370  */
371 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
372
373 /* si_domain contains multiple devices */
374 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
375
376 struct dmar_domain {
377         int     id;                     /* domain id */
378         int     nid;                    /* node id */
379         DECLARE_BITMAP(iommu_bmp, DMAR_UNITS_SUPPORTED);
380                                         /* bitmap of iommus this domain uses*/
381
382         u16             iommu_did[DMAR_UNITS_SUPPORTED];
383                                         /* Domain ids per IOMMU. Use u16 since
384                                          * domain ids are 16 bit wide according
385                                          * to VT-d spec, section 9.3 */
386
387         struct list_head devices;       /* all devices' list */
388         struct iova_domain iovad;       /* iova's that belong to this domain */
389
390         struct dma_pte  *pgd;           /* virtual address */
391         int             gaw;            /* max guest address width */
392
393         /* adjusted guest address width, 0 is level 2 30-bit */
394         int             agaw;
395
396         int             flags;          /* flags to find out type of domain */
397
398         int             iommu_coherency;/* indicate coherency of iommu access */
399         int             iommu_snooping; /* indicate snooping control feature*/
400         int             iommu_count;    /* reference count of iommu */
401         int             iommu_superpage;/* Level of superpages supported:
402                                            0 == 4KiB (no superpages), 1 == 2MiB,
403                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
404         spinlock_t      iommu_lock;     /* protect iommu set in domain */
405         u64             max_addr;       /* maximum mapped address */
406
407         struct iommu_domain domain;     /* generic domain data structure for
408                                            iommu core */
409 };
410
411 /* PCI domain-device relationship */
412 struct device_domain_info {
413         struct list_head link;  /* link to domain siblings */
414         struct list_head global; /* link to global list */
415         u8 bus;                 /* PCI bus number */
416         u8 devfn;               /* PCI devfn number */
417         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
418         struct intel_iommu *iommu; /* IOMMU used by this device */
419         struct dmar_domain *domain; /* pointer to domain */
420 };
421
422 struct dmar_rmrr_unit {
423         struct list_head list;          /* list of rmrr units   */
424         struct acpi_dmar_header *hdr;   /* ACPI header          */
425         u64     base_address;           /* reserved base address*/
426         u64     end_address;            /* reserved end address */
427         struct dmar_dev_scope *devices; /* target devices */
428         int     devices_cnt;            /* target device count */
429 };
430
431 struct dmar_atsr_unit {
432         struct list_head list;          /* list of ATSR units */
433         struct acpi_dmar_header *hdr;   /* ACPI header */
434         struct dmar_dev_scope *devices; /* target devices */
435         int devices_cnt;                /* target device count */
436         u8 include_all:1;               /* include all ports */
437 };
438
439 static LIST_HEAD(dmar_atsr_units);
440 static LIST_HEAD(dmar_rmrr_units);
441
442 #define for_each_rmrr_units(rmrr) \
443         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
444
445 static void flush_unmaps_timeout(unsigned long data);
446
447 static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
448
449 #define HIGH_WATER_MARK 250
450 struct deferred_flush_tables {
451         int next;
452         struct iova *iova[HIGH_WATER_MARK];
453         struct dmar_domain *domain[HIGH_WATER_MARK];
454         struct page *freelist[HIGH_WATER_MARK];
455 };
456
457 static struct deferred_flush_tables *deferred_flush;
458
459 /* bitmap for indexing intel_iommus */
460 static int g_num_of_iommus;
461
462 static DEFINE_SPINLOCK(async_umap_flush_lock);
463 static LIST_HEAD(unmaps_to_do);
464
465 static int timer_on;
466 static long list_size;
467
468 static void domain_exit(struct dmar_domain *domain);
469 static void domain_remove_dev_info(struct dmar_domain *domain);
470 static void domain_remove_one_dev_info(struct dmar_domain *domain,
471                                        struct device *dev);
472 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
473                                            struct device *dev);
474 static int domain_detach_iommu(struct dmar_domain *domain,
475                                struct intel_iommu *iommu);
476
477 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
478 int dmar_disabled = 0;
479 #else
480 int dmar_disabled = 1;
481 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
482
483 int intel_iommu_enabled = 0;
484 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
485
486 static int dmar_map_gfx = 1;
487 static int dmar_forcedac;
488 static int intel_iommu_strict;
489 static int intel_iommu_superpage = 1;
490 static int intel_iommu_ecs = 1;
491
492 /* We only actually use ECS when PASID support (on the new bit 40)
493  * is also advertised. Some early implementations — the ones with
494  * PASID support on bit 28 — have issues even when we *only* use
495  * extended root/context tables. */
496 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
497                             ecap_pasid(iommu->ecap))
498
499 int intel_iommu_gfx_mapped;
500 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
501
502 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
503 static DEFINE_SPINLOCK(device_domain_lock);
504 static LIST_HEAD(device_domain_list);
505
506 static const struct iommu_ops intel_iommu_ops;
507
508 static bool translation_pre_enabled(struct intel_iommu *iommu)
509 {
510         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
511 }
512
513 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
514 {
515         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
516 }
517
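/*
 * Read the global status register to find out whether translation was
 * already enabled (e.g. by firmware or a previous kernel) and record
 * that in the iommu flags.
 */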
518 static void init_translation_status(struct intel_iommu *iommu)
519 {
520         u32 gsts;
521
522         gsts = readl(iommu->reg + DMAR_GSTS_REG);
523         if (gsts & DMA_GSTS_TES)
524                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
525 }
526
527 /* Convert generic 'struct iommu_domain' to private struct dmar_domain */
528 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
529 {
530         return container_of(dom, struct dmar_domain, domain);
531 }
532
533 static int __init intel_iommu_setup(char *str)
534 {
535         if (!str)
536                 return -EINVAL;
537         while (*str) {
538                 if (!strncmp(str, "on", 2)) {
539                         dmar_disabled = 0;
540                         pr_info("IOMMU enabled\n");
541                 } else if (!strncmp(str, "off", 3)) {
542                         dmar_disabled = 1;
543                         pr_info("IOMMU disabled\n");
544                 } else if (!strncmp(str, "igfx_off", 8)) {
545                         dmar_map_gfx = 0;
546                         pr_info("Disable GFX device mapping\n");
547                 } else if (!strncmp(str, "forcedac", 8)) {
548                         pr_info("Forcing DAC for PCI devices\n");
549                         dmar_forcedac = 1;
550                 } else if (!strncmp(str, "strict", 6)) {
551                         pr_info("Disable batched IOTLB flush\n");
552                         intel_iommu_strict = 1;
553                 } else if (!strncmp(str, "sp_off", 6)) {
554                         pr_info("Disable supported super page\n");
555                         intel_iommu_superpage = 0;
556                 } else if (!strncmp(str, "ecs_off", 7)) {
557                         printk(KERN_INFO
558                                 "Intel-IOMMU: disable extended context table support\n");
559                         intel_iommu_ecs = 0;
560                 }
561
562                 str += strcspn(str, ",");
563                 while (*str == ',')
564                         str++;
565         }
566         return 0;
567 }
568 __setup("intel_iommu=", intel_iommu_setup);
569
570 static struct kmem_cache *iommu_domain_cache;
571 static struct kmem_cache *iommu_devinfo_cache;
572
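/*
 * Look up the dmar_domain currently associated with domain-id @did on
 * @iommu.  Domain pointers are kept in a two-level table indexed by the
 * upper and lower 8 bits of the id.
 */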
573 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
574 {
575         struct dmar_domain **domains;
576         int idx = did >> 8;
577
578         domains = iommu->domains[idx];
579         if (!domains)
580                 return NULL;
581
582         return domains[did & 0xff];
583 }
584
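/*
 * Associate @domain with domain-id @did on @iommu, allocating the
 * second-level pointer array on demand.
 */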
585 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
586                              struct dmar_domain *domain)
587 {
588         struct dmar_domain **domains;
589         int idx = did >> 8;
590
591         if (!iommu->domains[idx]) {
592                 size_t size = 256 * sizeof(struct dmar_domain *);
593                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
594         }
595
596         domains = iommu->domains[idx];
597         if (WARN_ON(!domains))
598                 return;
599         else
600                 domains[did & 0xff] = domain;
601 }
602
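/*
 * Allocate one zeroed, page-sized page-table page on @node.  GFP_ATOMIC
 * is used since callers may be in atomic context.
 */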
603 static inline void *alloc_pgtable_page(int node)
604 {
605         struct page *page;
606         void *vaddr = NULL;
607
608         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
609         if (page)
610                 vaddr = page_address(page);
611         return vaddr;
612 }
613
614 static inline void free_pgtable_page(void *vaddr)
615 {
616         free_page((unsigned long)vaddr);
617 }
618
619 static inline void *alloc_domain_mem(void)
620 {
621         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
622 }
623
624 static void free_domain_mem(void *vaddr)
625 {
626         kmem_cache_free(iommu_domain_cache, vaddr);
627 }
628
629 static inline void * alloc_devinfo_mem(void)
630 {
631         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
632 }
633
634 static inline void free_devinfo_mem(void *vaddr)
635 {
636         kmem_cache_free(iommu_devinfo_cache, vaddr);
637 }
638
639 static inline int domain_type_is_vm(struct dmar_domain *domain)
640 {
641         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
642 }
643
644 static inline int domain_type_is_si(struct dmar_domain *domain)
645 {
646         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
647 }
648
649 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
650 {
651         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
652                                 DOMAIN_FLAG_STATIC_IDENTITY);
653 }
654
655 static inline int domain_pfn_supported(struct dmar_domain *domain,
656                                        unsigned long pfn)
657 {
658         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
659
660         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
661 }
662
663 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
664 {
665         unsigned long sagaw;
666         int agaw = -1;
667
668         sagaw = cap_sagaw(iommu->cap);
669         for (agaw = width_to_agaw(max_gaw);
670              agaw >= 0; agaw--) {
671                 if (test_bit(agaw, &sagaw))
672                         break;
673         }
674
675         return agaw;
676 }
677
678 /*
679  * Calculate max SAGAW for each iommu.
680  */
681 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
682 {
683         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
684 }
685
686 /*
687  * Calculate agaw for each iommu.
688  * "SAGAW" may be different across iommus, so use a default agaw and
689  * fall back to a smaller supported agaw for iommus that lack the default.
690  */
691 int iommu_calculate_agaw(struct intel_iommu *iommu)
692 {
693         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
694 }
695
696 /* This function only returns a single iommu in a domain */
697 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
698 {
699         int iommu_id;
700
701         /* si_domain and vm domain should not get here. */
702         BUG_ON(domain_type_is_vm_or_si(domain));
703         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
704         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
705                 return NULL;
706
707         return g_iommus[iommu_id];
708 }
709
710 static void domain_update_iommu_coherency(struct dmar_domain *domain)
711 {
712         struct dmar_drhd_unit *drhd;
713         struct intel_iommu *iommu;
714         bool found = false;
715         int i;
716
717         domain->iommu_coherency = 1;
718
719         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
720                 found = true;
721                 if (!ecap_coherent(g_iommus[i]->ecap)) {
722                         domain->iommu_coherency = 0;
723                         break;
724                 }
725         }
726         if (found)
727                 return;
728
729         /* No hardware attached; use lowest common denominator */
730         rcu_read_lock();
731         for_each_active_iommu(iommu, drhd) {
732                 if (!ecap_coherent(iommu->ecap)) {
733                         domain->iommu_coherency = 0;
734                         break;
735                 }
736         }
737         rcu_read_unlock();
738 }
739
740 static int domain_update_iommu_snooping(struct intel_iommu *skip)
741 {
742         struct dmar_drhd_unit *drhd;
743         struct intel_iommu *iommu;
744         int ret = 1;
745
746         rcu_read_lock();
747         for_each_active_iommu(iommu, drhd) {
748                 if (iommu != skip) {
749                         if (!ecap_sc_support(iommu->ecap)) {
750                                 ret = 0;
751                                 break;
752                         }
753                 }
754         }
755         rcu_read_unlock();
756
757         return ret;
758 }
759
760 static int domain_update_iommu_superpage(struct intel_iommu *skip)
761 {
762         struct dmar_drhd_unit *drhd;
763         struct intel_iommu *iommu;
764         int mask = 0xf;
765
766         if (!intel_iommu_superpage) {
767                 return 0;
768         }
769
770         /* set iommu_superpage to the smallest common denominator */
771         rcu_read_lock();
772         for_each_active_iommu(iommu, drhd) {
773                 if (iommu != skip) {
774                         mask &= cap_super_page_val(iommu->cap);
775                         if (!mask)
776                                 break;
777                 }
778         }
779         rcu_read_unlock();
780
781         return fls(mask);
782 }
783
784 /* Some capabilities may be different across iommus */
785 static void domain_update_iommu_cap(struct dmar_domain *domain)
786 {
787         domain_update_iommu_coherency(domain);
788         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
789         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
790 }
791
792 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
793                                                        u8 bus, u8 devfn, int alloc)
794 {
795         struct root_entry *root = &iommu->root_entry[bus];
796         struct context_entry *context;
797         u64 *entry;
798
799         entry = &root->lo;
800         if (ecs_enabled(iommu)) {
801                 if (devfn >= 0x80) {
802                         devfn -= 0x80;
803                         entry = &root->hi;
804                 }
805                 devfn *= 2;
806         }
807         if (*entry & 1)
808                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
809         else {
810                 unsigned long phy_addr;
811                 if (!alloc)
812                         return NULL;
813
814                 context = alloc_pgtable_page(iommu->node);
815                 if (!context)
816                         return NULL;
817
818                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
819                 phy_addr = virt_to_phys((void *)context);
820                 *entry = phy_addr | 1;
821                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
822         }
823         return &context[devfn];
824 }
825
826 static int iommu_dummy(struct device *dev)
827 {
828         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
829 }
830
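/*
 * Find the DRHD unit (IOMMU) whose scope covers @dev and report the
 * bus/devfn under which the device is addressed by that unit.  Returns
 * NULL for devices the IOMMU code has been told to ignore.
 */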
831 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
832 {
833         struct dmar_drhd_unit *drhd = NULL;
834         struct intel_iommu *iommu;
835         struct device *tmp;
836         struct pci_dev *ptmp, *pdev = NULL;
837         u16 segment = 0;
838         int i;
839
840         if (iommu_dummy(dev))
841                 return NULL;
842
843         if (dev_is_pci(dev)) {
844                 pdev = to_pci_dev(dev);
845                 segment = pci_domain_nr(pdev->bus);
846         } else if (has_acpi_companion(dev))
847                 dev = &ACPI_COMPANION(dev)->dev;
848
849         rcu_read_lock();
850         for_each_active_iommu(iommu, drhd) {
851                 if (pdev && segment != drhd->segment)
852                         continue;
853
854                 for_each_active_dev_scope(drhd->devices,
855                                           drhd->devices_cnt, i, tmp) {
856                         if (tmp == dev) {
857                                 *bus = drhd->devices[i].bus;
858                                 *devfn = drhd->devices[i].devfn;
859                                 goto out;
860                         }
861
862                         if (!pdev || !dev_is_pci(tmp))
863                                 continue;
864
865                         ptmp = to_pci_dev(tmp);
866                         if (ptmp->subordinate &&
867                             ptmp->subordinate->number <= pdev->bus->number &&
868                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
869                                 goto got_pdev;
870                 }
871
872                 if (pdev && drhd->include_all) {
873                 got_pdev:
874                         *bus = pdev->bus->number;
875                         *devfn = pdev->devfn;
876                         goto out;
877                 }
878         }
879         iommu = NULL;
880  out:
881         rcu_read_unlock();
882
883         return iommu;
884 }
885
886 static void domain_flush_cache(struct dmar_domain *domain,
887                                void *addr, int size)
888 {
889         if (!domain->iommu_coherency)
890                 clflush_cache_range(addr, size);
891 }
892
893 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
894 {
895         struct context_entry *context;
896         int ret = 0;
897         unsigned long flags;
898
899         spin_lock_irqsave(&iommu->lock, flags);
900         context = iommu_context_addr(iommu, bus, devfn, 0);
901         if (context)
902                 ret = context_present(context);
903         spin_unlock_irqrestore(&iommu->lock, flags);
904         return ret;
905 }
906
907 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
908 {
909         struct context_entry *context;
910         unsigned long flags;
911
912         spin_lock_irqsave(&iommu->lock, flags);
913         context = iommu_context_addr(iommu, bus, devfn, 0);
914         if (context) {
915                 context_clear_entry(context);
916                 __iommu_flush_cache(iommu, context, sizeof(*context));
917         }
918         spin_unlock_irqrestore(&iommu->lock, flags);
919 }
920
921 static void free_context_table(struct intel_iommu *iommu)
922 {
923         int i;
924         unsigned long flags;
925         struct context_entry *context;
926
927         spin_lock_irqsave(&iommu->lock, flags);
928         if (!iommu->root_entry) {
929                 goto out;
930         }
931         for (i = 0; i < ROOT_ENTRY_NR; i++) {
932                 context = iommu_context_addr(iommu, i, 0, 0);
933                 if (context)
934                         free_pgtable_page(context);
935
936                 if (!ecs_enabled(iommu))
937                         continue;
938
939                 context = iommu_context_addr(iommu, i, 0x80, 0);
940                 if (context)
941                         free_pgtable_page(context);
942
943         }
944         free_pgtable_page(iommu->root_entry);
945         iommu->root_entry = NULL;
946 out:
947         spin_unlock_irqrestore(&iommu->lock, flags);
948 }
949
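/*
 * Walk the page table of @domain and return the PTE for @pfn at
 * *target_level, allocating intermediate page-table pages as needed.
 * If *target_level is 0, stop at the first superpage or non-present
 * entry and report the level actually reached back via *target_level.
 */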
950 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
951                                       unsigned long pfn, int *target_level)
952 {
953         struct dma_pte *parent, *pte = NULL;
954         int level = agaw_to_level(domain->agaw);
955         int offset;
956
957         BUG_ON(!domain->pgd);
958
959         if (!domain_pfn_supported(domain, pfn))
960                 /* Address beyond IOMMU's addressing capabilities. */
961                 return NULL;
962
963         parent = domain->pgd;
964
965         while (1) {
966                 void *tmp_page;
967
968                 offset = pfn_level_offset(pfn, level);
969                 pte = &parent[offset];
970                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
971                         break;
972                 if (level == *target_level)
973                         break;
974
975                 if (!dma_pte_present(pte)) {
976                         uint64_t pteval;
977
978                         tmp_page = alloc_pgtable_page(domain->nid);
979
980                         if (!tmp_page)
981                                 return NULL;
982
983                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
984                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
985                         if (cmpxchg64(&pte->val, 0ULL, pteval))
986                                 /* Someone else set it while we were thinking; use theirs. */
987                                 free_pgtable_page(tmp_page);
988                         else
989                                 domain_flush_cache(domain, pte, sizeof(*pte));
990                 }
991                 if (level == 1)
992                         break;
993
994                 parent = phys_to_virt(dma_pte_addr(pte));
995                 level--;
996         }
997
998         if (!*target_level)
999                 *target_level = level;
1000
1001         return pte;
1002 }
1003
1004
1005 /* return address's pte at specific level */
1006 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1007                                          unsigned long pfn,
1008                                          int level, int *large_page)
1009 {
1010         struct dma_pte *parent, *pte = NULL;
1011         int total = agaw_to_level(domain->agaw);
1012         int offset;
1013
1014         parent = domain->pgd;
1015         while (level <= total) {
1016                 offset = pfn_level_offset(pfn, total);
1017                 pte = &parent[offset];
1018                 if (level == total)
1019                         return pte;
1020
1021                 if (!dma_pte_present(pte)) {
1022                         *large_page = total;
1023                         break;
1024                 }
1025
1026                 if (dma_pte_superpage(pte)) {
1027                         *large_page = total;
1028                         return pte;
1029                 }
1030
1031                 parent = phys_to_virt(dma_pte_addr(pte));
1032                 total--;
1033         }
1034         return NULL;
1035 }
1036
1037 /* clear last level pte, a tlb flush should be followed */
1038 static void dma_pte_clear_range(struct dmar_domain *domain,
1039                                 unsigned long start_pfn,
1040                                 unsigned long last_pfn)
1041 {
1042         unsigned int large_page = 1;
1043         struct dma_pte *first_pte, *pte;
1044
1045         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1046         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1047         BUG_ON(start_pfn > last_pfn);
1048
1049         /* we don't need lock here; nobody else touches the iova range */
1050         do {
1051                 large_page = 1;
1052                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1053                 if (!pte) {
1054                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1055                         continue;
1056                 }
1057                 do {
1058                         dma_clear_pte(pte);
1059                         start_pfn += lvl_to_nr_pages(large_page);
1060                         pte++;
1061                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1062
1063                 domain_flush_cache(domain, first_pte,
1064                                    (void *)pte - (void *)first_pte);
1065
1066         } while (start_pfn && start_pfn <= last_pfn);
1067 }
1068
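/*
 * Recursively free page-table pages that are entirely covered by the
 * range [start_pfn, last_pfn].  Leaf PTEs must already have been
 * cleared by dma_pte_clear_range().
 */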
1069 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1070                                struct dma_pte *pte, unsigned long pfn,
1071                                unsigned long start_pfn, unsigned long last_pfn)
1072 {
1073         pfn = max(start_pfn, pfn);
1074         pte = &pte[pfn_level_offset(pfn, level)];
1075
1076         do {
1077                 unsigned long level_pfn;
1078                 struct dma_pte *level_pte;
1079
1080                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1081                         goto next;
1082
1083                 level_pfn = pfn & level_mask(level);
1084                 level_pte = phys_to_virt(dma_pte_addr(pte));
1085
1086                 if (level > 2)
1087                         dma_pte_free_level(domain, level - 1, level_pte,
1088                                            level_pfn, start_pfn, last_pfn);
1089
1090                 /* If range covers entire pagetable, free it */
1091                 if (!(start_pfn > level_pfn ||
1092                       last_pfn < level_pfn + level_size(level) - 1)) {
1093                         dma_clear_pte(pte);
1094                         domain_flush_cache(domain, pte, sizeof(*pte));
1095                         free_pgtable_page(level_pte);
1096                 }
1097 next:
1098                 pfn += level_size(level);
1099         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1100 }
1101
1102 /* free page table pages. last level pte should already be cleared */
1103 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1104                                    unsigned long start_pfn,
1105                                    unsigned long last_pfn)
1106 {
1107         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1108         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1109         BUG_ON(start_pfn > last_pfn);
1110
1111         dma_pte_clear_range(domain, start_pfn, last_pfn);
1112
1113         /* We don't need lock here; nobody else touches the iova range */
1114         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1115                            domain->pgd, 0, start_pfn, last_pfn);
1116
1117         /* free pgd */
1118         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1119                 free_pgtable_page(domain->pgd);
1120                 domain->pgd = NULL;
1121         }
1122 }
1123
1124 /* When a page at a given level is being unlinked from its parent, we don't
1125    need to *modify* it at all. All we need to do is make a list of all the
1126    pages which can be freed just as soon as we've flushed the IOTLB and we
1127    know the hardware page-walk will no longer touch them.
1128    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1129    be freed. */
1130 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1131                                             int level, struct dma_pte *pte,
1132                                             struct page *freelist)
1133 {
1134         struct page *pg;
1135
1136         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1137         pg->freelist = freelist;
1138         freelist = pg;
1139
1140         if (level == 1)
1141                 return freelist;
1142
1143         pte = page_address(pg);
1144         do {
1145                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1146                         freelist = dma_pte_list_pagetables(domain, level - 1,
1147                                                            pte, freelist);
1148                 pte++;
1149         } while (!first_pte_in_page(pte));
1150
1151         return freelist;
1152 }
1153
1154 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1155                                         struct dma_pte *pte, unsigned long pfn,
1156                                         unsigned long start_pfn,
1157                                         unsigned long last_pfn,
1158                                         struct page *freelist)
1159 {
1160         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1161
1162         pfn = max(start_pfn, pfn);
1163         pte = &pte[pfn_level_offset(pfn, level)];
1164
1165         do {
1166                 unsigned long level_pfn;
1167
1168                 if (!dma_pte_present(pte))
1169                         goto next;
1170
1171                 level_pfn = pfn & level_mask(level);
1172
1173                 /* If range covers entire pagetable, free it */
1174                 if (start_pfn <= level_pfn &&
1175                     last_pfn >= level_pfn + level_size(level) - 1) {
1176                         /* These subordinate page tables are going away entirely. Don't
1177                            bother to clear them; we're just going to *free* them. */
1178                         if (level > 1 && !dma_pte_superpage(pte))
1179                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1180
1181                         dma_clear_pte(pte);
1182                         if (!first_pte)
1183                                 first_pte = pte;
1184                         last_pte = pte;
1185                 } else if (level > 1) {
1186                         /* Recurse down into a level that isn't *entirely* obsolete */
1187                         freelist = dma_pte_clear_level(domain, level - 1,
1188                                                        phys_to_virt(dma_pte_addr(pte)),
1189                                                        level_pfn, start_pfn, last_pfn,
1190                                                        freelist);
1191                 }
1192 next:
1193                 pfn += level_size(level);
1194         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1195
1196         if (first_pte)
1197                 domain_flush_cache(domain, first_pte,
1198                                    (void *)++last_pte - (void *)first_pte);
1199
1200         return freelist;
1201 }
1202
1203 /* We can't just free the pages because the IOMMU may still be walking
1204    the page tables, and may have cached the intermediate levels. The
1205    pages can only be freed after the IOTLB flush has been done. */
1206 struct page *domain_unmap(struct dmar_domain *domain,
1207                           unsigned long start_pfn,
1208                           unsigned long last_pfn)
1209 {
1210         struct page *freelist = NULL;
1211
1212         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1213         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1214         BUG_ON(start_pfn > last_pfn);
1215
1216         /* we don't need lock here; nobody else touches the iova range */
1217         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1218                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1219
1220         /* free pgd */
1221         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1222                 struct page *pgd_page = virt_to_page(domain->pgd);
1223                 pgd_page->freelist = freelist;
1224                 freelist = pgd_page;
1225
1226                 domain->pgd = NULL;
1227         }
1228
1229         return freelist;
1230 }
1231
1232 void dma_free_pagelist(struct page *freelist)
1233 {
1234         struct page *pg;
1235
1236         while ((pg = freelist)) {
1237                 freelist = pg->freelist;
1238                 free_pgtable_page(page_address(pg));
1239         }
1240 }
1241
1242 /* iommu handling */
1243 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1244 {
1245         struct root_entry *root;
1246         unsigned long flags;
1247
1248         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1249         if (!root) {
1250                 pr_err("Allocating root entry for %s failed\n",
1251                         iommu->name);
1252                 return -ENOMEM;
1253         }
1254
1255         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1256
1257         spin_lock_irqsave(&iommu->lock, flags);
1258         iommu->root_entry = root;
1259         spin_unlock_irqrestore(&iommu->lock, flags);
1260
1261         return 0;
1262 }
1263
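/*
 * Program the root table address (with the RTT bit when extended
 * root/context tables are in use) and issue the Set Root Table Pointer
 * command, waiting for the hardware to acknowledge it.
 */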
1264 static void iommu_set_root_entry(struct intel_iommu *iommu)
1265 {
1266         u64 addr;
1267         u32 sts;
1268         unsigned long flag;
1269
1270         addr = virt_to_phys(iommu->root_entry);
1271         if (ecs_enabled(iommu))
1272                 addr |= DMA_RTADDR_RTT;
1273
1274         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1275         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1276
1277         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1278
1279         /* Make sure hardware complete it */
1280         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1281                       readl, (sts & DMA_GSTS_RTPS), sts);
1282
1283         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1284 }
1285
1286 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1287 {
1288         u32 val;
1289         unsigned long flag;
1290
1291         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1292                 return;
1293
1294         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1295         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1296
1297         /* Make sure hardware complete it */
1298         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1299                       readl, (!(val & DMA_GSTS_WBFS)), val);
1300
1301         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1302 }
1303
1304 /* return value determines if we need a write buffer flush */
1305 static void __iommu_flush_context(struct intel_iommu *iommu,
1306                                   u16 did, u16 source_id, u8 function_mask,
1307                                   u64 type)
1308 {
1309         u64 val = 0;
1310         unsigned long flag;
1311
1312         switch (type) {
1313         case DMA_CCMD_GLOBAL_INVL:
1314                 val = DMA_CCMD_GLOBAL_INVL;
1315                 break;
1316         case DMA_CCMD_DOMAIN_INVL:
1317                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1318                 break;
1319         case DMA_CCMD_DEVICE_INVL:
1320                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1321                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1322                 break;
1323         default:
1324                 BUG();
1325         }
1326         val |= DMA_CCMD_ICC;
1327
1328         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1329         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1330
1331         /* Make sure hardware complete it */
1332         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1333                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1334
1335         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1336 }
1337
1338 /* return value determines if we need a write buffer flush */
1339 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1340                                 u64 addr, unsigned int size_order, u64 type)
1341 {
1342         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1343         u64 val = 0, val_iva = 0;
1344         unsigned long flag;
1345
1346         switch (type) {
1347         case DMA_TLB_GLOBAL_FLUSH:
1348                 /* global flush doesn't need set IVA_REG */
1349                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1350                 break;
1351         case DMA_TLB_DSI_FLUSH:
1352                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1353                 break;
1354         case DMA_TLB_PSI_FLUSH:
1355                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1356                 /* IH bit is passed in as part of address */
1357                 val_iva = size_order | addr;
1358                 break;
1359         default:
1360                 BUG();
1361         }
1362         /* Note: set drain read/write */
1363 #if 0
1364         /*
1365          * This is probably to be super secure.. Looks like we can
1366          * ignore it without any impact.
1367          */
1368         if (cap_read_drain(iommu->cap))
1369                 val |= DMA_TLB_READ_DRAIN;
1370 #endif
1371         if (cap_write_drain(iommu->cap))
1372                 val |= DMA_TLB_WRITE_DRAIN;
1373
1374         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1375         /* Note: Only uses first TLB reg currently */
1376         if (val_iva)
1377                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1378         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1379
1380         /* Make sure hardware complete it */
1381         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1382                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1383
1384         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1385
1386         /* check IOTLB invalidation granularity */
1387         if (DMA_TLB_IAIG(val) == 0)
1388                 pr_err("Flush IOTLB failed\n");
1389         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1390                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1391                         (unsigned long long)DMA_TLB_IIRG(type),
1392                         (unsigned long long)DMA_TLB_IAIG(val));
1393 }
1394
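/*
 * Return the device_domain_info for (@bus, @devfn) in @domain if both
 * the IOMMU (queued invalidation plus device-IOTLB support) and the PCI
 * device (ATS capability, matching ATSR unit) allow device IOTLBs;
 * otherwise return NULL.
 */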
1395 static struct device_domain_info *
1396 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1397                          u8 bus, u8 devfn)
1398 {
1399         bool found = false;
1400         unsigned long flags;
1401         struct device_domain_info *info;
1402         struct pci_dev *pdev;
1403
1404         if (!ecap_dev_iotlb_support(iommu->ecap))
1405                 return NULL;
1406
1407         if (!iommu->qi)
1408                 return NULL;
1409
1410         spin_lock_irqsave(&device_domain_lock, flags);
1411         list_for_each_entry(info, &domain->devices, link)
1412                 if (info->iommu == iommu && info->bus == bus &&
1413                     info->devfn == devfn) {
1414                         found = true;
1415                         break;
1416                 }
1417         spin_unlock_irqrestore(&device_domain_lock, flags);
1418
1419         if (!found || !info->dev || !dev_is_pci(info->dev))
1420                 return NULL;
1421
1422         pdev = to_pci_dev(info->dev);
1423
1424         if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1425                 return NULL;
1426
1427         if (!dmar_find_matched_atsr_unit(pdev))
1428                 return NULL;
1429
1430         return info;
1431 }
1432
1433 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1434 {
1435         if (!info || !dev_is_pci(info->dev))
1436                 return;
1437
1438         pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1439 }
1440
1441 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1442 {
1443         if (!info->dev || !dev_is_pci(info->dev) ||
1444             !pci_ats_enabled(to_pci_dev(info->dev)))
1445                 return;
1446
1447         pci_disable_ats(to_pci_dev(info->dev));
1448 }
1449
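/*
 * Send a device-IOTLB (ATS) invalidation for the given address range to
 * every ATS-enabled PCI device attached to @domain.
 */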
1450 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1451                                   u64 addr, unsigned mask)
1452 {
1453         u16 sid, qdep;
1454         unsigned long flags;
1455         struct device_domain_info *info;
1456
1457         spin_lock_irqsave(&device_domain_lock, flags);
1458         list_for_each_entry(info, &domain->devices, link) {
1459                 struct pci_dev *pdev;
1460                 if (!info->dev || !dev_is_pci(info->dev))
1461                         continue;
1462
1463                 pdev = to_pci_dev(info->dev);
1464                 if (!pci_ats_enabled(pdev))
1465                         continue;
1466
1467                 sid = info->bus << 8 | info->devfn;
1468                 qdep = pci_ats_queue_depth(pdev);
1469                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1470         }
1471         spin_unlock_irqrestore(&device_domain_lock, flags);
1472 }
1473
1474 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1475                                   struct dmar_domain *domain,
1476                                   unsigned long pfn, unsigned int pages,
1477                                   int ih, int map)
1478 {
1479         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1480         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1481         u16 did = domain->iommu_did[iommu->seq_id];
1482
1483         BUG_ON(pages == 0);
1484
1485         if (ih)
1486                 ih = 1 << 6;
1487         /*
1488          * Fallback to domain selective flush if no PSI support or the size is
1489          * too big.
1490          * PSI requires page size to be 2 ^ x, and the base address is naturally
1491          * aligned to the size
1492          */
1493         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1494                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1495                                                 DMA_TLB_DSI_FLUSH);
1496         else
1497                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1498                                                 DMA_TLB_PSI_FLUSH);
1499
1500         /*
1501          * In caching mode, changes of pages from non-present to present require
1502          * flush. However, device IOTLB doesn't need to be flushed in this case.
1503          */
1504         if (!cap_caching_mode(iommu->cap) || !map)
1505                 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1506                                       addr, mask);
1507 }
1508
1509 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1510 {
1511         u32 pmen;
1512         unsigned long flags;
1513
1514         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1515         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1516         pmen &= ~DMA_PMEN_EPM;
1517         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1518
1519         /* wait for the protected region status bit to clear */
1520         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1521                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1522
1523         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1524 }
1525
1526 static void iommu_enable_translation(struct intel_iommu *iommu)
1527 {
1528         u32 sts;
1529         unsigned long flags;
1530
1531         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1532         iommu->gcmd |= DMA_GCMD_TE;
1533         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1534
1535         /* Make sure hardware complete it */
1536         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1537                       readl, (sts & DMA_GSTS_TES), sts);
1538
1539         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1540 }
1541
1542 static void iommu_disable_translation(struct intel_iommu *iommu)
1543 {
1544         u32 sts;
1545         unsigned long flag;
1546
1547         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1548         iommu->gcmd &= ~DMA_GCMD_TE;
1549         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1550
1551         /* Make sure hardware complete it */
1552         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1553                       readl, (!(sts & DMA_GSTS_TES)), sts);
1554
1555         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1556 }
1557
1558
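/*
 * Allocate the domain-id bitmap and the two-level array of domain
 * pointers for @iommu, and reserve domain-id 0 (see the comment below).
 */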
1559 static int iommu_init_domains(struct intel_iommu *iommu)
1560 {
1561         u32 ndomains, nlongs;
1562         size_t size;
1563
1564         ndomains = cap_ndoms(iommu->cap);
1565         pr_debug("%s: Number of Domains supported <%d>\n",
1566                  iommu->name, ndomains);
1567         nlongs = BITS_TO_LONGS(ndomains);
1568
1569         spin_lock_init(&iommu->lock);
1570
1571         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1572         if (!iommu->domain_ids) {
1573                 pr_err("%s: Allocating domain id array failed\n",
1574                        iommu->name);
1575                 return -ENOMEM;
1576         }
1577
1578         size = ((ndomains >> 8) + 1) * sizeof(struct dmar_domain **);
1579         iommu->domains = kzalloc(size, GFP_KERNEL);
1580
1581         if (iommu->domains) {
1582                 size = 256 * sizeof(struct dmar_domain *);
1583                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1584         }
1585
1586         if (!iommu->domains || !iommu->domains[0]) {
1587                 pr_err("%s: Allocating domain array failed\n",
1588                        iommu->name);
1589                 kfree(iommu->domain_ids);
1590                 kfree(iommu->domains);
1591                 iommu->domain_ids = NULL;
1592                 iommu->domains    = NULL;
1593                 return -ENOMEM;
1594         }
1595
1598         /*
1599          * If Caching mode is set, then invalid translations are tagged
1600          * with domain-id 0, hence we need to pre-allocate it. We also
1601          * use domain-id 0 as a marker for non-allocated domain-id, so
1602          * make sure it is not used for a real domain.
1603          */
1604         set_bit(0, iommu->domain_ids);
1605
1606         return 0;
1607 }
1608
1609 static void disable_dmar_iommu(struct intel_iommu *iommu)
1610 {
1611         struct dmar_domain *domain;
1612         int i;
1613
1614         if ((iommu->domains) && (iommu->domain_ids)) {
1615                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1616                         /*
1617                          * Domain id 0 is reserved for invalid translation
1618                          * when hardware supports caching mode, and is used
1619                          * as a marker for non-allocated domain ids.
1620                          */
1621                         if (i == 0)
1622                                 continue;
1623
1624                         domain = get_iommu_domain(iommu, i);
1625                         clear_bit(i, iommu->domain_ids);
1626                         if (domain_detach_iommu(domain, iommu) == 0 &&
1627                             !domain_type_is_vm(domain))
1628                                 domain_exit(domain);
1629                 }
1630         }
1631
1632         if (iommu->gcmd & DMA_GCMD_TE)
1633                 iommu_disable_translation(iommu);
1634 }
1635
1636 static void free_dmar_iommu(struct intel_iommu *iommu)
1637 {
1638         if ((iommu->domains) && (iommu->domain_ids)) {
1639                 int elems = (cap_ndoms(iommu->cap) >> 8) + 1;
1640                 int i;
1641
1642                 for (i = 0; i < elems; i++)
1643                         kfree(iommu->domains[i]);
1644                 kfree(iommu->domains);
1645                 kfree(iommu->domain_ids);
1646                 iommu->domains = NULL;
1647                 iommu->domain_ids = NULL;
1648         }
1649
1650         g_iommus[iommu->seq_id] = NULL;
1651
1652         /* free context mapping */
1653         free_context_table(iommu);
1654 }
1655
1656 static struct dmar_domain *alloc_domain(int flags)
1657 {
1658         /* domain id for a virtual machine; it won't be set in a context entry */
1659         static atomic_t vm_domid = ATOMIC_INIT(0);
1660         struct dmar_domain *domain;
1661
1662         domain = alloc_domain_mem();
1663         if (!domain)
1664                 return NULL;
1665
1666         memset(domain, 0, sizeof(*domain));
1667         domain->nid = -1;
1668         domain->flags = flags;
1669         spin_lock_init(&domain->iommu_lock);
1670         INIT_LIST_HEAD(&domain->devices);
1671         if (flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1672                 domain->id = atomic_inc_return(&vm_domid);
1673
1674         return domain;
1675 }
1676
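/*
 * Allocate a domain id for @domain on @iommu, or return the id that is
 * already assigned to it.  Returns -ENOSPC when the id space is exhausted.
 * The caller must hold iommu->lock.
 */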
1677 static int __iommu_attach_domain(struct dmar_domain *domain,
1678                                  struct intel_iommu *iommu)
1679 {
1680         int num;
1681         unsigned long ndomains;
1682
1683         num = domain->iommu_did[iommu->seq_id];
1684         if (num)
1685                 return num;
1686
1687         ndomains = cap_ndoms(iommu->cap);
1688         num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1689
1690         if (num < ndomains) {
1691                 set_bit(num, iommu->domain_ids);
1692                 set_iommu_domain(iommu, num, domain);
1693                 domain->iommu_did[iommu->seq_id] = num;
1694         } else {
1695                 num = -ENOSPC;
1696         }
1697
1698         if (num < 0)
1699                 pr_err("%s: No free domain ids\n", iommu->name);
1700
1701         return num;
1702 }
1703
1704 static int iommu_attach_domain(struct dmar_domain *domain,
1705                                struct intel_iommu *iommu)
1706 {
1707         int num;
1708         unsigned long flags;
1709
1710         spin_lock_irqsave(&iommu->lock, flags);
1711         num = __iommu_attach_domain(domain, iommu);
1712         spin_unlock_irqrestore(&iommu->lock, flags);
1713
1714         return num;
1715 }
1716
1717 static void iommu_detach_domain(struct dmar_domain *domain,
1718                                 struct intel_iommu *iommu)
1719 {
1720         unsigned long flags;
1721         int num;
1722
1723         spin_lock_irqsave(&iommu->lock, flags);
1724
1725         num = domain->iommu_did[iommu->seq_id];
1726
1727         if (num) {
1728                 clear_bit(num, iommu->domain_ids);
1729                 set_iommu_domain(iommu, num, NULL);
1730         }
1732
1733         spin_unlock_irqrestore(&iommu->lock, flags);
1734 }
1735
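/*
 * Track which IOMMUs a domain is attached to.  domain->iommu_bmp keeps one
 * bit per IOMMU and domain->iommu_count the number of set bits;
 * domain_detach_iommu() returns the remaining attachment count so the
 * caller can tell when the last IOMMU has gone away.
 */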
1736 static void domain_attach_iommu(struct dmar_domain *domain,
1737                                struct intel_iommu *iommu)
1738 {
1739         unsigned long flags;
1740
1741         spin_lock_irqsave(&domain->iommu_lock, flags);
1742         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1743                 domain->iommu_count++;
1744                 if (domain->iommu_count == 1)
1745                         domain->nid = iommu->node;
1746                 domain_update_iommu_cap(domain);
1747         }
1748         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1749 }
1750
1751 static int domain_detach_iommu(struct dmar_domain *domain,
1752                                struct intel_iommu *iommu)
1753 {
1754         unsigned long flags;
1755         int count = INT_MAX;
1756
1757         spin_lock_irqsave(&domain->iommu_lock, flags);
1758         if (test_and_clear_bit(iommu->seq_id, domain->iommu_bmp)) {
1759                 count = --domain->iommu_count;
1760                 domain_update_iommu_cap(domain);
1761                 domain->iommu_did[iommu->seq_id] = 0;
1762         }
1763         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1764
1765         return count;
1766 }
1767
1768 static struct iova_domain reserved_iova_list;
1769 static struct lock_class_key reserved_rbtree_key;
1770
1771 static int dmar_init_reserved_ranges(void)
1772 {
1773         struct pci_dev *pdev = NULL;
1774         struct iova *iova;
1775         int i;
1776
1777         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1778                         DMA_32BIT_PFN);
1779
1780         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1781                 &reserved_rbtree_key);
1782
1783         /* IOAPIC ranges shouldn't be accessed by DMA */
1784         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1785                 IOVA_PFN(IOAPIC_RANGE_END));
1786         if (!iova) {
1787                 pr_err("Reserve IOAPIC range failed\n");
1788                 return -ENODEV;
1789         }
1790
1791         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1792         for_each_pci_dev(pdev) {
1793                 struct resource *r;
1794
1795                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1796                         r = &pdev->resource[i];
1797                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1798                                 continue;
1799                         iova = reserve_iova(&reserved_iova_list,
1800                                             IOVA_PFN(r->start),
1801                                             IOVA_PFN(r->end));
1802                         if (!iova) {
1803                                 pr_err("Reserve iova failed\n");
1804                                 return -ENODEV;
1805                         }
1806                 }
1807         }
1808         return 0;
1809 }
1810
1811 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1812 {
1813         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1814 }
1815
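/*
 * Round the guest address width up to the next width the page tables can
 * represent (12 bits of page offset plus a multiple of the 9-bit level
 * stride), capped at 64.  For example a gaw of 36 becomes 39
 * (36 + 9 - (36 - 12) % 9), while 39 and 48 are already aligned.
 */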
1816 static inline int guestwidth_to_adjustwidth(int gaw)
1817 {
1818         int agaw;
1819         int r = (gaw - 12) % 9;
1820
1821         if (r == 0)
1822                 agaw = gaw;
1823         else
1824                 agaw = gaw + 9 - r;
1825         if (agaw > 64)
1826                 agaw = 64;
1827         return agaw;
1828 }
1829
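/*
 * Initialize a newly attached domain: set up its IOVA allocator and
 * reserved ranges, derive the adjusted guest address width (agaw) that the
 * hardware actually supports, cache the coherency/snooping/superpage
 * capabilities, and allocate the top-level page directory.
 */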
1830 static int domain_init(struct dmar_domain *domain, int guest_width)
1831 {
1832         struct intel_iommu *iommu;
1833         int adjust_width, agaw;
1834         unsigned long sagaw;
1835
1836         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1837                         DMA_32BIT_PFN);
1838         domain_reserve_special_ranges(domain);
1839
1840         /* calculate AGAW */
1841         iommu = domain_get_iommu(domain);
1842         if (guest_width > cap_mgaw(iommu->cap))
1843                 guest_width = cap_mgaw(iommu->cap);
1844         domain->gaw = guest_width;
1845         adjust_width = guestwidth_to_adjustwidth(guest_width);
1846         agaw = width_to_agaw(adjust_width);
1847         sagaw = cap_sagaw(iommu->cap);
1848         if (!test_bit(agaw, &sagaw)) {
1849                 /* hardware doesn't support it, choose a bigger one */
1850                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1851                 agaw = find_next_bit(&sagaw, 5, agaw);
1852                 if (agaw >= 5)
1853                         return -ENODEV;
1854         }
1855         domain->agaw = agaw;
1856
1857         if (ecap_coherent(iommu->ecap))
1858                 domain->iommu_coherency = 1;
1859         else
1860                 domain->iommu_coherency = 0;
1861
1862         if (ecap_sc_support(iommu->ecap))
1863                 domain->iommu_snooping = 1;
1864         else
1865                 domain->iommu_snooping = 0;
1866
1867         if (intel_iommu_superpage)
1868                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1869         else
1870                 domain->iommu_superpage = 0;
1871
1872         domain->nid = iommu->node;
1873
1874         /* always allocate the top pgd */
1875         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1876         if (!domain->pgd)
1877                 return -ENOMEM;
1878         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1879         return 0;
1880 }
1881
1882 static void domain_exit(struct dmar_domain *domain)
1883 {
1884         struct dmar_drhd_unit *drhd;
1885         struct intel_iommu *iommu;
1886         struct page *freelist = NULL;
1887
1888         /* Domain 0 is reserved, so don't process it */
1889         if (!domain)
1890                 return;
1891
1892         /* Flush any lazy unmaps that may reference this domain */
1893         if (!intel_iommu_strict)
1894                 flush_unmaps_timeout(0);
1895
1896         /* remove associated devices */
1897         domain_remove_dev_info(domain);
1898
1899         /* destroy iovas */
1900         put_iova_domain(&domain->iovad);
1901
1902         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1903
1904         /* clear attached or cached domains */
1905         rcu_read_lock();
1906         for_each_active_iommu(iommu, drhd)
1907                 if (domain_type_is_vm(domain) ||
1908                     test_bit(iommu->seq_id, domain->iommu_bmp))
1909                         iommu_detach_domain(domain, iommu);
1910         rcu_read_unlock();
1911
1912         dma_free_pagelist(freelist);
1913
1914         free_domain_mem(domain);
1915 }
1916
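/*
 * Program the context entry for (@bus, @devfn) on @iommu so that the device
 * uses @domain's page tables (or pass-through translation for the hardware
 * pass-through si_domain).  A per-IOMMU domain id is allocated on demand,
 * and either the context/IOTLB caches or the write buffer are flushed,
 * depending on caching mode.
 */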
1917 static int domain_context_mapping_one(struct dmar_domain *domain,
1918                                       struct intel_iommu *iommu,
1919                                       u8 bus, u8 devfn)
1920 {
1921         int translation = CONTEXT_TT_MULTI_LEVEL;
1922         struct device_domain_info *info = NULL;
1923         struct context_entry *context;
1924         unsigned long flags;
1925         struct dma_pte *pgd;
1926         int id;
1927         int agaw;
1928
1929         if (hw_pass_through && domain_type_is_si(domain))
1930                 translation = CONTEXT_TT_PASS_THROUGH;
1931
1932         pr_debug("Set context mapping for %02x:%02x.%d\n",
1933                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1934
1935         BUG_ON(!domain->pgd);
1936
1937         spin_lock_irqsave(&iommu->lock, flags);
1938         context = iommu_context_addr(iommu, bus, devfn, 1);
1939         spin_unlock_irqrestore(&iommu->lock, flags);
1940         if (!context)
1941                 return -ENOMEM;
1942         spin_lock_irqsave(&iommu->lock, flags);
1943         if (context_present(context)) {
1944                 spin_unlock_irqrestore(&iommu->lock, flags);
1945                 return 0;
1946         }
1947
1948         pgd = domain->pgd;
1949
1950         id = __iommu_attach_domain(domain, iommu);
1951         if (id < 0) {
1952                 spin_unlock_irqrestore(&iommu->lock, flags);
1953                 pr_err("%s: No free domain ids\n", iommu->name);
1954                 return -EFAULT;
1955         }
1956
1957         context_clear_entry(context);
1958         context_set_domain_id(context, id);
1959
1960         /*
1961          * Skip the top levels of the page tables for an IOMMU whose agaw
1962          * is less than the domain's default.  Unnecessary for PT mode.
1963          */
1964         if (translation != CONTEXT_TT_PASS_THROUGH) {
1965                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1966                         pgd = phys_to_virt(dma_pte_addr(pgd));
1967                         if (!dma_pte_present(pgd)) {
1968                                 spin_unlock_irqrestore(&iommu->lock, flags);
1969                                 return -ENOMEM;
1970                         }
1971                 }
1972
1973                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1974                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1975                                      CONTEXT_TT_MULTI_LEVEL;
1976
1977                 context_set_address_root(context, virt_to_phys(pgd));
1978                 context_set_address_width(context, iommu->agaw);
1979         } else {
1980                 /*
1981                  * In pass through mode, AW must be programmed to
1982                  * indicate the largest AGAW value supported by
1983                  * hardware. And ASR is ignored by hardware.
1984                  */
1985                 context_set_address_width(context, iommu->msagaw);
1986         }
1987
1988         context_set_translation_type(context, translation);
1989         context_set_fault_enable(context);
1990         context_set_present(context);
1991         domain_flush_cache(domain, context, sizeof(*context));
1992
1993         /*
1994          * It's a non-present to present mapping. If the hardware doesn't cache
1995          * non-present entries we only need to flush the write-buffer. If it
1996          * _does_ cache non-present entries, then it does so in the special
1997          * domain #0, which we have to flush:
1998          */
1999         if (cap_caching_mode(iommu->cap)) {
2000                 iommu->flush.flush_context(iommu, 0,
2001                                            (((u16)bus) << 8) | devfn,
2002                                            DMA_CCMD_MASK_NOBIT,
2003                                            DMA_CCMD_DEVICE_INVL);
2004                 iommu->flush.flush_iotlb(iommu, id, 0, 0, DMA_TLB_DSI_FLUSH);
2005         } else {
2006                 iommu_flush_write_buffer(iommu);
2007         }
2008         iommu_enable_dev_iotlb(info);
2009         spin_unlock_irqrestore(&iommu->lock, flags);
2010
2011         domain_attach_iommu(domain, iommu);
2012
2013         return 0;
2014 }
2015
2016 struct domain_context_mapping_data {
2017         struct dmar_domain *domain;
2018         struct intel_iommu *iommu;
2019 };
2020
2021 static int domain_context_mapping_cb(struct pci_dev *pdev,
2022                                      u16 alias, void *opaque)
2023 {
2024         struct domain_context_mapping_data *data = opaque;
2025
2026         return domain_context_mapping_one(data->domain, data->iommu,
2027                                           PCI_BUS_NUM(alias), alias & 0xff);
2028 }
2029
2030 static int
2031 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2032 {
2033         struct intel_iommu *iommu;
2034         u8 bus, devfn;
2035         struct domain_context_mapping_data data;
2036
2037         iommu = device_to_iommu(dev, &bus, &devfn);
2038         if (!iommu)
2039                 return -ENODEV;
2040
2041         if (!dev_is_pci(dev))
2042                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2043
2044         data.domain = domain;
2045         data.iommu = iommu;
2046
2047         return pci_for_each_dma_alias(to_pci_dev(dev),
2048                                       &domain_context_mapping_cb, &data);
2049 }
2050
2051 static int domain_context_mapped_cb(struct pci_dev *pdev,
2052                                     u16 alias, void *opaque)
2053 {
2054         struct intel_iommu *iommu = opaque;
2055
2056         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2057 }
2058
2059 static int domain_context_mapped(struct device *dev)
2060 {
2061         struct intel_iommu *iommu;
2062         u8 bus, devfn;
2063
2064         iommu = device_to_iommu(dev, &bus, &devfn);
2065         if (!iommu)
2066                 return -ENODEV;
2067
2068         if (!dev_is_pci(dev))
2069                 return device_context_mapped(iommu, bus, devfn);
2070
2071         return !pci_for_each_dma_alias(to_pci_dev(dev),
2072                                        domain_context_mapped_cb, iommu);
2073 }
2074
2075 /* Returns the number of VT-d pages, but aligned to the MM page size */
2076 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2077                                             size_t size)
2078 {
2079         host_addr &= ~PAGE_MASK;
2080         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2081 }
2082
2083 /* Return largest possible superpage level for a given mapping */
2084 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2085                                           unsigned long iov_pfn,
2086                                           unsigned long phy_pfn,
2087                                           unsigned long pages)
2088 {
2089         int support, level = 1;
2090         unsigned long pfnmerge;
2091
2092         support = domain->iommu_superpage;
2093
2094         /* To use a large page, the virtual *and* physical addresses
2095            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2096            of them will mean we have to use smaller pages. So just
2097            merge them and check both at once. */
2098         pfnmerge = iov_pfn | phy_pfn;
2099
2100         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2101                 pages >>= VTD_STRIDE_SHIFT;
2102                 if (!pages)
2103                         break;
2104                 pfnmerge >>= VTD_STRIDE_SHIFT;
2105                 level++;
2106                 support--;
2107         }
2108         return level;
2109 }
2110
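/*
 * Core mapping loop: map @nr_pages starting at @iov_pfn either from a
 * scatterlist or from a contiguous @phys_pfn range.  For each chunk the
 * largest superpage level both addresses allow is chosen, PTEs are written
 * with cmpxchg64 (a non-zero old value indicates an overlapping mapping),
 * and the written PTEs are flushed from the CPU cache whenever a page-table
 * page is filled or the mapping ends.
 */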
2111 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2112                             struct scatterlist *sg, unsigned long phys_pfn,
2113                             unsigned long nr_pages, int prot)
2114 {
2115         struct dma_pte *first_pte = NULL, *pte = NULL;
2116         phys_addr_t uninitialized_var(pteval);
2117         unsigned long sg_res = 0;
2118         unsigned int largepage_lvl = 0;
2119         unsigned long lvl_pages = 0;
2120
2121         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2122
2123         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2124                 return -EINVAL;
2125
2126         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2127
2128         if (!sg) {
2129                 sg_res = nr_pages;
2130                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2131         }
2132
2133         while (nr_pages > 0) {
2134                 uint64_t tmp;
2135
2136                 if (!sg_res) {
2137                         sg_res = aligned_nrpages(sg->offset, sg->length);
2138                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2139                         sg->dma_length = sg->length;
2140                         pteval = page_to_phys(sg_page(sg)) | prot;
2141                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2142                 }
2143
2144                 if (!pte) {
2145                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2146
2147                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2148                         if (!pte)
2149                                 return -ENOMEM;
2150                         /* It is a large page */
2151                         if (largepage_lvl > 1) {
2152                                 pteval |= DMA_PTE_LARGE_PAGE;
2153                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2154                                 /*
2155                                  * Ensure that old small page tables are
2156                                  * removed to make room for superpage,
2157                                  * if they exist.
2158                                  */
2159                                 dma_pte_free_pagetable(domain, iov_pfn,
2160                                                        iov_pfn + lvl_pages - 1);
2161                         } else {
2162                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2163                         }
2164
2165                 }
2166                 /* We don't need a lock here; nobody else
2167                  * touches this iova range.
2168                  */
2169                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2170                 if (tmp) {
2171                         static int dumps = 5;
2172                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2173                                 iov_pfn, tmp, (unsigned long long)pteval);
2174                         if (dumps) {
2175                                 dumps--;
2176                                 debug_dma_dump_mappings(NULL);
2177                         }
2178                         WARN_ON(1);
2179                 }
2180
2181                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2182
2183                 BUG_ON(nr_pages < lvl_pages);
2184                 BUG_ON(sg_res < lvl_pages);
2185
2186                 nr_pages -= lvl_pages;
2187                 iov_pfn += lvl_pages;
2188                 phys_pfn += lvl_pages;
2189                 pteval += lvl_pages * VTD_PAGE_SIZE;
2190                 sg_res -= lvl_pages;
2191
2192                 /* If the next PTE would be the first in a new page, then we
2193                    need to flush the cache on the entries we've just written.
2194                    And then we'll need to recalculate 'pte', so clear it and
2195                    let it get set again in the if (!pte) block above.
2196
2197                    If we're done (!nr_pages) we need to flush the cache too.
2198
2199                    Also if we've been setting superpages, we may need to
2200                    recalculate 'pte' and switch back to smaller pages for the
2201                    end of the mapping, if the trailing size is not enough to
2202                    use another superpage (i.e. sg_res < lvl_pages). */
2203                 pte++;
2204                 if (!nr_pages || first_pte_in_page(pte) ||
2205                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2206                         domain_flush_cache(domain, first_pte,
2207                                            (void *)pte - (void *)first_pte);
2208                         pte = NULL;
2209                 }
2210
2211                 if (!sg_res && nr_pages)
2212                         sg = sg_next(sg);
2213         }
2214         return 0;
2215 }
2216
2217 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2218                                     struct scatterlist *sg, unsigned long nr_pages,
2219                                     int prot)
2220 {
2221         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2222 }
2223
2224 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2225                                      unsigned long phys_pfn, unsigned long nr_pages,
2226                                      int prot)
2227 {
2228         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2229 }
2230
2231 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2232 {
2233         if (!iommu)
2234                 return;
2235
2236         clear_context_table(iommu, bus, devfn);
2237         iommu->flush.flush_context(iommu, 0, 0, 0,
2238                                            DMA_CCMD_GLOBAL_INVL);
2239         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2240 }
2241
2242 static inline void unlink_domain_info(struct device_domain_info *info)
2243 {
2244         assert_spin_locked(&device_domain_lock);
2245         list_del(&info->link);
2246         list_del(&info->global);
2247         if (info->dev)
2248                 info->dev->archdata.iommu = NULL;
2249 }
2250
2251 static void domain_remove_dev_info(struct dmar_domain *domain)
2252 {
2253         struct device_domain_info *info, *tmp;
2254         unsigned long flags;
2255
2256         spin_lock_irqsave(&device_domain_lock, flags);
2257         list_for_each_entry_safe(info, tmp, &domain->devices, link) {
2258                 unlink_domain_info(info);
2259                 spin_unlock_irqrestore(&device_domain_lock, flags);
2260
2261                 iommu_disable_dev_iotlb(info);
2262                 iommu_detach_dev(info->iommu, info->bus, info->devfn);
2263
2264                 if (domain_type_is_vm(domain)) {
2265                         iommu_detach_dependent_devices(info->iommu, info->dev);
2266                         domain_detach_iommu(domain, info->iommu);
2267                 }
2268
2269                 free_devinfo_mem(info);
2270                 spin_lock_irqsave(&device_domain_lock, flags);
2271         }
2272         spin_unlock_irqrestore(&device_domain_lock, flags);
2273 }
2274
2275 /*
2276  * find_domain
2277  * Note: we use struct device->archdata.iommu to store the info
2278  */
2279 static struct dmar_domain *find_domain(struct device *dev)
2280 {
2281         struct device_domain_info *info;
2282
2283         /* No lock here, assumes no domain exit in normal case */
2284         info = dev->archdata.iommu;
2285         if (info)
2286                 return info->domain;
2287         return NULL;
2288 }
2289
2290 static inline struct device_domain_info *
2291 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2292 {
2293         struct device_domain_info *info;
2294
2295         list_for_each_entry(info, &device_domain_list, global)
2296                 if (info->iommu->segment == segment && info->bus == bus &&
2297                     info->devfn == devfn)
2298                         return info;
2299
2300         return NULL;
2301 }
2302
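/*
 * Record the binding between a device (or a PCI DMA alias when @dev is
 * NULL) and @domain.  If another thread already installed a binding, the
 * existing domain is returned instead and the caller is expected to free
 * the domain it passed in.
 */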
2303 static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
2304                                                 int bus, int devfn,
2305                                                 struct device *dev,
2306                                                 struct dmar_domain *domain)
2307 {
2308         struct dmar_domain *found = NULL;
2309         struct device_domain_info *info;
2310         unsigned long flags;
2311
2312         info = alloc_devinfo_mem();
2313         if (!info)
2314                 return NULL;
2315
2316         info->bus = bus;
2317         info->devfn = devfn;
2318         info->dev = dev;
2319         info->domain = domain;
2320         info->iommu = iommu;
2321
2322         spin_lock_irqsave(&device_domain_lock, flags);
2323         if (dev)
2324                 found = find_domain(dev);
2325         else {
2326                 struct device_domain_info *info2;
2327                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2328                 if (info2)
2329                         found = info2->domain;
2330         }
2331         if (found) {
2332                 spin_unlock_irqrestore(&device_domain_lock, flags);
2333                 free_devinfo_mem(info);
2334                 /* Caller must free the original domain */
2335                 return found;
2336         }
2337
2338         list_add(&info->link, &domain->devices);
2339         list_add(&info->global, &device_domain_list);
2340         if (dev)
2341                 dev->archdata.iommu = info;
2342         spin_unlock_irqrestore(&device_domain_lock, flags);
2343
2344         return domain;
2345 }
2346
2347 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2348 {
2349         *(u16 *)opaque = alias;
2350         return 0;
2351 }
2352
2353 /* Find or allocate an initialized domain for @dev */
2354 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2355 {
2356         struct dmar_domain *domain, *tmp;
2357         struct intel_iommu *iommu;
2358         struct device_domain_info *info;
2359         u16 dma_alias;
2360         unsigned long flags;
2361         u8 bus, devfn;
2362
2363         domain = find_domain(dev);
2364         if (domain)
2365                 return domain;
2366
2367         iommu = device_to_iommu(dev, &bus, &devfn);
2368         if (!iommu)
2369                 return NULL;
2370
2371         if (dev_is_pci(dev)) {
2372                 struct pci_dev *pdev = to_pci_dev(dev);
2373
2374                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2375
2376                 spin_lock_irqsave(&device_domain_lock, flags);
2377                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2378                                                       PCI_BUS_NUM(dma_alias),
2379                                                       dma_alias & 0xff);
2380                 if (info) {
2381                         iommu = info->iommu;
2382                         domain = info->domain;
2383                 }
2384                 spin_unlock_irqrestore(&device_domain_lock, flags);
2385
2386                 /* DMA alias already has a domain, use it */
2387                 if (info)
2388                         goto found_domain;
2389         }
2390
2391         /* Allocate and initialize new domain for the device */
2392         domain = alloc_domain(0);
2393         if (!domain)
2394                 return NULL;
2395         domain->id = iommu_attach_domain(domain, iommu);
2396         if (domain->id < 0) {
2397                 free_domain_mem(domain);
2398                 return NULL;
2399         }
2400         domain_attach_iommu(domain, iommu);
2401         if (domain_init(domain, gaw)) {
2402                 domain_exit(domain);
2403                 return NULL;
2404         }
2405
2406         /* register PCI DMA alias device */
2407         if (dev_is_pci(dev)) {
2408                 tmp = dmar_insert_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2409                                            dma_alias & 0xff, NULL, domain);
2410
2411                 if (!tmp || tmp != domain) {
2412                         domain_exit(domain);
2413                         domain = tmp;
2414                 }
2415
2416                 if (!domain)
2417                         return NULL;
2418         }
2419
2420 found_domain:
2421         tmp = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2422
2423         if (!tmp || tmp != domain) {
2424                 domain_exit(domain);
2425                 domain = tmp;
2426         }
2427
2428         return domain;
2429 }
2430
2431 static int iommu_identity_mapping;
2432 #define IDENTMAP_ALL            1
2433 #define IDENTMAP_GFX            2
2434 #define IDENTMAP_AZALIA         4
2435
2436 static int iommu_domain_identity_map(struct dmar_domain *domain,
2437                                      unsigned long long start,
2438                                      unsigned long long end)
2439 {
2440         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2441         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2442
2443         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2444                           dma_to_mm_pfn(last_vpfn))) {
2445                 pr_err("Reserving iova failed\n");
2446                 return -ENOMEM;
2447         }
2448
2449         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2450                  start, end, domain->id);
2451         /*
2452          * The RMRR range might overlap with a physical memory range,
2453          * so clear it first.
2454          */
2455         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2456
2457         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2458                                   last_vpfn - first_vpfn + 1,
2459                                   DMA_PTE_READ|DMA_PTE_WRITE);
2460 }
2461
2462 static int iommu_prepare_identity_map(struct device *dev,
2463                                       unsigned long long start,
2464                                       unsigned long long end)
2465 {
2466         struct dmar_domain *domain;
2467         int ret;
2468
2469         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2470         if (!domain)
2471                 return -ENOMEM;
2472
2473         /* For _hardware_ passthrough, don't bother. But for software
2474            passthrough, we do it anyway -- it may indicate a memory
2475            range which is reserved in E820 and so didn't get set
2476            up to start with in si_domain */
2477         if (domain == si_domain && hw_pass_through) {
2478                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2479                         dev_name(dev), start, end);
2480                 return 0;
2481         }
2482
2483         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2484                 dev_name(dev), start, end);
2485
2486         if (end < start) {
2487                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2488                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2489                         dmi_get_system_info(DMI_BIOS_VENDOR),
2490                         dmi_get_system_info(DMI_BIOS_VERSION),
2491                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2492                 ret = -EIO;
2493                 goto error;
2494         }
2495
2496         if (end >> agaw_to_width(domain->agaw)) {
2497                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2498                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2499                      agaw_to_width(domain->agaw),
2500                      dmi_get_system_info(DMI_BIOS_VENDOR),
2501                      dmi_get_system_info(DMI_BIOS_VERSION),
2502                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2503                 ret = -EIO;
2504                 goto error;
2505         }
2506
2507         ret = iommu_domain_identity_map(domain, start, end);
2508         if (ret)
2509                 goto error;
2510
2511         /* context entry init */
2512         ret = domain_context_mapping(domain, dev);
2513         if (ret)
2514                 goto error;
2515
2516         return 0;
2517
2518  error:
2519         domain_exit(domain);
2520         return ret;
2521 }
2522
2523 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2524                                          struct device *dev)
2525 {
2526         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2527                 return 0;
2528         return iommu_prepare_identity_map(dev, rmrr->base_address,
2529                                           rmrr->end_address);
2530 }
2531
2532 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2533 static inline void iommu_prepare_isa(void)
2534 {
2535         struct pci_dev *pdev;
2536         int ret;
2537
2538         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2539         if (!pdev)
2540                 return;
2541
2542         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2543         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2544
2545         if (ret)
2546                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2547
2548         pci_dev_put(pdev);
2549 }
2550 #else
2551 static inline void iommu_prepare_isa(void)
2552 {
2553         return;
2554 }
2555 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2556
2557 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2558
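/*
 * Build the static identity (si) domain used for pass-through devices.
 * With hardware pass-through (@hw) no page tables are needed; otherwise
 * every usable physical memory range is mapped 1:1 into the domain.
 */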
2559 static int __init si_domain_init(int hw)
2560 {
2561         int nid, ret = 0;
2562
2563         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2564         if (!si_domain)
2565                 return -EFAULT;
2566
2567         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2568                 domain_exit(si_domain);
2569                 return -EFAULT;
2570         }
2571
2572         pr_debug("Identity mapping domain allocated\n");
2573
2574         if (hw)
2575                 return 0;
2576
2577         for_each_online_node(nid) {
2578                 unsigned long start_pfn, end_pfn;
2579                 int i;
2580
2581                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2582                         ret = iommu_domain_identity_map(si_domain,
2583                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2584                         if (ret)
2585                                 return ret;
2586                 }
2587         }
2588
2589         return 0;
2590 }
2591
2592 static int identity_mapping(struct device *dev)
2593 {
2594         struct device_domain_info *info;
2595
2596         if (likely(!iommu_identity_mapping))
2597                 return 0;
2598
2599         info = dev->archdata.iommu;
2600         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2601                 return (info->domain == si_domain);
2602
2603         return 0;
2604 }
2605
2606 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2607 {
2608         struct dmar_domain *ndomain;
2609         struct intel_iommu *iommu;
2610         u8 bus, devfn;
2611         int ret;
2612
2613         iommu = device_to_iommu(dev, &bus, &devfn);
2614         if (!iommu)
2615                 return -ENODEV;
2616
2617         ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2618         if (ndomain != domain)
2619                 return -EBUSY;
2620
2621         ret = domain_context_mapping(domain, dev);
2622         if (ret) {
2623                 domain_remove_one_dev_info(domain, dev);
2624                 return ret;
2625         }
2626
2627         return 0;
2628 }
2629
2630 static bool device_has_rmrr(struct device *dev)
2631 {
2632         struct dmar_rmrr_unit *rmrr;
2633         struct device *tmp;
2634         int i;
2635
2636         rcu_read_lock();
2637         for_each_rmrr_units(rmrr) {
2638                 /*
2639                  * Return TRUE if this RMRR contains the device that
2640                  * is passed in.
2641                  */
2642                 for_each_active_dev_scope(rmrr->devices,
2643                                           rmrr->devices_cnt, i, tmp)
2644                         if (tmp == dev) {
2645                                 rcu_read_unlock();
2646                                 return true;
2647                         }
2648         }
2649         rcu_read_unlock();
2650         return false;
2651 }
2652
2653 /*
2654  * There are a couple cases where we need to restrict the functionality of
2655  * devices associated with RMRRs.  The first is when evaluating a device for
2656  * identity mapping because problems exist when devices are moved in and out
2657  * of domains and their respective RMRR information is lost.  This means that
2658  * a device with associated RMRRs will never be in a "passthrough" domain.
2659  * The second is use of the device through the IOMMU API.  This interface
2660  * expects to have full control of the IOVA space for the device.  We cannot
2661  * satisfy both the requirement that RMRR access is maintained and have an
2662  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2663  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2664  * We therefore prevent devices associated with an RMRR from participating in
2665  * the IOMMU API, which eliminates them from device assignment.
2666  *
2667  * In both cases we assume that PCI USB devices with RMRRs have them largely
2668  * for historical reasons and that the RMRR space is not actively used post
2669  * boot.  This exclusion may change if vendors begin to abuse it.
2670  *
2671  * The same exception is made for graphics devices, with the requirement that
2672  * any use of the RMRR regions will be torn down before assigning the device
2673  * to a guest.
2674  */
2675 static bool device_is_rmrr_locked(struct device *dev)
2676 {
2677         if (!device_has_rmrr(dev))
2678                 return false;
2679
2680         if (dev_is_pci(dev)) {
2681                 struct pci_dev *pdev = to_pci_dev(dev);
2682
2683                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2684                         return false;
2685         }
2686
2687         return true;
2688 }
2689
2690 static int iommu_should_identity_map(struct device *dev, int startup)
2691 {
2692
2693         if (dev_is_pci(dev)) {
2694                 struct pci_dev *pdev = to_pci_dev(dev);
2695
2696                 if (device_is_rmrr_locked(dev))
2697                         return 0;
2698
2699                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2700                         return 1;
2701
2702                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2703                         return 1;
2704
2705                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2706                         return 0;
2707
2708                 /*
2709                  * We want to start off with all devices in the 1:1 domain, and
2710                  * take them out later if we find they can't access all of memory.
2711                  *
2712                  * However, we can't do this for PCI devices behind bridges,
2713                  * because all PCI devices behind the same bridge will end up
2714                  * with the same source-id on their transactions.
2715                  *
2716                  * Practically speaking, we can't change things around for these
2717                  * devices at run-time, because we can't be sure there'll be no
2718                  * DMA transactions in flight for any of their siblings.
2719                  *
2720                  * So PCI devices (unless they're on the root bus) as well as
2721                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2722                  * the 1:1 domain, just in _case_ one of their siblings turns out
2723                  * not to be able to map all of memory.
2724                  */
2725                 if (!pci_is_pcie(pdev)) {
2726                         if (!pci_is_root_bus(pdev->bus))
2727                                 return 0;
2728                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2729                                 return 0;
2730                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2731                         return 0;
2732         } else {
2733                 if (device_has_rmrr(dev))
2734                         return 0;
2735         }
2736
2737         /*
2738          * At boot time, we don't yet know if devices will be 64-bit capable.
2739          * Assume that they will -- if they turn out not to be, then we can
2740          * take them out of the 1:1 domain later.
2741          */
2742         if (!startup) {
2743                 /*
2744                  * If the device's dma_mask is less than the system's memory
2745                  * size then this is not a candidate for identity mapping.
2746                  */
2747                 u64 dma_mask = *dev->dma_mask;
2748
2749                 if (dev->coherent_dma_mask &&
2750                     dev->coherent_dma_mask < dma_mask)
2751                         dma_mask = dev->coherent_dma_mask;
2752
2753                 return dma_mask >= dma_get_required_mask(dev);
2754         }
2755
2756         return 1;
2757 }
2758
2759 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2760 {
2761         int ret;
2762
2763         if (!iommu_should_identity_map(dev, 1))
2764                 return 0;
2765
2766         ret = domain_add_dev_info(si_domain, dev);
2767         if (!ret)
2768                 pr_info("%s identity mapping for device %s\n",
2769                         hw ? "Hardware" : "Software", dev_name(dev));
2770         else if (ret == -ENODEV)
2771                 /* device not associated with an iommu */
2772                 ret = 0;
2773
2774         return ret;
2775 }
2776
2777
2778 static int __init iommu_prepare_static_identity_mapping(int hw)
2779 {
2780         struct pci_dev *pdev = NULL;
2781         struct dmar_drhd_unit *drhd;
2782         struct intel_iommu *iommu;
2783         struct device *dev;
2784         int i;
2785         int ret = 0;
2786
2787         for_each_pci_dev(pdev) {
2788                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2789                 if (ret)
2790                         return ret;
2791         }
2792
2793         for_each_active_iommu(iommu, drhd)
2794                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2795                         struct acpi_device_physical_node *pn;
2796                         struct acpi_device *adev;
2797
2798                         if (dev->bus != &acpi_bus_type)
2799                                 continue;
2800
2801                         adev = to_acpi_device(dev);
2802                         mutex_lock(&adev->physical_node_lock);
2803                         list_for_each_entry(pn, &adev->physical_node_list, node) {
2804                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2805                                 if (ret)
2806                                         break;
2807                         }
2808                         mutex_unlock(&adev->physical_node_lock);
2809                         if (ret)
2810                                 return ret;
2811                 }
2812
2813         return 0;
2814 }
2815
2816 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2817 {
2818         /*
2819          * Start from a sane iommu hardware state.
2820          * If queued invalidation was already initialized by us
2821          * (for example, while enabling interrupt remapping) then
2822          * things are already rolling from a sane state.
2823          */
2824         if (!iommu->qi) {
2825                 /*
2826                  * Clear any previous faults.
2827                  */
2828                 dmar_fault(-1, iommu);
2829                 /*
2830                  * Disable queued invalidation if supported and already enabled
2831                  * before OS handover.
2832                  */
2833                 dmar_disable_qi(iommu);
2834         }
2835
2836         if (dmar_enable_qi(iommu)) {
2837                 /*
2838                  * Queued Invalidate not enabled, use Register Based Invalidate
2839                  */
2840                 iommu->flush.flush_context = __iommu_flush_context;
2841                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2842                 pr_info("%s: Using Register based invalidation\n",
2843                         iommu->name);
2844         } else {
2845                 iommu->flush.flush_context = qi_flush_context;
2846                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2847                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2848         }
2849 }
2850
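/*
 * Kdump support: copy one bus worth of context entries from the old
 * kernel's tables.  Domain ids found in present entries are reserved in
 * iommu->domain_ids, and the entries are flagged as copied (see the comment
 * below) before being written into the newly allocated context table.
 */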
2851 static int copy_context_table(struct intel_iommu *iommu,
2852                               struct root_entry *old_re,
2853                               struct context_entry **tbl,
2854                               int bus, bool ext)
2855 {
2856         struct context_entry *old_ce = NULL, *new_ce = NULL, ce;
2857         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2858         phys_addr_t old_ce_phys;
2859
2860         tbl_idx = ext ? bus * 2 : bus;
2861
2862         for (devfn = 0; devfn < 256; devfn++) {
2863                 /* First calculate the correct index */
2864                 idx = (ext ? devfn * 2 : devfn) % 256;
2865
2866                 if (idx == 0) {
2867                         /* First save what we may have and clean up */
2868                         if (new_ce) {
2869                                 tbl[tbl_idx] = new_ce;
2870                                 __iommu_flush_cache(iommu, new_ce,
2871                                                     VTD_PAGE_SIZE);
2872                                 pos = 1;
2873                         }
2874
2875                         if (old_ce)
2876                                 iounmap(old_ce);
2877
2878                         ret = 0;
2879                         if (devfn < 0x80)
2880                                 old_ce_phys = root_entry_lctp(old_re);
2881                         else
2882                                 old_ce_phys = root_entry_uctp(old_re);
2883
2884                         if (!old_ce_phys) {
2885                                 if (ext && devfn == 0) {
2886                                         /* No LCTP, try UCTP */
2887                                         devfn = 0x7f;
2888                                         continue;
2889                                 } else {
2890                                         goto out;
2891                                 }
2892                         }
2893
2894                         ret = -ENOMEM;
2895                         old_ce = ioremap_cache(old_ce_phys, PAGE_SIZE);
2896                         if (!old_ce)
2897                                 goto out;
2898
2899                         new_ce = alloc_pgtable_page(iommu->node);
2900                         if (!new_ce)
2901                                 goto out_unmap;
2902
2903                         ret = 0;
2904                 }
2905
2906                 /* Now copy the context entry */
2907                 ce = old_ce[idx];
2908
2909                 if (!__context_present(&ce))
2910                         continue;
2911
2912                 did = context_domain_id(&ce);
2913                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2914                         set_bit(did, iommu->domain_ids);
2915
2916                 /*
2917                  * We need a marker for copied context entries. This
2918                  * marker needs to work for the old format as well as
2919                  * for extended context entries.
2920                  *
2921                  * Bit 67 of the context entry is used. In the old
2922                  * format this bit is available to software, in the
2923                  * extended format it is the PGE bit, but PGE is ignored
2924                  * by HW if PASIDs are disabled (and thus still
2925                  * available).
2926                  *
2927                  * So disable PASIDs first and then mark the entry
2928                  * copied. This means that we don't copy PASID
2929                  * translations from the old kernel, but this is fine as
2930                  * faults there are not fatal.
2931                  */
2932                 context_clear_pasid_enable(&ce);
2933                 context_set_copied(&ce);
2934
2935                 new_ce[idx] = ce;
2936         }
2937
2938         tbl[tbl_idx + pos] = new_ce;
2939
2940         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2941
2942 out_unmap:
2943         iounmap(old_ce);
2944
2945 out:
2946         return ret;
2947 }
2948
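/*
 * Kdump support: map the old kernel's root table, copy every context table
 * it references into newly allocated pages, and point the new root entries
 * at the copies.  Bails out if the old and new kernels disagree on extended
 * context (RTT/ECS) support, since the RTT bit cannot be changed while
 * translation is enabled.
 */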
2949 static int copy_translation_tables(struct intel_iommu *iommu)
2950 {
2951         struct context_entry **ctxt_tbls;
2952         struct root_entry *old_rt;
2953         phys_addr_t old_rt_phys;
2954         int ctxt_table_entries;
2955         unsigned long flags;
2956         u64 rtaddr_reg;
2957         int bus, ret;
2958         bool new_ext, ext;
2959
2960         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2961         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2962         new_ext    = !!ecap_ecs(iommu->ecap);
2963
2964         /*
2965          * The RTT bit can only be changed when translation is disabled,
2966          * but disabling translation means to open a window for data
2967          * corruption. So bail out and don't copy anything if we would
2968          * have to change the bit.
2969          */
2970         if (new_ext != ext)
2971                 return -EINVAL;
2972
2973         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2974         if (!old_rt_phys)
2975                 return -EINVAL;
2976
2977         old_rt = ioremap_cache(old_rt_phys, PAGE_SIZE);
2978         if (!old_rt)
2979                 return -ENOMEM;
2980
2981         /* This is too big for the stack - allocate it from slab */
2982         ctxt_table_entries = ext ? 512 : 256;
2983         ret = -ENOMEM;
2984         ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
2985         if (!ctxt_tbls)
2986                 goto out_unmap;
2987
2988         for (bus = 0; bus < 256; bus++) {
2989                 ret = copy_context_table(iommu, &old_rt[bus],
2990                                          ctxt_tbls, bus, ext);
2991                 if (ret) {
2992                         pr_err("%s: Failed to copy context table for bus %d\n",
2993                                 iommu->name, bus);
2994                         continue;
2995                 }
2996         }
2997
2998         spin_lock_irqsave(&iommu->lock, flags);
2999
3000         /* Context tables are copied, now write them to the root_entry table */
3001         for (bus = 0; bus < 256; bus++) {
3002                 int idx = ext ? bus * 2 : bus;
3003                 u64 val;
3004
3005                 if (ctxt_tbls[idx]) {
3006                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3007                         iommu->root_entry[bus].lo = val;
3008                 }
3009
3010                 if (!ext || !ctxt_tbls[idx + 1])
3011                         continue;
3012
3013                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3014                 iommu->root_entry[bus].hi = val;
3015         }
3016
3017         spin_unlock_irqrestore(&iommu->lock, flags);
3018
3019         kfree(ctxt_tbls);
3020
3021         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3022
3023         ret = 0;
3024
3025 out_unmap:
3026         iounmap(old_rt);
3027
3028         return ret;
3029 }
3030
3031 static int __init init_dmars(void)
3032 {
3033         struct dmar_drhd_unit *drhd;
3034         struct dmar_rmrr_unit *rmrr;
3035         bool copied_tables = false;
3036         struct device *dev;
3037         struct intel_iommu *iommu;
3038         int i, ret;
3039
3040         /*
3041          * for each drhd
3042          *    allocate root
3043          *    initialize and program root entry to not present
3044          * endfor
3045          */
3046         for_each_drhd_unit(drhd) {
3047                 /*
3048                  * lock not needed as this is only incremented in the
3049                  * single-threaded kernel __init code path; all other
3050                  * accesses are read-only
3051                  */
3052                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3053                         g_num_of_iommus++;
3054                         continue;
3055                 }
3056                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3057         }
3058
3059         /* Preallocate enough resources for IOMMU hot-addition */
3060         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3061                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3062
3063         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3064                         GFP_KERNEL);
3065         if (!g_iommus) {
3066                 pr_err("Allocating global iommu array failed\n");
3067                 ret = -ENOMEM;
3068                 goto error;
3069         }
3070
3071         deferred_flush = kzalloc(g_num_of_iommus *
3072                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
3073         if (!deferred_flush) {
3074                 ret = -ENOMEM;
3075                 goto free_g_iommus;
3076         }
3077
3078         for_each_active_iommu(iommu, drhd) {
3079                 g_iommus[iommu->seq_id] = iommu;
3080
3081                 intel_iommu_init_qi(iommu);
3082
3083                 ret = iommu_init_domains(iommu);
3084                 if (ret)
3085                         goto free_iommu;
3086
3087                 init_translation_status(iommu);
3088
3089                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3090                         iommu_disable_translation(iommu);
3091                         clear_translation_pre_enabled(iommu);
3092                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3093                                 iommu->name);
3094                 }
3095
3096                 /*
3097                  * TBD:
3098                  * we could share the same root & context tables
3099                  * among all IOMMUs; needs to be split later.
3100                  */
3101                 ret = iommu_alloc_root_entry(iommu);
3102                 if (ret)
3103                         goto free_iommu;
3104
3105                 if (translation_pre_enabled(iommu)) {
3106                         pr_info("Translation already enabled - trying to copy translation structures\n");
3107
3108                         ret = copy_translation_tables(iommu);
3109                         if (ret) {
3110                                 /*
3111                                  * We found the IOMMU with translation
3112                                  * enabled - but failed to copy over the
3113                                  * old root-entry table. Try to proceed
3114                                  * by disabling translation now and
3115                                  * allocating a clean root-entry table.
3116                                  * This might cause DMAR faults, but
3117                                  * probably the dump will still succeed.
3118                                  */
3119                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3120                                        iommu->name);
3121                                 iommu_disable_translation(iommu);
3122                                 clear_translation_pre_enabled(iommu);
3123                         } else {
3124                                 pr_info("Copied translation tables from previous kernel for %s\n",
3125                                         iommu->name);
3126                                 copied_tables = true;
3127                         }
3128                 }
3129
3130                 iommu_flush_write_buffer(iommu);
3131                 iommu_set_root_entry(iommu);
3132                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3133                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3134
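                     /* hardware pass-through is only usable if every IOMMU supports it */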
3135                 if (!ecap_pass_through(iommu->ecap))
3136                         hw_pass_through = 0;
3137         }
3138
3139         if (iommu_pass_through)
3140                 iommu_identity_mapping |= IDENTMAP_ALL;
3141
3142 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3143         iommu_identity_mapping |= IDENTMAP_GFX;
3144 #endif
3145
3146         if (iommu_identity_mapping) {
3147                 ret = si_domain_init(hw_pass_through);
3148                 if (ret)
3149                         goto free_iommu;
3150         }
3151
3152         check_tylersburg_isoch();
3153
3154         /*
3155          * If we copied translations from a previous kernel in the kdump
3156          * case, we cannot assign the devices to domains now, as that
3157          * would eliminate the old mappings. So skip this part and defer
3158          * the assignment to device driver initialization time.
3159          */
3160         if (copied_tables)
3161                 goto domains_done;
3162
3163         /*
3164          * If pass-through is not set or not enabled, set up context entries for
3165          * identity mappings for RMRR, GFX and ISA devices, possibly falling back
3166          * to the static identity mapping if iommu_identity_mapping is set.
3167          */
3168         if (iommu_identity_mapping) {
3169                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3170                 if (ret) {
3171                         pr_crit("Failed to setup IOMMU pass-through\n");
3172                         goto free_iommu;
3173                 }
3174         }
3175         /*
3176          * For each rmrr
3177          *   for each dev attached to rmrr
3178          *   do
3179          *     locate drhd for dev, alloc domain for dev
3180          *     allocate free domain
3181          *     allocate page table entries for rmrr
3182          *     if context not allocated for bus
3183          *           allocate and init context
3184          *           set present in root table for this bus
3185          *     init context with domain, translation etc
3186          *    endfor
3187          * endfor
3188          */
3189         pr_info("Setting RMRR:\n");
3190         for_each_rmrr_units(rmrr) {
3191                 /* some BIOSes list non-existent devices in the DMAR table. */
3192                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3193                                           i, dev) {
3194                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3195                         if (ret)
3196                                 pr_err("Mapping reserved region failed\n");
3197                 }
3198         }
3199
3200         iommu_prepare_isa();
3201
3202 domains_done:
3203
3204         /*
3205          * for each drhd
3206          *   enable fault log
3207          *   global invalidate context cache
3208          *   global invalidate iotlb
3209          *   enable translation
3210          */
3211         for_each_iommu(iommu, drhd) {
3212                 if (drhd->ignored) {
3213                         /*
3214                          * we always have to disable PMRs or DMA may fail on
3215                          * this device
3216                          */
3217                         if (force_on)
3218                                 iommu_disable_protect_mem_regions(iommu);
3219                         continue;
3220                 }
3221
3222                 iommu_flush_write_buffer(iommu);
3223
3224                 ret = dmar_set_interrupt(iommu);
3225                 if (ret)
3226                         goto free_iommu;
3227
3228                 if (!translation_pre_enabled(iommu))
3229                         iommu_enable_translation(iommu);
3230
3231                 iommu_disable_protect_mem_regions(iommu);
3232         }
3233
3234         return 0;
3235
3236 free_iommu:
3237         for_each_active_iommu(iommu, drhd) {
3238                 disable_dmar_iommu(iommu);
3239                 free_dmar_iommu(iommu);
3240         }
3241         kfree(deferred_flush);
3242 free_g_iommus:
3243         kfree(g_iommus);
3244 error:
3245         return ret;
3246 }
3247
3248 /* This takes a number of _MM_ pages, not VTD pages */
3249 static struct iova *intel_alloc_iova(struct device *dev,
3250                                      struct dmar_domain *domain,
3251                                      unsigned long nrpages, uint64_t dma_mask)
3252 {
3253         struct iova *iova = NULL;
3254
3255         /* Restrict dma_mask to the width that the iommu can handle */
3256         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3257
3258         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3259                 /*
3260                  * First try to allocate an io virtual address in
3261                  * DMA_BIT_MASK(32) and if that fails then try allocating
3262                  * from higher range
3263                  */
3264                 iova = alloc_iova(&domain->iovad, nrpages,
3265                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
3266                 if (iova)
3267                         return iova;
3268         }
3269         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
3270         if (unlikely(!iova)) {
3271                 pr_err("Allocating %ld-page iova for %s failed\n",
3272                        nrpages, dev_name(dev));
3273                 return NULL;
3274         }
3275
3276         return iova;
3277 }
3278
3279 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
3280 {
3281         struct dmar_domain *domain;
3282         int ret;
3283
3284         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3285         if (!domain) {
3286                 pr_err("Allocating domain for %s failed\n",
3287                        dev_name(dev));
3288                 return NULL;
3289         }
3290
3291         /* make sure context mapping is ok */
3292         if (unlikely(!domain_context_mapped(dev))) {
3293                 ret = domain_context_mapping(domain, dev);
3294                 if (ret) {
3295                         pr_err("Domain context map for %s failed\n",
3296                                dev_name(dev));
3297                         return NULL;
3298                 }
3299         }
3300
3301         return domain;
3302 }
3303
3304 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3305 {
3306         struct device_domain_info *info;
3307
3308         /* No lock here, assumes no domain exit in normal case */
3309         info = dev->archdata.iommu;
3310         if (likely(info))
3311                 return info->domain;
3312
3313         return __get_valid_domain_for_dev(dev);
3314 }
3315
3316 /* Check if the dev needs to go through the non-identity map and unmap process. */
3317 static int iommu_no_mapping(struct device *dev)
3318 {
3319         int found;
3320
3321         if (iommu_dummy(dev))
3322                 return 1;
3323
3324         if (!iommu_identity_mapping)
3325                 return 0;
3326
3327         found = identity_mapping(dev);
3328         if (found) {
3329                 if (iommu_should_identity_map(dev, 0))
3330                         return 1;
3331                 else {
3332                         /*
3333                          * The 32-bit DMA device is removed from si_domain and
3334                          * falls back to non-identity mapping.
3335                          */
3336                         domain_remove_one_dev_info(si_domain, dev);
3337                         pr_info("32bit %s uses non-identity mapping\n",
3338                                 dev_name(dev));
3339                         return 0;
3340                 }
3341         } else {
3342                 /*
3343                  * In case a 64-bit DMA device was detached from a VM, the device
3344                  * is put back into si_domain for identity mapping.
3345                  */
3346                 if (iommu_should_identity_map(dev, 0)) {
3347                         int ret;
3348                         ret = domain_add_dev_info(si_domain, dev);
3349                         if (!ret) {
3350                                 pr_info("64bit %s uses identity mapping\n",
3351                                         dev_name(dev));
3352                                 return 1;
3353                         }
3354                 }
3355         }
3356
3357         return 0;
3358 }
3359
3360 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3361                                      size_t size, int dir, u64 dma_mask)
3362 {
3363         struct dmar_domain *domain;
3364         phys_addr_t start_paddr;
3365         struct iova *iova;
3366         int prot = 0;
3367         int ret;
3368         struct intel_iommu *iommu;
3369         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3370
3371         BUG_ON(dir == DMA_NONE);
3372
3373         if (iommu_no_mapping(dev))
3374                 return paddr;
3375
3376         domain = get_valid_domain_for_dev(dev);
3377         if (!domain)
3378                 return 0;
3379
3380         iommu = domain_get_iommu(domain);
3381         size = aligned_nrpages(paddr, size);
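             /* from here on, size is a count of VT-d pages, not bytes */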
3382
3383         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3384         if (!iova)
3385                 goto error;
3386
3387         /*
3388          * Check if DMAR supports zero-length reads on write-only
3389          * mappings.
3390          */
3391         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3392                         !cap_zlr(iommu->cap))
3393                 prot |= DMA_PTE_READ;
3394         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3395                 prot |= DMA_PTE_WRITE;
3396         /*
3397          * paddr to (paddr + size) might cover only part of a page, so we map
3398          * the whole page.  Note: if two parts of one page are mapped
3399          * separately, we might have two guest addresses mapping to the same
3400          * host paddr, but this is not a big problem.
3401          */
3402         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3403                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3404         if (ret)
3405                 goto error;
3406
3407         /* it's a non-present to present mapping. Only flush if caching mode */
3408         if (cap_caching_mode(iommu->cap))
3409                 iommu_flush_iotlb_psi(iommu, domain,
3410                                       mm_to_dma_pfn(iova->pfn_lo),
3411                                       size, 0, 1);
3412         else
3413                 iommu_flush_write_buffer(iommu);
3414
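             /* the returned bus address is the IOVA page base plus the offset within the first page */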
3415         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3416         start_paddr += paddr & ~PAGE_MASK;
3417         return start_paddr;
3418
3419 error:
3420         if (iova)
3421                 __free_iova(&domain->iovad, iova);
3422         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3423                 dev_name(dev), size, (unsigned long long)paddr, dir);
3424         return 0;
3425 }
3426
3427 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3428                                  unsigned long offset, size_t size,
3429                                  enum dma_data_direction dir,
3430                                  struct dma_attrs *attrs)
3431 {
3432         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3433                                   dir, *dev->dma_mask);
3434 }
3435
3436 static void flush_unmaps(void)
3437 {
3438         int i, j;
3439
3440         timer_on = 0;
3441
3442         /* just flush them all */
3443         for (i = 0; i < g_num_of_iommus; i++) {
3444                 struct intel_iommu *iommu = g_iommus[i];
3445                 if (!iommu)
3446                         continue;
3447
3448                 if (!deferred_flush[i].next)
3449                         continue;
3450
3451                 /* In caching mode, global flushes make emulation expensive */
3452                 if (!cap_caching_mode(iommu->cap))
3453                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3454                                          DMA_TLB_GLOBAL_FLUSH);
3455                 for (j = 0; j < deferred_flush[i].next; j++) {
3456                         unsigned long mask;
3457                         struct iova *iova = deferred_flush[i].iova[j];
3458                         struct dmar_domain *domain = deferred_flush[i].domain[j];
3459
3460                         /* On real hardware multiple invalidations are expensive */
3461                         if (cap_caching_mode(iommu->cap))
3462                                 iommu_flush_iotlb_psi(iommu, domain,
3463                                         iova->pfn_lo, iova_size(iova),
3464                                         !deferred_flush[i].freelist[j], 0);
3465                         else {
3466                                 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3467                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3468                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3469                         }
3470                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3471                         if (deferred_flush[i].freelist[j])
3472                                 dma_free_pagelist(deferred_flush[i].freelist[j]);
3473                 }
3474                 deferred_flush[i].next = 0;
3475         }
3476
3477         list_size = 0;
3478 }
3479
3480 static void flush_unmaps_timeout(unsigned long data)
3481 {
3482         unsigned long flags;
3483
3484         spin_lock_irqsave(&async_umap_flush_lock, flags);
3485         flush_unmaps();
3486         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3487 }
3488
3489 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3490 {
3491         unsigned long flags;
3492         int next, iommu_id;
3493         struct intel_iommu *iommu;
3494
3495         spin_lock_irqsave(&async_umap_flush_lock, flags);
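             /* if the deferred list is already full, flush it synchronously before queueing more */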
3496         if (list_size == HIGH_WATER_MARK)
3497                 flush_unmaps();
3498
3499         iommu = domain_get_iommu(dom);
3500         iommu_id = iommu->seq_id;
3501
3502         next = deferred_flush[iommu_id].next;
3503         deferred_flush[iommu_id].domain[next] = dom;
3504         deferred_flush[iommu_id].iova[next] = iova;
3505         deferred_flush[iommu_id].freelist[next] = freelist;
3506         deferred_flush[iommu_id].next++;
3507
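             /* arm a one-shot timer so queued unmaps are flushed within 10ms even if no further unmaps arrive */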
3508         if (!timer_on) {
3509                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3510                 timer_on = 1;
3511         }
3512         list_size++;
3513         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3514 }
3515
3516 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3517 {
3518         struct dmar_domain *domain;
3519         unsigned long start_pfn, last_pfn;
3520         struct iova *iova;
3521         struct intel_iommu *iommu;
3522         struct page *freelist;
3523
3524         if (iommu_no_mapping(dev))
3525                 return;
3526
3527         domain = find_domain(dev);
3528         BUG_ON(!domain);
3529
3530         iommu = domain_get_iommu(domain);
3531
3532         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3533         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3534                       (unsigned long long)dev_addr))
3535                 return;
3536
3537         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3538         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3539
3540         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3541                  dev_name(dev), start_pfn, last_pfn);
3542
3543         freelist = domain_unmap(domain, start_pfn, last_pfn);
3544
3545         if (intel_iommu_strict) {
3546                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3547                                       last_pfn - start_pfn + 1, !freelist, 0);
3548                 /* free iova */
3549                 __free_iova(&domain->iovad, iova);
3550                 dma_free_pagelist(freelist);
3551         } else {
3552                 add_unmap(domain, iova, freelist);
3553                 /*
3554                  * queue up the release of the unmap to save the 1/6th of the
3555                  * CPU time used up by the iotlb flush operation...
3556                  */
3557         }
3558 }
3559
3560 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3561                              size_t size, enum dma_data_direction dir,
3562                              struct dma_attrs *attrs)
3563 {
3564         intel_unmap(dev, dev_addr);
3565 }
3566
3567 static void *intel_alloc_coherent(struct device *dev, size_t size,
3568                                   dma_addr_t *dma_handle, gfp_t flags,
3569                                   struct dma_attrs *attrs)
3570 {
3571         struct page *page = NULL;
3572         int order;
3573
3574         size = PAGE_ALIGN(size);
3575         order = get_order(size);
3576
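             /* if the IOMMU translates for this device, any physical page will do; otherwise honor the coherent DMA mask via the GFP zone flags */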
3577         if (!iommu_no_mapping(dev))
3578                 flags &= ~(GFP_DMA | GFP_DMA32);
3579         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3580                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3581                         flags |= GFP_DMA;
3582                 else
3583                         flags |= GFP_DMA32;
3584         }
3585
3586         if (flags & __GFP_WAIT) {
3587                 unsigned int count = size >> PAGE_SHIFT;
3588
3589                 page = dma_alloc_from_contiguous(dev, count, order);
3590                 if (page && iommu_no_mapping(dev) &&
3591                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3592                         dma_release_from_contiguous(dev, page, count);
3593                         page = NULL;
3594                 }
3595         }
3596
3597         if (!page)
3598                 page = alloc_pages(flags, order);
3599         if (!page)
3600                 return NULL;
3601         memset(page_address(page), 0, size);
3602
3603         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3604                                          DMA_BIDIRECTIONAL,
3605                                          dev->coherent_dma_mask);
3606         if (*dma_handle)
3607                 return page_address(page);
3608         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3609                 __free_pages(page, order);
3610
3611         return NULL;
3612 }
3613
3614 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3615                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3616 {
3617         int order;
3618         struct page *page = virt_to_page(vaddr);
3619
3620         size = PAGE_ALIGN(size);
3621         order = get_order(size);
3622
3623         intel_unmap(dev, dma_handle);
3624         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3625                 __free_pages(page, order);
3626 }
3627
3628 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3629                            int nelems, enum dma_data_direction dir,
3630                            struct dma_attrs *attrs)
3631 {
3632         intel_unmap(dev, sglist[0].dma_address);
3633 }
3634
3635 static int intel_nontranslate_map_sg(struct device *hddev,
3636         struct scatterlist *sglist, int nelems, int dir)
3637 {
3638         int i;
3639         struct scatterlist *sg;
3640
3641         for_each_sg(sglist, sg, nelems, i) {
3642                 BUG_ON(!sg_page(sg));
3643                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3644                 sg->dma_length = sg->length;
3645         }
3646         return nelems;
3647 }
3648
3649 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3650                         enum dma_data_direction dir, struct dma_attrs *attrs)
3651 {
3652         int i;
3653         struct dmar_domain *domain;
3654         size_t size = 0;
3655         int prot = 0;
3656         struct iova *iova = NULL;
3657         int ret;
3658         struct scatterlist *sg;
3659         unsigned long start_vpfn;
3660         struct intel_iommu *iommu;
3661
3662         BUG_ON(dir == DMA_NONE);
3663         if (iommu_no_mapping(dev))
3664                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3665
3666         domain = get_valid_domain_for_dev(dev);
3667         if (!domain)
3668                 return 0;
3669
3670         iommu = domain_get_iommu(domain);
3671
3672         for_each_sg(sglist, sg, nelems, i)
3673                 size += aligned_nrpages(sg->offset, sg->length);
3674
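             /* size is the total number of VT-d pages needed; a single contiguous IOVA range covers the whole scatterlist */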
3675         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3676                                 *dev->dma_mask);
3677         if (!iova) {
3678                 sglist->dma_length = 0;
3679                 return 0;
3680         }
3681
3682         /*
3683          * Check if DMAR supports zero-length reads on write-only
3684          * mappings.
3685          */
3686         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3687                         !cap_zlr(iommu->cap))
3688                 prot |= DMA_PTE_READ;
3689         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3690                 prot |= DMA_PTE_WRITE;
3691
3692         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3693
3694         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3695         if (unlikely(ret)) {
3696                 dma_pte_free_pagetable(domain, start_vpfn,
3697                                        start_vpfn + size - 1);
3698                 __free_iova(&domain->iovad, iova);
3699                 return 0;
3700         }
3701
3702         /* it's a non-present to present mapping. Only flush if caching mode */
3703         if (cap_caching_mode(iommu->cap))
3704                 iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3705         else
3706                 iommu_flush_write_buffer(iommu);
3707
3708         return nelems;
3709 }
3710
3711 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3712 {
3713         return !dma_addr;
3714 }
3715
3716 struct dma_map_ops intel_dma_ops = {
3717         .alloc = intel_alloc_coherent,
3718         .free = intel_free_coherent,
3719         .map_sg = intel_map_sg,
3720         .unmap_sg = intel_unmap_sg,
3721         .map_page = intel_map_page,
3722         .unmap_page = intel_unmap_page,
3723         .mapping_error = intel_mapping_error,
3724 };
3725
3726 static inline int iommu_domain_cache_init(void)
3727 {
3728         int ret = 0;
3729
3730         iommu_domain_cache = kmem_cache_create("iommu_domain",
3731                                          sizeof(struct dmar_domain),
3732                                          0,
3733                                          SLAB_HWCACHE_ALIGN,
3734                                          NULL);
3735
3736         if (!iommu_domain_cache) {
3737                 pr_err("Couldn't create iommu_domain cache\n");
3738                 ret = -ENOMEM;
3739         }
3740
3741         return ret;
3742 }
3743
3744 static inline int iommu_devinfo_cache_init(void)
3745 {
3746         int ret = 0;
3747
3748         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3749                                          sizeof(struct device_domain_info),
3750                                          0,
3751                                          SLAB_HWCACHE_ALIGN,
3752                                          NULL);
3753         if (!iommu_devinfo_cache) {
3754                 pr_err("Couldn't create devinfo cache\n");
3755                 ret = -ENOMEM;
3756         }
3757
3758         return ret;
3759 }
3760
3761 static int __init iommu_init_mempool(void)
3762 {
3763         int ret;
3764         ret = iommu_iova_cache_init();
3765         if (ret)
3766                 return ret;
3767
3768         ret = iommu_domain_cache_init();
3769         if (ret)
3770                 goto domain_error;
3771
3772         ret = iommu_devinfo_cache_init();
3773         if (!ret)
3774                 return ret;
3775
3776         kmem_cache_destroy(iommu_domain_cache);
3777 domain_error:
3778         iommu_iova_cache_destroy();
3779
3780         return -ENOMEM;
3781 }
3782
3783 static void __init iommu_exit_mempool(void)
3784 {
3785         kmem_cache_destroy(iommu_devinfo_cache);
3786         kmem_cache_destroy(iommu_domain_cache);
3787         iommu_iova_cache_destroy();
3788 }
3789
3790 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3791 {
3792         struct dmar_drhd_unit *drhd;
3793         u32 vtbar;
3794         int rc;
3795
3796         /* We know that this device on this chipset has its own IOMMU.
3797          * If we find it under a different IOMMU, then the BIOS is lying
3798          * to us. Hope that the IOMMU for this device is actually
3799          * disabled, and it needs no translation...
3800          */
3801         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3802         if (rc) {
3803                 /* "can't" happen */
3804                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3805                 return;
3806         }
3807         vtbar &= 0xffff0000;
3808
3809         /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3810         drhd = dmar_find_matched_drhd_unit(pdev);
3811         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3812                             TAINT_FIRMWARE_WORKAROUND,
3813                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3814                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3815 }
3816 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3817
3818 static void __init init_no_remapping_devices(void)
3819 {
3820         struct dmar_drhd_unit *drhd;
3821         struct device *dev;
3822         int i;
3823
3824         for_each_drhd_unit(drhd) {
3825                 if (!drhd->include_all) {
3826                         for_each_active_dev_scope(drhd->devices,
3827                                                   drhd->devices_cnt, i, dev)
3828                                 break;
3829                         /* ignore DMAR unit if no devices exist */
3830                         if (i == drhd->devices_cnt)
3831                                 drhd->ignored = 1;
3832                 }
3833         }
3834
3835         for_each_active_drhd_unit(drhd) {
3836                 if (drhd->include_all)
3837                         continue;
3838
3839                 for_each_active_dev_scope(drhd->devices,
3840                                           drhd->devices_cnt, i, dev)
3841                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3842                                 break;
3843                 if (i < drhd->devices_cnt)
3844                         continue;
3845
3846                 /* This IOMMU has *only* gfx devices. Either bypass it or
3847                    set the gfx_mapped flag, as appropriate */
3848                 if (dmar_map_gfx) {
3849                         intel_iommu_gfx_mapped = 1;
3850                 } else {
3851                         drhd->ignored = 1;
3852                         for_each_active_dev_scope(drhd->devices,
3853                                                   drhd->devices_cnt, i, dev)
3854                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3855                 }
3856         }
3857 }
3858
3859 #ifdef CONFIG_SUSPEND
3860 static int init_iommu_hw(void)
3861 {
3862         struct dmar_drhd_unit *drhd;
3863         struct intel_iommu *iommu = NULL;
3864
3865         for_each_active_iommu(iommu, drhd)
3866                 if (iommu->qi)
3867                         dmar_reenable_qi(iommu);
3868
3869         for_each_iommu(iommu, drhd) {
3870                 if (drhd->ignored) {
3871                         /*
3872                          * we always have to disable PMRs or DMA may fail on
3873                          * this device
3874                          */
3875                         if (force_on)
3876                                 iommu_disable_protect_mem_regions(iommu);
3877                         continue;
3878                 }
3879
3880                 iommu_flush_write_buffer(iommu);
3881
3882                 iommu_set_root_entry(iommu);
3883
3884                 iommu->flush.flush_context(iommu, 0, 0, 0,
3885                                            DMA_CCMD_GLOBAL_INVL);
3886                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3887                 iommu_enable_translation(iommu);
3888                 iommu_disable_protect_mem_regions(iommu);
3889         }
3890
3891         return 0;
3892 }
3893
3894 static void iommu_flush_all(void)
3895 {
3896         struct dmar_drhd_unit *drhd;
3897         struct intel_iommu *iommu;
3898
3899         for_each_active_iommu(iommu, drhd) {
3900                 iommu->flush.flush_context(iommu, 0, 0, 0,
3901                                            DMA_CCMD_GLOBAL_INVL);
3902                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3903                                          DMA_TLB_GLOBAL_FLUSH);
3904         }
3905 }
3906
3907 static int iommu_suspend(void)
3908 {
3909         struct dmar_drhd_unit *drhd;
3910         struct intel_iommu *iommu = NULL;
3911         unsigned long flag;
3912
3913         for_each_active_iommu(iommu, drhd) {
3914                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3915                                                  GFP_ATOMIC);
3916                 if (!iommu->iommu_state)
3917                         goto nomem;
3918         }
3919
3920         iommu_flush_all();
3921
3922         for_each_active_iommu(iommu, drhd) {
3923                 iommu_disable_translation(iommu);
3924
3925                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3926
3927                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3928                         readl(iommu->reg + DMAR_FECTL_REG);
3929                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3930                         readl(iommu->reg + DMAR_FEDATA_REG);
3931                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3932                         readl(iommu->reg + DMAR_FEADDR_REG);
3933                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3934                         readl(iommu->reg + DMAR_FEUADDR_REG);
3935
3936                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3937         }
3938         return 0;
3939
3940 nomem:
3941         for_each_active_iommu(iommu, drhd)
3942                 kfree(iommu->iommu_state);
3943
3944         return -ENOMEM;
3945 }
3946
3947 static void iommu_resume(void)
3948 {
3949         struct dmar_drhd_unit *drhd;
3950         struct intel_iommu *iommu = NULL;
3951         unsigned long flag;
3952
3953         if (init_iommu_hw()) {
3954                 if (force_on)
3955                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3956                 else
3957                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3958                 return;
3959         }
3960
3961         for_each_active_iommu(iommu, drhd) {
3962
3963                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3964
3965                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3966                         iommu->reg + DMAR_FECTL_REG);
3967                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3968                         iommu->reg + DMAR_FEDATA_REG);
3969                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3970                         iommu->reg + DMAR_FEADDR_REG);
3971                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3972                         iommu->reg + DMAR_FEUADDR_REG);
3973
3974                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3975         }
3976
3977         for_each_active_iommu(iommu, drhd)
3978                 kfree(iommu->iommu_state);
3979 }
3980
3981 static struct syscore_ops iommu_syscore_ops = {
3982         .resume         = iommu_resume,
3983         .suspend        = iommu_suspend,
3984 };
3985
3986 static void __init init_iommu_pm_ops(void)
3987 {
3988         register_syscore_ops(&iommu_syscore_ops);
3989 }
3990
3991 #else
3992 static inline void init_iommu_pm_ops(void) {}
3993 #endif  /* CONFIG_SUSPEND */
3994
3995
3996 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3997 {
3998         struct acpi_dmar_reserved_memory *rmrr;
3999         struct dmar_rmrr_unit *rmrru;
4000
4001         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4002         if (!rmrru)
4003                 return -ENOMEM;
4004
4005         rmrru->hdr = header;
4006         rmrr = (struct acpi_dmar_reserved_memory *)header;
4007         rmrru->base_address = rmrr->base_address;
4008         rmrru->end_address = rmrr->end_address;
4009         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4010                                 ((void *)rmrr) + rmrr->header.length,
4011                                 &rmrru->devices_cnt);
4012         if (rmrru->devices_cnt && rmrru->devices == NULL) {
4013                 kfree(rmrru);
4014                 return -ENOMEM;
4015         }
4016
4017         list_add(&rmrru->list, &dmar_rmrr_units);
4018
4019         return 0;
4020 }
4021
4022 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4023 {
4024         struct dmar_atsr_unit *atsru;
4025         struct acpi_dmar_atsr *tmp;
4026
4027         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4028                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4029                 if (atsr->segment != tmp->segment)
4030                         continue;
4031                 if (atsr->header.length != tmp->header.length)
4032                         continue;
4033                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4034                         return atsru;
4035         }
4036
4037         return NULL;
4038 }
4039
4040 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4041 {
4042         struct acpi_dmar_atsr *atsr;
4043         struct dmar_atsr_unit *atsru;
4044
4045         if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
4046                 return 0;
4047
4048         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4049         atsru = dmar_find_atsr(atsr);
4050         if (atsru)
4051                 return 0;
4052
4053         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4054         if (!atsru)
4055                 return -ENOMEM;
4056
4057         /*
4058          * If memory is allocated from slab by ACPI _DSM method, we need to
4059          * copy the memory content because the memory buffer will be freed
4060          * on return.
4061          */
4062         atsru->hdr = (void *)(atsru + 1);
4063         memcpy(atsru->hdr, hdr, hdr->length);
4064         atsru->include_all = atsr->flags & 0x1;
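             /* flags bit 0 is ALL_PORTS: the ATSR applies to all PCIe root ports in this segment */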
4065         if (!atsru->include_all) {
4066                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4067                                 (void *)atsr + atsr->header.length,
4068                                 &atsru->devices_cnt);
4069                 if (atsru->devices_cnt && atsru->devices == NULL) {
4070                         kfree(atsru);
4071                         return -ENOMEM;
4072                 }
4073         }
4074
4075         list_add_rcu(&atsru->list, &dmar_atsr_units);
4076
4077         return 0;
4078 }
4079
4080 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4081 {
4082         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4083         kfree(atsru);
4084 }
4085
4086 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4087 {
4088         struct acpi_dmar_atsr *atsr;
4089         struct dmar_atsr_unit *atsru;
4090
4091         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4092         atsru = dmar_find_atsr(atsr);
4093         if (atsru) {
4094                 list_del_rcu(&atsru->list);
4095                 synchronize_rcu();
4096                 intel_iommu_free_atsr(atsru);
4097         }
4098
4099         return 0;
4100 }
4101
4102 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4103 {
4104         int i;
4105         struct device *dev;
4106         struct acpi_dmar_atsr *atsr;
4107         struct dmar_atsr_unit *atsru;
4108
4109         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4110         atsru = dmar_find_atsr(atsr);
4111         if (!atsru)
4112                 return 0;
4113
4114         if (!atsru->include_all && atsru->devices && atsru->devices_cnt)
4115                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4116                                           i, dev)
4117                         return -EBUSY;
4118
4119         return 0;
4120 }
4121
4122 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4123 {
4124         int sp, ret = 0;
4125         struct intel_iommu *iommu = dmaru->iommu;
4126
4127         if (g_iommus[iommu->seq_id])
4128                 return 0;
4129
4130         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4131                 pr_warn("%s: Doesn't support hardware pass through.\n",
4132                         iommu->name);
4133                 return -ENXIO;
4134         }
4135         if (!ecap_sc_support(iommu->ecap) &&
4136             domain_update_iommu_snooping(iommu)) {
4137                 pr_warn("%s: Doesn't support snooping.\n",
4138                         iommu->name);
4139                 return -ENXIO;
4140         }
4141         sp = domain_update_iommu_superpage(iommu) - 1;
4142         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4143                 pr_warn("%s: Doesn't support large page.\n",
4144                         iommu->name);
4145                 return -ENXIO;
4146         }
4147
4148         /*
4149          * Disable translation if already enabled prior to OS handover.
4150          */
4151         if (iommu->gcmd & DMA_GCMD_TE)
4152                 iommu_disable_translation(iommu);
4153
4154         g_iommus[iommu->seq_id] = iommu;
4155         ret = iommu_init_domains(iommu);
4156         if (ret == 0)
4157                 ret = iommu_alloc_root_entry(iommu);
4158         if (ret)
4159                 goto out;
4160
4161         if (dmaru->ignored) {
4162                 /*
4163                  * we always have to disable PMRs or DMA may fail on this device
4164                  */
4165                 if (force_on)
4166                         iommu_disable_protect_mem_regions(iommu);
4167                 return 0;
4168         }
4169
4170         intel_iommu_init_qi(iommu);
4171         iommu_flush_write_buffer(iommu);
4172         ret = dmar_set_interrupt(iommu);
4173         if (ret)
4174                 goto disable_iommu;
4175
4176         iommu_set_root_entry(iommu);
4177         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4178         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4179         iommu_enable_translation(iommu);
4180
4181         iommu_disable_protect_mem_regions(iommu);
4182         return 0;
4183
4184 disable_iommu:
4185         disable_dmar_iommu(iommu);
4186 out:
4187         free_dmar_iommu(iommu);
4188         return ret;
4189 }
4190
4191 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4192 {
4193         int ret = 0;
4194         struct intel_iommu *iommu = dmaru->iommu;
4195
4196         if (!intel_iommu_enabled)
4197                 return 0;
4198         if (iommu == NULL)
4199                 return -EINVAL;
4200
4201         if (insert) {
4202                 ret = intel_iommu_add(dmaru);
4203         } else {
4204                 disable_dmar_iommu(iommu);
4205                 free_dmar_iommu(iommu);
4206         }
4207
4208         return ret;
4209 }
4210
4211 static void intel_iommu_free_dmars(void)
4212 {
4213         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4214         struct dmar_atsr_unit *atsru, *atsr_n;
4215
4216         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4217                 list_del(&rmrru->list);
4218                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4219                 kfree(rmrru);
4220         }
4221
4222         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4223                 list_del(&atsru->list);
4224                 intel_iommu_free_atsr(atsru);
4225         }
4226 }
4227
4228 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4229 {
4230         int i, ret = 1;
4231         struct pci_bus *bus;
4232         struct pci_dev *bridge = NULL;
4233         struct device *tmp;
4234         struct acpi_dmar_atsr *atsr;
4235         struct dmar_atsr_unit *atsru;
4236
4237         dev = pci_physfn(dev);
4238         for (bus = dev->bus; bus; bus = bus->parent) {
4239                 bridge = bus->self;
4240                 if (!bridge || !pci_is_pcie(bridge) ||
4241                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4242                         return 0;
4243                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4244                         break;
4245         }
4246         if (!bridge)
4247                 return 0;
4248
4249         rcu_read_lock();
4250         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4251                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4252                 if (atsr->segment != pci_domain_nr(dev->bus))
4253                         continue;
4254
4255                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4256                         if (tmp == &bridge->dev)
4257                                 goto out;
4258
4259                 if (atsru->include_all)
4260                         goto out;
4261         }
4262         ret = 0;
4263 out:
4264         rcu_read_unlock();
4265
4266         return ret;
4267 }
4268
4269 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4270 {
4271         int ret = 0;
4272         struct dmar_rmrr_unit *rmrru;
4273         struct dmar_atsr_unit *atsru;
4274         struct acpi_dmar_atsr *atsr;
4275         struct acpi_dmar_reserved_memory *rmrr;
4276
4277         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
4278                 return 0;
4279
4280         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4281                 rmrr = container_of(rmrru->hdr,
4282                                     struct acpi_dmar_reserved_memory, header);
4283                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4284                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4285                                 ((void *)rmrr) + rmrr->header.length,
4286                                 rmrr->segment, rmrru->devices,
4287                                 rmrru->devices_cnt);
4288                         if (ret < 0)
4289                                 return ret;
4290                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4291                         dmar_remove_dev_scope(info, rmrr->segment,
4292                                 rmrru->devices, rmrru->devices_cnt);
4293                 }
4294         }
4295
4296         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4297                 if (atsru->include_all)
4298                         continue;
4299
4300                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4301                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4302                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4303                                         (void *)atsr + atsr->header.length,
4304                                         atsr->segment, atsru->devices,
4305                                         atsru->devices_cnt);
4306                         if (ret > 0)
4307                                 break;
4308                         else if (ret < 0)
4309                                 return ret;
4310                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4311                         if (dmar_remove_dev_scope(info, atsr->segment,
4312                                         atsru->devices, atsru->devices_cnt))
4313                                 break;
4314                 }
4315         }
4316
4317         return 0;
4318 }
4319
4320 /*
4321  * Here we only respond to a device being unbound from its driver.
4322  *
4323  * A newly added device is not attached to its DMAR domain here yet. That will
4324  * happen when the device is mapped to an iova.
4325  */
4326 static int device_notifier(struct notifier_block *nb,
4327                                   unsigned long action, void *data)
4328 {
4329         struct device *dev = data;
4330         struct dmar_domain *domain;
4331
4332         if (iommu_dummy(dev))
4333                 return 0;
4334
4335         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4336                 return 0;
4337
4338         domain = find_domain(dev);
4339         if (!domain)
4340                 return 0;
4341
4342         down_read(&dmar_global_lock);
4343         domain_remove_one_dev_info(domain, dev);
4344         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4345                 domain_exit(domain);
4346         up_read(&dmar_global_lock);
4347
4348         return 0;
4349 }
4350
4351 static struct notifier_block device_nb = {
4352         .notifier_call = device_notifier,
4353 };
4354
4355 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4356                                        unsigned long val, void *v)
4357 {
4358         struct memory_notify *mhp = v;
4359         unsigned long long start, end;
4360         unsigned long start_vpfn, last_vpfn;
4361
4362         switch (val) {
4363         case MEM_GOING_ONLINE:
4364                 start = mhp->start_pfn << PAGE_SHIFT;
4365                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
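                     /* extend the static identity map to cover the memory range that is about to come online */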
4366                 if (iommu_domain_identity_map(si_domain, start, end)) {
4367                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4368                                 start, end);
4369                         return NOTIFY_BAD;
4370                 }
4371                 break;
4372
4373         case MEM_OFFLINE:
4374         case MEM_CANCEL_ONLINE:
4375                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4376                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4377                 while (start_vpfn <= last_vpfn) {
4378                         struct iova *iova;
4379                         struct dmar_drhd_unit *drhd;
4380                         struct intel_iommu *iommu;
4381                         struct page *freelist;
4382
4383                         iova = find_iova(&si_domain->iovad, start_vpfn);
4384                         if (iova == NULL) {
4385                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4386                                          start_vpfn);
4387                                 break;
4388                         }
4389
4390                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4391                                                      start_vpfn, last_vpfn);
4392                         if (iova == NULL) {
4393                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4394                                         start_vpfn, last_vpfn);
4395                                 return NOTIFY_BAD;
4396                         }
4397
4398                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4399                                                iova->pfn_hi);
4400
4401                         rcu_read_lock();
4402                         for_each_active_iommu(iommu, drhd)
4403                                 iommu_flush_iotlb_psi(iommu, si_domain,
4404                                         iova->pfn_lo, iova_size(iova),
4405                                         !freelist, 0);
4406                         rcu_read_unlock();
4407                         dma_free_pagelist(freelist);
4408
4409                         start_vpfn = iova->pfn_hi + 1;
4410                         free_iova_mem(iova);
4411                 }
4412                 break;
4413         }
4414
4415         return NOTIFY_OK;
4416 }
4417
4418 static struct notifier_block intel_iommu_memory_nb = {
4419         .notifier_call = intel_iommu_memory_notifier,
4420         .priority = 0
4421 };
4422
4423
4424 static ssize_t intel_iommu_show_version(struct device *dev,
4425                                         struct device_attribute *attr,
4426                                         char *buf)
4427 {
4428         struct intel_iommu *iommu = dev_get_drvdata(dev);
4429         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4430         return sprintf(buf, "%d:%d\n",
4431                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4432 }
4433 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4434
4435 static ssize_t intel_iommu_show_address(struct device *dev,
4436                                         struct device_attribute *attr,
4437                                         char *buf)
4438 {
4439         struct intel_iommu *iommu = dev_get_drvdata(dev);
4440         return sprintf(buf, "%llx\n", iommu->reg_phys);
4441 }
4442 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4443
4444 static ssize_t intel_iommu_show_cap(struct device *dev,
4445                                     struct device_attribute *attr,
4446                                     char *buf)
4447 {
4448         struct intel_iommu *iommu = dev_get_drvdata(dev);
4449         return sprintf(buf, "%llx\n", iommu->cap);
4450 }
4451 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4452
4453 static ssize_t intel_iommu_show_ecap(struct device *dev,
4454                                     struct device_attribute *attr,
4455                                     char *buf)
4456 {
4457         struct intel_iommu *iommu = dev_get_drvdata(dev);
4458         return sprintf(buf, "%llx\n", iommu->ecap);
4459 }
4460 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4461
4462 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4463                                       struct device_attribute *attr,
4464                                       char *buf)
4465 {
4466         struct intel_iommu *iommu = dev_get_drvdata(dev);
4467         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4468 }
4469 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4470
4471 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4472                                            struct device_attribute *attr,
4473                                            char *buf)
4474 {
4475         struct intel_iommu *iommu = dev_get_drvdata(dev);
4476         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4477                                                   cap_ndoms(iommu->cap)));
4478 }
4479 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4480
4481 static struct attribute *intel_iommu_attrs[] = {
4482         &dev_attr_version.attr,
4483         &dev_attr_address.attr,
4484         &dev_attr_cap.attr,
4485         &dev_attr_ecap.attr,
4486         &dev_attr_domains_supported.attr,
4487         &dev_attr_domains_used.attr,
4488         NULL,
4489 };
4490
4491 static struct attribute_group intel_iommu_group = {
4492         .name = "intel-iommu",
4493         .attrs = intel_iommu_attrs,
4494 };
4495
4496 const struct attribute_group *intel_iommu_groups[] = {
4497         &intel_iommu_group,
4498         NULL,
4499 };
4500
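     /*
      * Main initialization entry point: parse the DMAR table, set up DMA
      * remapping on each IOMMU, install intel_dma_ops as the DMA API
      * backend and register the IOMMU API, sysfs and notifier hooks.
      */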
4501 int __init intel_iommu_init(void)
4502 {
4503         int ret = -ENODEV;
4504         struct dmar_drhd_unit *drhd;
4505         struct intel_iommu *iommu;
4506
4507         /* VT-d is required for a TXT/tboot launch, so enforce that */
4508         force_on = tboot_force_iommu();
4509
4510         if (iommu_init_mempool()) {
4511                 if (force_on)
4512                         panic("tboot: Failed to initialize iommu memory\n");
4513                 return -ENOMEM;
4514         }
4515
4516         down_write(&dmar_global_lock);
4517         if (dmar_table_init()) {
4518                 if (force_on)
4519                         panic("tboot: Failed to initialize DMAR table\n");
4520                 goto out_free_dmar;
4521         }
4522
4523         if (dmar_dev_scope_init() < 0) {
4524                 if (force_on)
4525                         panic("tboot: Failed to initialize DMAR device scope\n");
4526                 goto out_free_dmar;
4527         }
4528
4529         if (no_iommu || dmar_disabled)
4530                 goto out_free_dmar;
4531
4532         if (list_empty(&dmar_rmrr_units))
4533                 pr_info("No RMRR found\n");
4534
4535         if (list_empty(&dmar_atsr_units))
4536                 pr_info("No ATSR found\n");
4537
4538         if (dmar_init_reserved_ranges()) {
4539                 if (force_on)
4540                         panic("tboot: Failed to reserve iommu ranges\n");
4541                 goto out_free_reserved_range;
4542         }
4543
4544         init_no_remapping_devices();
4545
4546         ret = init_dmars();
4547         if (ret) {
4548                 if (force_on)
4549                         panic("tboot: Failed to initialize DMARs\n");
4550                 pr_err("Initialization failed\n");
4551                 goto out_free_reserved_range;
4552         }
4553         up_write(&dmar_global_lock);
4554         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4555
4556         init_timer(&unmap_timer);
4557 #ifdef CONFIG_SWIOTLB
4558         swiotlb = 0;
4559 #endif
4560         dma_ops = &intel_dma_ops;
4561
4562         init_iommu_pm_ops();
4563
4564         for_each_active_iommu(iommu, drhd)
4565                 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4566                                                        intel_iommu_groups,
4567                                                        "%s", iommu->name);
4568
4569         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4570         bus_register_notifier(&pci_bus_type, &device_nb);
4571         if (si_domain && !hw_pass_through)
4572                 register_memory_notifier(&intel_iommu_memory_nb);
4573
4574         intel_iommu_enabled = 1;
4575
4576         return 0;
4577
4578 out_free_reserved_range:
4579         put_iova_domain(&reserved_iova_list);
4580 out_free_dmar:
4581         intel_iommu_free_dmars();
4582         up_write(&dmar_global_lock);
4583         iommu_exit_mempool();
4584         return ret;
4585 }
4586
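     /*
      * pci_for_each_dma_alias() callback: detach one DMA alias (bus/devfn)
      * of the device from the IOMMU passed in @opaque.
      */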
4587 static int iommu_detach_dev_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4588 {
4589         struct intel_iommu *iommu = opaque;
4590
4591         iommu_detach_dev(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4592         return 0;
4593 }
4594
4595 /*
4596  * NB - intel-iommu lacks any sort of reference counting for the users of
4597  * dependent devices.  If multiple endpoints have intersecting dependent
4598  * devices, unbinding the driver from any one of them may leave
4599  * the others unable to operate.
4600  */
4601 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
4602                                            struct device *dev)
4603 {
4604         if (!iommu || !dev || !dev_is_pci(dev))
4605                 return;
4606
4607         pci_for_each_dma_alias(to_pci_dev(dev), &iommu_detach_dev_cb, iommu);
4608 }
4609
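     /*
      * Remove the association between @dev and @domain and, if this was the
      * last device the domain had on that IOMMU, detach the domain from the
      * IOMMU as well.
      */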
4610 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4611                                        struct device *dev)
4612 {
4613         struct device_domain_info *info, *tmp;
4614         struct intel_iommu *iommu;
4615         unsigned long flags;
4616         bool found = false;
4617         u8 bus, devfn;
4618
4619         iommu = device_to_iommu(dev, &bus, &devfn);
4620         if (!iommu)
4621                 return;
4622
4623         spin_lock_irqsave(&device_domain_lock, flags);
4624         list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4625                 if (info->iommu == iommu && info->bus == bus &&
4626                     info->devfn == devfn) {
4627                         unlink_domain_info(info);
4628                         spin_unlock_irqrestore(&device_domain_lock, flags);
4629
4630                         iommu_disable_dev_iotlb(info);
4631                         iommu_detach_dev(iommu, info->bus, info->devfn);
4632                         iommu_detach_dependent_devices(iommu, dev);
4633                         free_devinfo_mem(info);
4634
4635                         spin_lock_irqsave(&device_domain_lock, flags);
4636
4637                         if (found)
4638                                 break;
4639                         else
4640                                 continue;
4641                 }
4642
4643                 /* If there are no other devices under the same iommu
4644                  * owned by this domain, clear this iommu in the domain's
4645                  * iommu_bmp and update the iommu count and coherency.
4646                  */
4647                 if (info->iommu == iommu)
4648                         found = true;
4649         }
4650
4651         spin_unlock_irqrestore(&device_domain_lock, flags);
4652
4653         if (!found) {
4654                 domain_detach_iommu(domain, iommu);
4655                 if (!domain_type_is_vm_or_si(domain))
4656                         iommu_detach_domain(domain, iommu);
4657         }
4658 }
4659
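     /*
      * Initialize a domain created through the IOMMU API: set up its IOVA
      * space, address width and top-level page table.
      */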
4660 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4661 {
4662         int adjust_width;
4663
4664         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4665                         DMA_32BIT_PFN);
4666         domain_reserve_special_ranges(domain);
4667
4668         /* calculate AGAW */
4669         domain->gaw = guest_width;
4670         adjust_width = guestwidth_to_adjustwidth(guest_width);
4671         domain->agaw = width_to_agaw(adjust_width);
4672
4673         domain->iommu_coherency = 0;
4674         domain->iommu_snooping = 0;
4675         domain->iommu_superpage = 0;
4676         domain->max_addr = 0;
4677
4678         /* always allocate the top pgd */
4679         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4680         if (!domain->pgd)
4681                 return -ENOMEM;
4682         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4683         return 0;
4684 }
4685
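     /* IOMMU API: allocate and initialize an unmanaged (VM) domain. */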
4686 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4687 {
4688         struct dmar_domain *dmar_domain;
4689         struct iommu_domain *domain;
4690
4691         if (type != IOMMU_DOMAIN_UNMANAGED)
4692                 return NULL;
4693
4694         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4695         if (!dmar_domain) {
4696                 pr_err("Can't allocate dmar_domain\n");
4697                 return NULL;
4698         }
4699         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4700                 pr_err("Domain initialization failed\n");
4701                 domain_exit(dmar_domain);
4702                 return NULL;
4703         }
4704         domain_update_iommu_cap(dmar_domain);
4705
4706         domain = &dmar_domain->domain;
4707         domain->geometry.aperture_start = 0;
4708         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4709         domain->geometry.force_aperture = true;
4710
4711         return domain;
4712 }
4713
4714 static void intel_iommu_domain_free(struct iommu_domain *domain)
4715 {
4716         domain_exit(to_dmar_domain(domain));
4717 }
4718
4719 static int intel_iommu_attach_device(struct iommu_domain *domain,
4720                                      struct device *dev)
4721 {
4722         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4723         struct intel_iommu *iommu;
4724         int addr_width;
4725         u8 bus, devfn;
4726
4727         if (device_is_rmrr_locked(dev)) {
4728                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4729                 return -EPERM;
4730         }
4731
4732         /* normally dev is not mapped */
4733         if (unlikely(domain_context_mapped(dev))) {
4734                 struct dmar_domain *old_domain;
4735
4736                 old_domain = find_domain(dev);
4737                 if (old_domain) {
4738                         if (domain_type_is_vm_or_si(dmar_domain))
4739                                 domain_remove_one_dev_info(old_domain, dev);
4740                         else
4741                                 domain_remove_dev_info(old_domain);
4742
4743                         if (!domain_type_is_vm_or_si(old_domain) &&
4744                              list_empty(&old_domain->devices))
4745                                 domain_exit(old_domain);
4746                 }
4747         }
4748
4749         iommu = device_to_iommu(dev, &bus, &devfn);
4750         if (!iommu)
4751                 return -ENODEV;
4752
4753         /* check if this iommu agaw is sufficient for max mapped address */
4754         addr_width = agaw_to_width(iommu->agaw);
4755         if (addr_width > cap_mgaw(iommu->cap))
4756                 addr_width = cap_mgaw(iommu->cap);
4757
4758         if (dmar_domain->max_addr > (1LL << addr_width)) {
4759                 pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4760                        __func__, addr_width,
4761                        dmar_domain->max_addr);
4762                 return -EFAULT;
4763         }
4764         dmar_domain->gaw = addr_width;
4765
4766         /*
4767          * Knock out extra levels of page tables if necessary
4768          */
4769         while (iommu->agaw < dmar_domain->agaw) {
4770                 struct dma_pte *pte;
4771
4772                 pte = dmar_domain->pgd;
4773                 if (dma_pte_present(pte)) {
4774                         dmar_domain->pgd = (struct dma_pte *)
4775                                 phys_to_virt(dma_pte_addr(pte));
4776                         free_pgtable_page(pte);
4777                 }
4778                 dmar_domain->agaw--;
4779         }
4780
4781         return domain_add_dev_info(dmar_domain, dev);
4782 }
4783
4784 static void intel_iommu_detach_device(struct iommu_domain *domain,
4785                                       struct device *dev)
4786 {
4787         domain_remove_one_dev_info(to_dmar_domain(domain), dev);
4788 }
4789
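     /*
      * IOMMU API map callback: translate IOMMU_* protection flags into VT-d
      * PTE bits and install the mapping, growing max_addr if needed.
      */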
4790 static int intel_iommu_map(struct iommu_domain *domain,
4791                            unsigned long iova, phys_addr_t hpa,
4792                            size_t size, int iommu_prot)
4793 {
4794         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4795         u64 max_addr;
4796         int prot = 0;
4797         int ret;
4798
4799         if (iommu_prot & IOMMU_READ)
4800                 prot |= DMA_PTE_READ;
4801         if (iommu_prot & IOMMU_WRITE)
4802                 prot |= DMA_PTE_WRITE;
4803         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4804                 prot |= DMA_PTE_SNP;
4805
4806         max_addr = iova + size;
4807         if (dmar_domain->max_addr < max_addr) {
4808                 u64 end;
4809
4810                 /* check if minimum agaw is sufficient for mapped address */
4811                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4812                 if (end < max_addr) {
4813                         pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4814                                __func__, dmar_domain->gaw,
4815                                max_addr);
4816                         return -EFAULT;
4817                 }
4818                 dmar_domain->max_addr = max_addr;
4819         }
4820         /* Round up size to the next multiple of VTD_PAGE_SIZE if it and
4821            the low bits of hpa would take us onto the next page. */
4822         size = aligned_nrpages(hpa, size);
4823         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4824                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4825         return ret;
4826 }
4827
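     /*
      * IOMMU API unmap callback: clear the page-table entries for the range
      * and flush the IOTLB on every IOMMU the domain is attached to.
      */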
4828 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4829                                 unsigned long iova, size_t size)
4830 {
4831         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4832         struct page *freelist = NULL;
4833         struct intel_iommu *iommu;
4834         unsigned long start_pfn, last_pfn;
4835         unsigned int npages;
4836         int iommu_id, num, ndomains, level = 0;
4837
4838         /* Cope with horrid API which requires us to unmap more than the
4839            size argument if it happens to be a large-page mapping. */
4840         if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4841                 BUG();
4842
4843         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4844                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4845
4846         start_pfn = iova >> VTD_PAGE_SHIFT;
4847         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4848
4849         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4850
4851         npages = last_pfn - start_pfn + 1;
4852
4853         for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4854                 iommu = g_iommus[iommu_id];
4855
4856                 /* Find the domain id used by dmar_domain on this iommu
4857                  * and flush the IOTLB for the unmapped range.
4858                  */
4859                 ndomains = cap_ndoms(iommu->cap);
4860                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4861                         if (get_iommu_domain(iommu, num) == dmar_domain)
4862                                 iommu_flush_iotlb_psi(iommu, dmar_domain,
4863                                                       start_pfn, npages,
4864                                                       !freelist, 0);
4865                 }
4866
4867         }
4868
4869         dma_free_pagelist(freelist);
4870
4871         if (dmar_domain->max_addr == iova + size)
4872                 dmar_domain->max_addr = iova;
4873
4874         return size;
4875 }
4876
4877 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4878                                             dma_addr_t iova)
4879 {
4880         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4881         struct dma_pte *pte;
4882         int level = 0;
4883         u64 phys = 0;
4884
4885         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4886         if (pte)
4887                 phys = dma_pte_addr(pte);
4888
4889         return phys;
4890 }
4891
4892 static bool intel_iommu_capable(enum iommu_cap cap)
4893 {
4894         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4895                 return domain_update_iommu_snooping(NULL) == 1;
4896         if (cap == IOMMU_CAP_INTR_REMAP)
4897                 return irq_remapping_enabled == 1;
4898
4899         return false;
4900 }
4901
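     /*
      * add_device callback: link the device to its IOMMU's sysfs node and
      * place it into an IOMMU group.
      */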
4902 static int intel_iommu_add_device(struct device *dev)
4903 {
4904         struct intel_iommu *iommu;
4905         struct iommu_group *group;
4906         u8 bus, devfn;
4907
4908         iommu = device_to_iommu(dev, &bus, &devfn);
4909         if (!iommu)
4910                 return -ENODEV;
4911
4912         iommu_device_link(iommu->iommu_dev, dev);
4913
4914         group = iommu_group_get_for_dev(dev);
4915
4916         if (IS_ERR(group))
4917                 return PTR_ERR(group);
4918
4919         iommu_group_put(group);
4920         return 0;
4921 }
4922
4923 static void intel_iommu_remove_device(struct device *dev)
4924 {
4925         struct intel_iommu *iommu;
4926         u8 bus, devfn;
4927
4928         iommu = device_to_iommu(dev, &bus, &devfn);
4929         if (!iommu)
4930                 return;
4931
4932         iommu_group_remove_device(dev);
4933
4934         iommu_device_unlink(iommu->iommu_dev, dev);
4935 }
4936
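     /* Callbacks implementing the generic IOMMU API on top of VT-d. */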
4937 static const struct iommu_ops intel_iommu_ops = {
4938         .capable        = intel_iommu_capable,
4939         .domain_alloc   = intel_iommu_domain_alloc,
4940         .domain_free    = intel_iommu_domain_free,
4941         .attach_dev     = intel_iommu_attach_device,
4942         .detach_dev     = intel_iommu_detach_device,
4943         .map            = intel_iommu_map,
4944         .unmap          = intel_iommu_unmap,
4945         .map_sg         = default_iommu_map_sg,
4946         .iova_to_phys   = intel_iommu_iova_to_phys,
4947         .add_device     = intel_iommu_add_device,
4948         .remove_device  = intel_iommu_remove_device,
4949         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4950 };
4951
4952 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4953 {
4954         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4955         pr_info("Disabling IOMMU for graphics on this chipset\n");
4956         dmar_map_gfx = 0;
4957 }
4958
4959 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4960 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4961 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4962 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4963 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4964 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4965 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4966
4967 static void quirk_iommu_rwbf(struct pci_dev *dev)
4968 {
4969         /*
4970          * Mobile 4 Series Chipset neglects to set RWBF capability,
4971          * but needs it. Same seems to hold for the desktop versions.
4972          */
4973         pr_info("Forcing write-buffer flush capability\n");
4974         rwbf_quirk = 1;
4975 }
4976
4977 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4978 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4979 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4980 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4981 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4982 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4983 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4984
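     /*
      * Bits of the GGC (graphics control) register in the host bridge,
      * used below to check whether the BIOS reserved GTT space for VT-d
      * (a "shadow GTT").
      */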
4985 #define GGC 0x52
4986 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4987 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4988 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4989 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4990 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4991 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4992 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4993 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4994
4995 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4996 {
4997         unsigned short ggc;
4998
4999         if (pci_read_config_word(dev, GGC, &ggc))
5000                 return;
5001
5002         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5003                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5004                 dmar_map_gfx = 0;
5005         } else if (dmar_map_gfx) {
5006                 /* we have to ensure the gfx device is idle before we flush */
5007                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5008                 intel_iommu_strict = 1;
5009         }
5010 }
5011 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5012 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5013 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5014 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5015
5016 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5017    ISOCH DMAR unit for the Azalia sound device, but not give it any
5018    TLB entries, which causes it to deadlock. Check for that.  We do
5019    this in a function called from init_dmars(), instead of in a PCI
5020    quirk, because we don't want to print the obnoxious "BIOS broken"
5021    message if VT-d is actually disabled.
5022 */
5023 static void __init check_tylersburg_isoch(void)
5024 {
5025         struct pci_dev *pdev;
5026         uint32_t vtisochctrl;
5027
5028         /* If there's no Azalia in the system anyway, forget it. */
5029         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5030         if (!pdev)
5031                 return;
5032         pci_dev_put(pdev);
5033
5034         /* System Management Registers. Might be hidden, in which case
5035            we can't do the sanity check. But that's OK, because the
5036            known-broken BIOSes _don't_ actually hide it, so far. */
5037         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5038         if (!pdev)
5039                 return;
5040
5041         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5042                 pci_dev_put(pdev);
5043                 return;
5044         }
5045
5046         pci_dev_put(pdev);
5047
5048         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5049         if (vtisochctrl & 1)
5050                 return;
5051
5052         /* Drop all bits other than the number of TLB entries */
5053         vtisochctrl &= 0x1c;
5054
5055         /* If we have the recommended number of TLB entries (16), fine. */
5056         if (vtisochctrl == 0x10)
5057                 return;
5058
5059         /* Zero TLB entries? The unit is unusable; force identity mapping for Azalia. */
5060         if (!vtisochctrl) {
5061                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5062                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5063                      dmi_get_system_info(DMI_BIOS_VENDOR),
5064                      dmi_get_system_info(DMI_BIOS_VERSION),
5065                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5066                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5067                 return;
5068         }
5069
5070         pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
5071                 vtisochctrl);
5072 }