iommu/vt-d: Remove dmar_global_lock from device_notifier
[firefly-linux-kernel-4.4.55.git] drivers/iommu/intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <linux/dma-contiguous.h>
46 #include <linux/crash_dump.h>
47 #include <asm/irq_remapping.h>
48 #include <asm/cacheflush.h>
49 #include <asm/iommu.h>
50
51 #include "irq_remapping.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
84 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
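/* With 4KiB pages (PAGE_SHIFT == 12), DMA_32BIT_PFN is 0xFFFFF and
   DMA_64BIT_PFN is 0xFFFFFFFFFFFFF, bounding IOVA allocation below
   4GiB and 2^64 respectively. */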
85
86 /* page table handling */
87 #define LEVEL_STRIDE            (9)
88 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
89
90 /*
91  * This bitmap is used to advertise the page sizes our hardware supports
92  * to the IOMMU core, which will then use this information to split
93  * physically contiguous memory regions it is mapping into page sizes
94  * that we support.
95  *
96  * Traditionally the IOMMU core just handed us the mappings directly,
97  * after making sure the size is a power-of-two multiple of 4KiB and that the
98  * mapping has natural alignment.
99  *
100  * To retain this behavior, we currently advertise that we support
101  * all page sizes that are a power-of-two multiple of 4KiB.
102  *
103  * If at some point we'd like to utilize the IOMMU core's new behavior,
104  * we could change this to advertise the real page sizes we support.
105  */
106 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
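/* ~0xFFFUL sets bits 12 and up, i.e. every power-of-two size from 4KiB
   upwards (4KiB, 8KiB, 16KiB, ...) is advertised. */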
107
108 static inline int agaw_to_level(int agaw)
109 {
110         return agaw + 2;
111 }
112
113 static inline int agaw_to_width(int agaw)
114 {
115         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 }
117
118 static inline int width_to_agaw(int width)
119 {
120         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 }
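/* Example: a 48-bit address width gives width_to_agaw(48) == 2 and
   agaw_to_level(2) == 4, i.e. a 4-level page table; 39 bits gives
   agaw 1 and a 3-level table. */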
122
123 static inline unsigned int level_to_offset_bits(int level)
124 {
125         return (level - 1) * LEVEL_STRIDE;
126 }
127
128 static inline int pfn_level_offset(unsigned long pfn, int level)
129 {
130         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 }
132
133 static inline unsigned long level_mask(int level)
134 {
135         return -1UL << level_to_offset_bits(level);
136 }
137
138 static inline unsigned long level_size(int level)
139 {
140         return 1UL << level_to_offset_bits(level);
141 }
142
143 static inline unsigned long align_to_level(unsigned long pfn, int level)
144 {
145         return (pfn + level_size(level) - 1) & level_mask(level);
146 }
147
148 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
149 {
150         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
151 }
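/* Level 1 covers 1 page (4KiB), level 2 covers 512 pages (2MiB),
   level 3 covers 262144 pages (1GiB), capped at MAX_AGAW_PFN_WIDTH. */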
152
153 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
154    are never going to work. */
155 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
156 {
157         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 }
159
160 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
161 {
162         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
163 }
164 static inline unsigned long page_to_dma_pfn(struct page *pg)
165 {
166         return mm_to_dma_pfn(page_to_pfn(pg));
167 }
168 static inline unsigned long virt_to_dma_pfn(void *p)
169 {
170         return page_to_dma_pfn(virt_to_page(p));
171 }
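/* On x86, PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so these conversions are the
   identity; they only shift when the CPU page size exceeds 4KiB. */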
172
173 /* global iommu list, set NULL for ignored DMAR units */
174 static struct intel_iommu **g_iommus;
175
176 static void __init check_tylersburg_isoch(void);
177 static int rwbf_quirk;
178
179 /*
180  * set to 1 to panic kernel if can't successfully enable VT-d
181  * (used when kernel is launched w/ TXT)
182  */
183 static int force_on = 0;
184
185 /*
186  * 0: Present
187  * 1-11: Reserved
188  * 12-63: Context Ptr (12 - (haw-1))
189  * 64-127: Reserved
190  */
191 struct root_entry {
192         u64     lo;
193         u64     hi;
194 };
195 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
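/* 4KiB root table / 16-byte entries == 256 root entries, one per PCI bus. */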
196
197 /*
198  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
199  * if marked present.
200  */
201 static phys_addr_t root_entry_lctp(struct root_entry *re)
202 {
203         if (!(re->lo & 1))
204                 return 0;
205
206         return re->lo & VTD_PAGE_MASK;
207 }
208
209 /*
210  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
211  * if marked present.
212  */
213 static phys_addr_t root_entry_uctp(struct root_entry *re)
214 {
215         if (!(re->hi & 1))
216                 return 0;
217
218         return re->hi & VTD_PAGE_MASK;
219 }
220 /*
221  * low 64 bits:
222  * 0: present
223  * 1: fault processing disable
224  * 2-3: translation type
225  * 12-63: address space root
226  * high 64 bits:
227  * 0-2: address width
228  * 3-6: avail
229  * 8-23: domain id
230  */
231 struct context_entry {
232         u64 lo;
233         u64 hi;
234 };
235
236 static inline void context_clear_pasid_enable(struct context_entry *context)
237 {
238         context->lo &= ~(1ULL << 11);
239 }
240
241 static inline bool context_pasid_enabled(struct context_entry *context)
242 {
243         return !!(context->lo & (1ULL << 11));
244 }
245
246 static inline void context_set_copied(struct context_entry *context)
247 {
248         context->hi |= (1ull << 3);
249 }
250
251 static inline bool context_copied(struct context_entry *context)
252 {
253         return !!(context->hi & (1ULL << 3));
254 }
255
256 static inline bool __context_present(struct context_entry *context)
257 {
258         return (context->lo & 1);
259 }
260
261 static inline bool context_present(struct context_entry *context)
262 {
263         return context_pasid_enabled(context) ?
264              __context_present(context) :
265              __context_present(context) && !context_copied(context);
266 }
267
268 static inline void context_set_present(struct context_entry *context)
269 {
270         context->lo |= 1;
271 }
272
273 static inline void context_set_fault_enable(struct context_entry *context)
274 {
275         context->lo &= (((u64)-1) << 2) | 1;
276 }
277
278 static inline void context_set_translation_type(struct context_entry *context,
279                                                 unsigned long value)
280 {
281         context->lo &= (((u64)-1) << 4) | 3;
282         context->lo |= (value & 3) << 2;
283 }
284
285 static inline void context_set_address_root(struct context_entry *context,
286                                             unsigned long value)
287 {
288         context->lo &= ~VTD_PAGE_MASK;
289         context->lo |= value & VTD_PAGE_MASK;
290 }
291
292 static inline void context_set_address_width(struct context_entry *context,
293                                              unsigned long value)
294 {
295         context->hi |= value & 7;
296 }
297
298 static inline void context_set_domain_id(struct context_entry *context,
299                                          unsigned long value)
300 {
301         context->hi |= (value & ((1 << 16) - 1)) << 8;
302 }
303
304 static inline int context_domain_id(struct context_entry *c)
305 {
306         return((c->hi >> 8) & 0xffff);
307 }
308
309 static inline void context_clear_entry(struct context_entry *context)
310 {
311         context->lo = 0;
312         context->hi = 0;
313 }
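/*
 * A context entry is programmed elsewhere in this file using the helpers
 * above, roughly in this order (illustrative sketch, not verbatim):
 *
 *	context_clear_entry(context);
 *	context_set_domain_id(context, did);
 *	context_set_address_root(context, virt_to_phys(domain->pgd));
 *	context_set_address_width(context, iommu->agaw);
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 */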
314
315 /*
316  * 0: readable
317  * 1: writable
318  * 2-6: reserved
319  * 7: super page
320  * 8-10: available
321  * 11: snoop behavior
322  * 12-63: Host physical address
323  */
324 struct dma_pte {
325         u64 val;
326 };
327
328 static inline void dma_clear_pte(struct dma_pte *pte)
329 {
330         pte->val = 0;
331 }
332
333 static inline u64 dma_pte_addr(struct dma_pte *pte)
334 {
335 #ifdef CONFIG_64BIT
336         return pte->val & VTD_PAGE_MASK;
337 #else
338         /* Must have a full atomic 64-bit read */
339         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
340 #endif
341 }
342
343 static inline bool dma_pte_present(struct dma_pte *pte)
344 {
345         return (pte->val & 3) != 0;
346 }
347
348 static inline bool dma_pte_superpage(struct dma_pte *pte)
349 {
350         return (pte->val & DMA_PTE_LARGE_PAGE);
351 }
352
353 static inline int first_pte_in_page(struct dma_pte *pte)
354 {
355         return !((unsigned long)pte & ~VTD_PAGE_MASK);
356 }
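/* A page-table page holds 512 8-byte PTEs; first_pte_in_page() is true when
   pte sits at a 4KiB boundary and is used below to stop per-page loops. */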
357
358 /*
359  * This domain is a static identity mapping domain.
360  *      1. This domain creates a static 1:1 mapping to all usable memory.
361  *      2. It maps to each iommu if successful.
362  *      3. Each iommu maps to this domain if successful.
363  */
364 static struct dmar_domain *si_domain;
365 static int hw_pass_through = 1;
366
367 /*
368  * Domain represents a virtual machine; more than one device
369  * across iommus may be owned by one domain, e.g. a kvm guest.
370  */
371 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
372
373 /* si_domain contains multiple devices */
374 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
375
376 #define for_each_domain_iommu(idx, domain)                      \
377         for (idx = 0; idx < g_num_of_iommus; idx++)             \
378                 if (domain->iommu_refcnt[idx])
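/*
 * Minimal usage sketch (illustrative): iterate only the IOMMUs that actually
 * reference this domain:
 *
 *	int i;
 *
 *	for_each_domain_iommu(i, domain)
 *		iommu_flush_iotlb_psi(g_iommus[i], domain, pfn, pages, 0, 0);
 */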
379
380 struct dmar_domain {
381         int     nid;                    /* node id */
382
383         unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];
384                                         /* Refcount of devices per iommu */
385
386
387         u16             iommu_did[DMAR_UNITS_SUPPORTED];
388                                         /* Domain ids per IOMMU. Use u16 since
389                                          * domain ids are 16 bit wide according
390                                          * to VT-d spec, section 9.3 */
391
392         struct list_head devices;       /* all devices' list */
393         struct iova_domain iovad;       /* iova's that belong to this domain */
394
395         struct dma_pte  *pgd;           /* virtual address */
396         int             gaw;            /* max guest address width */
397
398         /* adjusted guest address width, 0 is level 2 30-bit */
399         int             agaw;
400
401         int             flags;          /* flags to find out type of domain */
402
403         int             iommu_coherency;/* indicate coherency of iommu access */
404         int             iommu_snooping; /* indicate snooping control feature*/
405         int             iommu_count;    /* reference count of iommu */
406         int             iommu_superpage;/* Level of superpages supported:
407                                            0 == 4KiB (no superpages), 1 == 2MiB,
408                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
409         u64             max_addr;       /* maximum mapped address */
410
411         struct iommu_domain domain;     /* generic domain data structure for
412                                            iommu core */
413 };
414
415 /* PCI domain-device relationship */
416 struct device_domain_info {
417         struct list_head link;  /* link to domain siblings */
418         struct list_head global; /* link to global list */
419         u8 bus;                 /* PCI bus number */
420         u8 devfn;               /* PCI devfn number */
421         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
422         struct intel_iommu *iommu; /* IOMMU used by this device */
423         struct dmar_domain *domain; /* pointer to domain */
424 };
425
426 struct dmar_rmrr_unit {
427         struct list_head list;          /* list of rmrr units   */
428         struct acpi_dmar_header *hdr;   /* ACPI header          */
429         u64     base_address;           /* reserved base address*/
430         u64     end_address;            /* reserved end address */
431         struct dmar_dev_scope *devices; /* target devices */
432         int     devices_cnt;            /* target device count */
433 };
434
435 struct dmar_atsr_unit {
436         struct list_head list;          /* list of ATSR units */
437         struct acpi_dmar_header *hdr;   /* ACPI header */
438         struct dmar_dev_scope *devices; /* target devices */
439         int devices_cnt;                /* target device count */
440         u8 include_all:1;               /* include all ports */
441 };
442
443 static LIST_HEAD(dmar_atsr_units);
444 static LIST_HEAD(dmar_rmrr_units);
445
446 #define for_each_rmrr_units(rmrr) \
447         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
448
449 static void flush_unmaps_timeout(unsigned long data);
450
451 static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
452
453 #define HIGH_WATER_MARK 250
454 struct deferred_flush_tables {
455         int next;
456         struct iova *iova[HIGH_WATER_MARK];
457         struct dmar_domain *domain[HIGH_WATER_MARK];
458         struct page *freelist[HIGH_WATER_MARK];
459 };
460
461 static struct deferred_flush_tables *deferred_flush;
462
463 /* number of IOMMUs, used to size g_iommus and the per-iommu arrays */
464 static int g_num_of_iommus;
465
466 static DEFINE_SPINLOCK(async_umap_flush_lock);
467 static LIST_HEAD(unmaps_to_do);
468
469 static int timer_on;
470 static long list_size;
471
472 static void domain_exit(struct dmar_domain *domain);
473 static void domain_remove_dev_info(struct dmar_domain *domain);
474 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
475                                      struct device *dev);
476 static void domain_context_clear(struct intel_iommu *iommu,
477                                  struct device *dev);
478 static void __dmar_remove_one_dev_info(struct dmar_domain *domain,
479                                        struct device *dev);
480 static int domain_detach_iommu(struct dmar_domain *domain,
481                                struct intel_iommu *iommu);
482
483 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
484 int dmar_disabled = 0;
485 #else
486 int dmar_disabled = 1;
487 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
488
489 int intel_iommu_enabled = 0;
490 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
491
492 static int dmar_map_gfx = 1;
493 static int dmar_forcedac;
494 static int intel_iommu_strict;
495 static int intel_iommu_superpage = 1;
496 static int intel_iommu_ecs = 1;
497
498 /* We only actually use ECS when PASID support (on the new bit 40)
499  * is also advertised. Some early implementations — the ones with
500  * PASID support on bit 28 — have issues even when we *only* use
501  * extended root/context tables. */
502 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
503                             ecap_pasid(iommu->ecap))
504
505 int intel_iommu_gfx_mapped;
506 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
507
508 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
509 static DEFINE_SPINLOCK(device_domain_lock);
510 static LIST_HEAD(device_domain_list);
511
512 static const struct iommu_ops intel_iommu_ops;
513
514 static bool translation_pre_enabled(struct intel_iommu *iommu)
515 {
516         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
517 }
518
519 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
520 {
521         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
522 }
523
524 static void init_translation_status(struct intel_iommu *iommu)
525 {
526         u32 gsts;
527
528         gsts = readl(iommu->reg + DMAR_GSTS_REG);
529         if (gsts & DMA_GSTS_TES)
530                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
531 }
532
533 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
534 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
535 {
536         return container_of(dom, struct dmar_domain, domain);
537 }
538
539 static int __init intel_iommu_setup(char *str)
540 {
541         if (!str)
542                 return -EINVAL;
543         while (*str) {
544                 if (!strncmp(str, "on", 2)) {
545                         dmar_disabled = 0;
546                         pr_info("IOMMU enabled\n");
547                 } else if (!strncmp(str, "off", 3)) {
548                         dmar_disabled = 1;
549                         pr_info("IOMMU disabled\n");
550                 } else if (!strncmp(str, "igfx_off", 8)) {
551                         dmar_map_gfx = 0;
552                         pr_info("Disable GFX device mapping\n");
553                 } else if (!strncmp(str, "forcedac", 8)) {
554                         pr_info("Forcing DAC for PCI devices\n");
555                         dmar_forcedac = 1;
556                 } else if (!strncmp(str, "strict", 6)) {
557                         pr_info("Disable batched IOTLB flush\n");
558                         intel_iommu_strict = 1;
559                 } else if (!strncmp(str, "sp_off", 6)) {
560                         pr_info("Disable supported super page\n");
561                         intel_iommu_superpage = 0;
562                 } else if (!strncmp(str, "ecs_off", 7)) {
563                         printk(KERN_INFO
564                                 "Intel-IOMMU: disable extended context table support\n");
565                         intel_iommu_ecs = 0;
566                 }
567
568                 str += strcspn(str, ",");
569                 while (*str == ',')
570                         str++;
571         }
572         return 0;
573 }
574 __setup("intel_iommu=", intel_iommu_setup);
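/*
 * Options are comma-separated, e.g. booting with "intel_iommu=on,strict"
 * enables the IOMMU and disables batched IOTLB flushing.
 */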
575
576 static struct kmem_cache *iommu_domain_cache;
577 static struct kmem_cache *iommu_devinfo_cache;
578
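/*
 * Domain-ID lookup is a two-level table: the high byte of the DID indexes a
 * 256-pointer array allocated on demand, the low byte selects the slot,
 * e.g. did 0x0123 resolves to iommu->domains[0x01][0x23].
 */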
579 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
580 {
581         struct dmar_domain **domains;
582         int idx = did >> 8;
583
584         domains = iommu->domains[idx];
585         if (!domains)
586                 return NULL;
587
588         return domains[did & 0xff];
589 }
590
591 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
592                              struct dmar_domain *domain)
593 {
594         struct dmar_domain **domains;
595         int idx = did >> 8;
596
597         if (!iommu->domains[idx]) {
598                 size_t size = 256 * sizeof(struct dmar_domain *);
599                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
600         }
601
602         domains = iommu->domains[idx];
603         if (WARN_ON(!domains))
604                 return;
605         else
606                 domains[did & 0xff] = domain;
607 }
608
609 static inline void *alloc_pgtable_page(int node)
610 {
611         struct page *page;
612         void *vaddr = NULL;
613
614         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
615         if (page)
616                 vaddr = page_address(page);
617         return vaddr;
618 }
619
620 static inline void free_pgtable_page(void *vaddr)
621 {
622         free_page((unsigned long)vaddr);
623 }
624
625 static inline void *alloc_domain_mem(void)
626 {
627         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
628 }
629
630 static void free_domain_mem(void *vaddr)
631 {
632         kmem_cache_free(iommu_domain_cache, vaddr);
633 }
634
635 static inline void *alloc_devinfo_mem(void)
636 {
637         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
638 }
639
640 static inline void free_devinfo_mem(void *vaddr)
641 {
642         kmem_cache_free(iommu_devinfo_cache, vaddr);
643 }
644
645 static inline int domain_type_is_vm(struct dmar_domain *domain)
646 {
647         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
648 }
649
650 static inline int domain_type_is_si(struct dmar_domain *domain)
651 {
652         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
653 }
654
655 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
656 {
657         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
658                                 DOMAIN_FLAG_STATIC_IDENTITY);
659 }
660
661 static inline int domain_pfn_supported(struct dmar_domain *domain,
662                                        unsigned long pfn)
663 {
664         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
665
666         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
667 }
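/* Example: agaw == 2 means a 48-bit guest address width, so addr_width is 36
   and any pfn below 1 << 36 (IOVAs below 2^48) is supported. */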
668
669 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
670 {
671         unsigned long sagaw;
672         int agaw = -1;
673
674         sagaw = cap_sagaw(iommu->cap);
675         for (agaw = width_to_agaw(max_gaw);
676              agaw >= 0; agaw--) {
677                 if (test_bit(agaw, &sagaw))
678                         break;
679         }
680
681         return agaw;
682 }
683
684 /*
685  * Calculate max SAGAW for each iommu.
686  */
687 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
688 {
689         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
690 }
691
692 /*
693  * Calculate agaw for each iommu.
694  * "SAGAW" may differ across iommus, so use a default agaw and fall back
695  * to a smaller supported agaw for iommus that don't support the default.
696  */
697 int iommu_calculate_agaw(struct intel_iommu *iommu)
698 {
699         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
700 }
701
702 /* This function only returns a single iommu in a domain */
703 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
704 {
705         int iommu_id;
706
707         /* si_domain and vm domain should not get here. */
708         BUG_ON(domain_type_is_vm_or_si(domain));
709         for_each_domain_iommu(iommu_id, domain)
710                 break;
711
712         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
713                 return NULL;
714
715         return g_iommus[iommu_id];
716 }
717
718 static void domain_update_iommu_coherency(struct dmar_domain *domain)
719 {
720         struct dmar_drhd_unit *drhd;
721         struct intel_iommu *iommu;
722         bool found = false;
723         int i;
724
725         domain->iommu_coherency = 1;
726
727         for_each_domain_iommu(i, domain) {
728                 found = true;
729                 if (!ecap_coherent(g_iommus[i]->ecap)) {
730                         domain->iommu_coherency = 0;
731                         break;
732                 }
733         }
734         if (found)
735                 return;
736
737         /* No hardware attached; use lowest common denominator */
738         rcu_read_lock();
739         for_each_active_iommu(iommu, drhd) {
740                 if (!ecap_coherent(iommu->ecap)) {
741                         domain->iommu_coherency = 0;
742                         break;
743                 }
744         }
745         rcu_read_unlock();
746 }
747
748 static int domain_update_iommu_snooping(struct intel_iommu *skip)
749 {
750         struct dmar_drhd_unit *drhd;
751         struct intel_iommu *iommu;
752         int ret = 1;
753
754         rcu_read_lock();
755         for_each_active_iommu(iommu, drhd) {
756                 if (iommu != skip) {
757                         if (!ecap_sc_support(iommu->ecap)) {
758                                 ret = 0;
759                                 break;
760                         }
761                 }
762         }
763         rcu_read_unlock();
764
765         return ret;
766 }
767
768 static int domain_update_iommu_superpage(struct intel_iommu *skip)
769 {
770         struct dmar_drhd_unit *drhd;
771         struct intel_iommu *iommu;
772         int mask = 0xf;
773
774         if (!intel_iommu_superpage) {
775                 return 0;
776         }
777
778         /* set iommu_superpage to the smallest common denominator */
779         rcu_read_lock();
780         for_each_active_iommu(iommu, drhd) {
781                 if (iommu != skip) {
782                         mask &= cap_super_page_val(iommu->cap);
783                         if (!mask)
784                                 break;
785                 }
786         }
787         rcu_read_unlock();
788
789         return fls(mask);
790 }
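/* cap_super_page_val() bit 0 is 2MiB and bit 1 is 1GiB support, so fls()
   yields 0 (4KiB only), 1 (2MiB) or 2 (1GiB), matching iommu_superpage. */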
791
792 /* Some capabilities may be different across iommus */
793 static void domain_update_iommu_cap(struct dmar_domain *domain)
794 {
795         domain_update_iommu_coherency(domain);
796         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
797         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
798 }
799
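/*
 * With extended context support (ECS), each extended root entry provides two
 * context-table pointers: the low half covers devfn 0-127 and the high half
 * covers devfn 128-255, and extended context entries are twice the size of
 * legacy ones -- hence the devfn adjustment and doubling below.  Without ECS,
 * root->lo points at a single 256-entry legacy context table.
 */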
800 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
801                                                        u8 bus, u8 devfn, int alloc)
802 {
803         struct root_entry *root = &iommu->root_entry[bus];
804         struct context_entry *context;
805         u64 *entry;
806
807         entry = &root->lo;
808         if (ecs_enabled(iommu)) {
809                 if (devfn >= 0x80) {
810                         devfn -= 0x80;
811                         entry = &root->hi;
812                 }
813                 devfn *= 2;
814         }
815         if (*entry & 1)
816                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
817         else {
818                 unsigned long phy_addr;
819                 if (!alloc)
820                         return NULL;
821
822                 context = alloc_pgtable_page(iommu->node);
823                 if (!context)
824                         return NULL;
825
826                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
827                 phy_addr = virt_to_phys((void *)context);
828                 *entry = phy_addr | 1;
829                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
830         }
831         return &context[devfn];
832 }
833
834 static int iommu_dummy(struct device *dev)
835 {
836         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
837 }
838
839 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
840 {
841         struct dmar_drhd_unit *drhd = NULL;
842         struct intel_iommu *iommu;
843         struct device *tmp;
844         struct pci_dev *ptmp, *pdev = NULL;
845         u16 segment = 0;
846         int i;
847
848         if (iommu_dummy(dev))
849                 return NULL;
850
851         if (dev_is_pci(dev)) {
852                 pdev = to_pci_dev(dev);
853                 segment = pci_domain_nr(pdev->bus);
854         } else if (has_acpi_companion(dev))
855                 dev = &ACPI_COMPANION(dev)->dev;
856
857         rcu_read_lock();
858         for_each_active_iommu(iommu, drhd) {
859                 if (pdev && segment != drhd->segment)
860                         continue;
861
862                 for_each_active_dev_scope(drhd->devices,
863                                           drhd->devices_cnt, i, tmp) {
864                         if (tmp == dev) {
865                                 *bus = drhd->devices[i].bus;
866                                 *devfn = drhd->devices[i].devfn;
867                                 goto out;
868                         }
869
870                         if (!pdev || !dev_is_pci(tmp))
871                                 continue;
872
873                         ptmp = to_pci_dev(tmp);
874                         if (ptmp->subordinate &&
875                             ptmp->subordinate->number <= pdev->bus->number &&
876                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
877                                 goto got_pdev;
878                 }
879
880                 if (pdev && drhd->include_all) {
881                 got_pdev:
882                         *bus = pdev->bus->number;
883                         *devfn = pdev->devfn;
884                         goto out;
885                 }
886         }
887         iommu = NULL;
888  out:
889         rcu_read_unlock();
890
891         return iommu;
892 }
893
894 static void domain_flush_cache(struct dmar_domain *domain,
895                                void *addr, int size)
896 {
897         if (!domain->iommu_coherency)
898                 clflush_cache_range(addr, size);
899 }
900
901 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
902 {
903         struct context_entry *context;
904         int ret = 0;
905         unsigned long flags;
906
907         spin_lock_irqsave(&iommu->lock, flags);
908         context = iommu_context_addr(iommu, bus, devfn, 0);
909         if (context)
910                 ret = context_present(context);
911         spin_unlock_irqrestore(&iommu->lock, flags);
912         return ret;
913 }
914
915 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
916 {
917         struct context_entry *context;
918         unsigned long flags;
919
920         spin_lock_irqsave(&iommu->lock, flags);
921         context = iommu_context_addr(iommu, bus, devfn, 0);
922         if (context) {
923                 context_clear_entry(context);
924                 __iommu_flush_cache(iommu, context, sizeof(*context));
925         }
926         spin_unlock_irqrestore(&iommu->lock, flags);
927 }
928
929 static void free_context_table(struct intel_iommu *iommu)
930 {
931         int i;
932         unsigned long flags;
933         struct context_entry *context;
934
935         spin_lock_irqsave(&iommu->lock, flags);
936         if (!iommu->root_entry) {
937                 goto out;
938         }
939         for (i = 0; i < ROOT_ENTRY_NR; i++) {
940                 context = iommu_context_addr(iommu, i, 0, 0);
941                 if (context)
942                         free_pgtable_page(context);
943
944                 if (!ecs_enabled(iommu))
945                         continue;
946
947                 context = iommu_context_addr(iommu, i, 0x80, 0);
948                 if (context)
949                         free_pgtable_page(context);
950
951         }
952         free_pgtable_page(iommu->root_entry);
953         iommu->root_entry = NULL;
954 out:
955         spin_unlock_irqrestore(&iommu->lock, flags);
956 }
957
958 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
959                                       unsigned long pfn, int *target_level)
960 {
961         struct dma_pte *parent, *pte = NULL;
962         int level = agaw_to_level(domain->agaw);
963         int offset;
964
965         BUG_ON(!domain->pgd);
966
967         if (!domain_pfn_supported(domain, pfn))
968                 /* Address beyond IOMMU's addressing capabilities. */
969                 return NULL;
970
971         parent = domain->pgd;
972
973         while (1) {
974                 void *tmp_page;
975
976                 offset = pfn_level_offset(pfn, level);
977                 pte = &parent[offset];
978                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
979                         break;
980                 if (level == *target_level)
981                         break;
982
983                 if (!dma_pte_present(pte)) {
984                         uint64_t pteval;
985
986                         tmp_page = alloc_pgtable_page(domain->nid);
987
988                         if (!tmp_page)
989                                 return NULL;
990
991                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
992                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
993                         if (cmpxchg64(&pte->val, 0ULL, pteval))
994                                 /* Someone else set it while we were thinking; use theirs. */
995                                 free_pgtable_page(tmp_page);
996                         else
997                                 domain_flush_cache(domain, pte, sizeof(*pte));
998                 }
999                 if (level == 1)
1000                         break;
1001
1002                 parent = phys_to_virt(dma_pte_addr(pte));
1003                 level--;
1004         }
1005
1006         if (!*target_level)
1007                 *target_level = level;
1008
1009         return pte;
1010 }
1011
1012
1013 /* return address's pte at specific level */
1014 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1015                                          unsigned long pfn,
1016                                          int level, int *large_page)
1017 {
1018         struct dma_pte *parent, *pte = NULL;
1019         int total = agaw_to_level(domain->agaw);
1020         int offset;
1021
1022         parent = domain->pgd;
1023         while (level <= total) {
1024                 offset = pfn_level_offset(pfn, total);
1025                 pte = &parent[offset];
1026                 if (level == total)
1027                         return pte;
1028
1029                 if (!dma_pte_present(pte)) {
1030                         *large_page = total;
1031                         break;
1032                 }
1033
1034                 if (dma_pte_superpage(pte)) {
1035                         *large_page = total;
1036                         return pte;
1037                 }
1038
1039                 parent = phys_to_virt(dma_pte_addr(pte));
1040                 total--;
1041         }
1042         return NULL;
1043 }
1044
1045 /* clear last level pte; a tlb flush should follow */
1046 static void dma_pte_clear_range(struct dmar_domain *domain,
1047                                 unsigned long start_pfn,
1048                                 unsigned long last_pfn)
1049 {
1050         unsigned int large_page = 1;
1051         struct dma_pte *first_pte, *pte;
1052
1053         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1054         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1055         BUG_ON(start_pfn > last_pfn);
1056
1057         /* we don't need lock here; nobody else touches the iova range */
1058         do {
1059                 large_page = 1;
1060                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1061                 if (!pte) {
1062                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1063                         continue;
1064                 }
1065                 do {
1066                         dma_clear_pte(pte);
1067                         start_pfn += lvl_to_nr_pages(large_page);
1068                         pte++;
1069                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1070
1071                 domain_flush_cache(domain, first_pte,
1072                                    (void *)pte - (void *)first_pte);
1073
1074         } while (start_pfn && start_pfn <= last_pfn);
1075 }
1076
1077 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1078                                struct dma_pte *pte, unsigned long pfn,
1079                                unsigned long start_pfn, unsigned long last_pfn)
1080 {
1081         pfn = max(start_pfn, pfn);
1082         pte = &pte[pfn_level_offset(pfn, level)];
1083
1084         do {
1085                 unsigned long level_pfn;
1086                 struct dma_pte *level_pte;
1087
1088                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1089                         goto next;
1090
1091                 level_pfn = pfn & level_mask(level);
1092                 level_pte = phys_to_virt(dma_pte_addr(pte));
1093
1094                 if (level > 2)
1095                         dma_pte_free_level(domain, level - 1, level_pte,
1096                                            level_pfn, start_pfn, last_pfn);
1097
1098                 /* If range covers entire pagetable, free it */
1099                 if (!(start_pfn > level_pfn ||
1100                       last_pfn < level_pfn + level_size(level) - 1)) {
1101                         dma_clear_pte(pte);
1102                         domain_flush_cache(domain, pte, sizeof(*pte));
1103                         free_pgtable_page(level_pte);
1104                 }
1105 next:
1106                 pfn += level_size(level);
1107         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1108 }
1109
1110 /* free page table pages. last level pte should already be cleared */
1111 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1112                                    unsigned long start_pfn,
1113                                    unsigned long last_pfn)
1114 {
1115         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1116         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1117         BUG_ON(start_pfn > last_pfn);
1118
1119         dma_pte_clear_range(domain, start_pfn, last_pfn);
1120
1121         /* We don't need lock here; nobody else touches the iova range */
1122         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1123                            domain->pgd, 0, start_pfn, last_pfn);
1124
1125         /* free pgd */
1126         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1127                 free_pgtable_page(domain->pgd);
1128                 domain->pgd = NULL;
1129         }
1130 }
1131
1132 /* When a page at a given level is being unlinked from its parent, we don't
1133    need to *modify* it at all. All we need to do is make a list of all the
1134    pages which can be freed just as soon as we've flushed the IOTLB and we
1135    know the hardware page-walk will no longer touch them.
1136    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1137    be freed. */
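/* The pages are chained through page->freelist and handed back to the caller,
   which releases them with dma_free_pagelist() once the IOTLB flush is done. */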
1138 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1139                                             int level, struct dma_pte *pte,
1140                                             struct page *freelist)
1141 {
1142         struct page *pg;
1143
1144         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1145         pg->freelist = freelist;
1146         freelist = pg;
1147
1148         if (level == 1)
1149                 return freelist;
1150
1151         pte = page_address(pg);
1152         do {
1153                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1154                         freelist = dma_pte_list_pagetables(domain, level - 1,
1155                                                            pte, freelist);
1156                 pte++;
1157         } while (!first_pte_in_page(pte));
1158
1159         return freelist;
1160 }
1161
1162 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1163                                         struct dma_pte *pte, unsigned long pfn,
1164                                         unsigned long start_pfn,
1165                                         unsigned long last_pfn,
1166                                         struct page *freelist)
1167 {
1168         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1169
1170         pfn = max(start_pfn, pfn);
1171         pte = &pte[pfn_level_offset(pfn, level)];
1172
1173         do {
1174                 unsigned long level_pfn;
1175
1176                 if (!dma_pte_present(pte))
1177                         goto next;
1178
1179                 level_pfn = pfn & level_mask(level);
1180
1181                 /* If range covers entire pagetable, free it */
1182                 if (start_pfn <= level_pfn &&
1183                     last_pfn >= level_pfn + level_size(level) - 1) {
1184                         /* These subordinate page tables are going away entirely. Don't
1185                            bother to clear them; we're just going to *free* them. */
1186                         if (level > 1 && !dma_pte_superpage(pte))
1187                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1188
1189                         dma_clear_pte(pte);
1190                         if (!first_pte)
1191                                 first_pte = pte;
1192                         last_pte = pte;
1193                 } else if (level > 1) {
1194                         /* Recurse down into a level that isn't *entirely* obsolete */
1195                         freelist = dma_pte_clear_level(domain, level - 1,
1196                                                        phys_to_virt(dma_pte_addr(pte)),
1197                                                        level_pfn, start_pfn, last_pfn,
1198                                                        freelist);
1199                 }
1200 next:
1201                 pfn += level_size(level);
1202         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1203
1204         if (first_pte)
1205                 domain_flush_cache(domain, first_pte,
1206                                    (void *)++last_pte - (void *)first_pte);
1207
1208         return freelist;
1209 }
1210
1211 /* We can't just free the pages because the IOMMU may still be walking
1212    the page tables, and may have cached the intermediate levels. The
1213    pages can only be freed after the IOTLB flush has been done. */
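/*
 * Typical sequence (sketch): freelist = domain_unmap(domain, start, last);
 * then flush the IOTLB (e.g. via iommu_flush_iotlb_psi()); then
 * dma_free_pagelist(freelist).
 */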
1214 struct page *domain_unmap(struct dmar_domain *domain,
1215                           unsigned long start_pfn,
1216                           unsigned long last_pfn)
1217 {
1218         struct page *freelist = NULL;
1219
1220         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1221         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1222         BUG_ON(start_pfn > last_pfn);
1223
1224         /* we don't need lock here; nobody else touches the iova range */
1225         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1226                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1227
1228         /* free pgd */
1229         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1230                 struct page *pgd_page = virt_to_page(domain->pgd);
1231                 pgd_page->freelist = freelist;
1232                 freelist = pgd_page;
1233
1234                 domain->pgd = NULL;
1235         }
1236
1237         return freelist;
1238 }
1239
1240 void dma_free_pagelist(struct page *freelist)
1241 {
1242         struct page *pg;
1243
1244         while ((pg = freelist)) {
1245                 freelist = pg->freelist;
1246                 free_pgtable_page(page_address(pg));
1247         }
1248 }
1249
1250 /* iommu handling */
1251 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1252 {
1253         struct root_entry *root;
1254         unsigned long flags;
1255
1256         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1257         if (!root) {
1258                 pr_err("Allocating root entry for %s failed\n",
1259                         iommu->name);
1260                 return -ENOMEM;
1261         }
1262
1263         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1264
1265         spin_lock_irqsave(&iommu->lock, flags);
1266         iommu->root_entry = root;
1267         spin_unlock_irqrestore(&iommu->lock, flags);
1268
1269         return 0;
1270 }
1271
1272 static void iommu_set_root_entry(struct intel_iommu *iommu)
1273 {
1274         u64 addr;
1275         u32 sts;
1276         unsigned long flag;
1277
1278         addr = virt_to_phys(iommu->root_entry);
1279         if (ecs_enabled(iommu))
1280                 addr |= DMA_RTADDR_RTT;
1281
1282         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1283         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1284
1285         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1286
1287         /* Make sure hardware complete it */
1288         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1289                       readl, (sts & DMA_GSTS_RTPS), sts);
1290
1291         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1292 }
1293
1294 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1295 {
1296         u32 val;
1297         unsigned long flag;
1298
1299         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1300                 return;
1301
1302         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1303         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1304
1305         /* Make sure hardware complete it */
1306         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1307                       readl, (!(val & DMA_GSTS_WBFS)), val);
1308
1309         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1310 }
1311
1312 /* return value determines if we need a write buffer flush */
1313 static void __iommu_flush_context(struct intel_iommu *iommu,
1314                                   u16 did, u16 source_id, u8 function_mask,
1315                                   u64 type)
1316 {
1317         u64 val = 0;
1318         unsigned long flag;
1319
1320         switch (type) {
1321         case DMA_CCMD_GLOBAL_INVL:
1322                 val = DMA_CCMD_GLOBAL_INVL;
1323                 break;
1324         case DMA_CCMD_DOMAIN_INVL:
1325                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1326                 break;
1327         case DMA_CCMD_DEVICE_INVL:
1328                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1329                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1330                 break;
1331         default:
1332                 BUG();
1333         }
1334         val |= DMA_CCMD_ICC;
1335
1336         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1337         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1338
1339         /* Make sure hardware complete it */
1340         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1341                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1342
1343         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1344 }
1345
1346 /* return value determines if we need a write buffer flush */
1347 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1348                                 u64 addr, unsigned int size_order, u64 type)
1349 {
1350         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1351         u64 val = 0, val_iva = 0;
1352         unsigned long flag;
1353
1354         switch (type) {
1355         case DMA_TLB_GLOBAL_FLUSH:
1356                 /* global flush doesn't need set IVA_REG */
1357                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1358                 break;
1359         case DMA_TLB_DSI_FLUSH:
1360                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1361                 break;
1362         case DMA_TLB_PSI_FLUSH:
1363                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1364                 /* IH bit is passed in as part of address */
1365                 val_iva = size_order | addr;
1366                 break;
1367         default:
1368                 BUG();
1369         }
1370         /* Note: set drain read/write */
1371 #if 0
1372         /*
1373          * This is probably only here to be extra safe. It looks like we
1374          * can ignore it without any impact.
1375          */
1376         if (cap_read_drain(iommu->cap))
1377                 val |= DMA_TLB_READ_DRAIN;
1378 #endif
1379         if (cap_write_drain(iommu->cap))
1380                 val |= DMA_TLB_WRITE_DRAIN;
1381
1382         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1383         /* Note: Only uses first TLB reg currently */
1384         if (val_iva)
1385                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1386         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1387
1388         /* Make sure hardware complete it */
1389         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1390                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1391
1392         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1393
1394         /* check IOTLB invalidation granularity */
1395         if (DMA_TLB_IAIG(val) == 0)
1396                 pr_err("Flush IOTLB failed\n");
1397         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1398                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1399                         (unsigned long long)DMA_TLB_IIRG(type),
1400                         (unsigned long long)DMA_TLB_IAIG(val));
1401 }
1402
1403 static struct device_domain_info *
1404 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1405                          u8 bus, u8 devfn)
1406 {
1407         bool found = false;
1408         struct device_domain_info *info;
1409         struct pci_dev *pdev;
1410
1411         assert_spin_locked(&device_domain_lock);
1412
1413         if (!ecap_dev_iotlb_support(iommu->ecap))
1414                 return NULL;
1415
1416         if (!iommu->qi)
1417                 return NULL;
1418
1419         list_for_each_entry(info, &domain->devices, link)
1420                 if (info->iommu == iommu && info->bus == bus &&
1421                     info->devfn == devfn) {
1422                         found = true;
1423                         break;
1424                 }
1425
1426         if (!found || !info->dev || !dev_is_pci(info->dev))
1427                 return NULL;
1428
1429         pdev = to_pci_dev(info->dev);
1430
1431         if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1432                 return NULL;
1433
1434         if (!dmar_find_matched_atsr_unit(pdev))
1435                 return NULL;
1436
1437         return info;
1438 }
1439
1440 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1441 {
1442         if (!info || !dev_is_pci(info->dev))
1443                 return;
1444
1445         pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1446 }
1447
1448 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1449 {
1450         if (!info->dev || !dev_is_pci(info->dev) ||
1451             !pci_ats_enabled(to_pci_dev(info->dev)))
1452                 return;
1453
1454         pci_disable_ats(to_pci_dev(info->dev));
1455 }
1456
1457 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1458                                   u64 addr, unsigned mask)
1459 {
1460         u16 sid, qdep;
1461         unsigned long flags;
1462         struct device_domain_info *info;
1463
1464         spin_lock_irqsave(&device_domain_lock, flags);
1465         list_for_each_entry(info, &domain->devices, link) {
1466                 struct pci_dev *pdev;
1467                 if (!info->dev || !dev_is_pci(info->dev))
1468                         continue;
1469
1470                 pdev = to_pci_dev(info->dev);
1471                 if (!pci_ats_enabled(pdev))
1472                         continue;
1473
1474                 sid = info->bus << 8 | info->devfn;
1475                 qdep = pci_ats_queue_depth(pdev);
1476                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1477         }
1478         spin_unlock_irqrestore(&device_domain_lock, flags);
1479 }
1480
1481 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1482                                   struct dmar_domain *domain,
1483                                   unsigned long pfn, unsigned int pages,
1484                                   int ih, int map)
1485 {
1486         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1487         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1488         u16 did = domain->iommu_did[iommu->seq_id];
1489
1490         BUG_ON(pages == 0);
1491
1492         if (ih)
1493                 ih = 1 << 6;
1494         /*
1495          * Fallback to domain selective flush if no PSI support or the size is
1496          * too big.
1497          * PSI requires page size to be 2 ^ x, and the base address is naturally
1498          * aligned to the size
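         * (e.g. pages == 9 rounds up to 16, giving mask == 4, so a single PSI
         * covers a naturally aligned 16-page region; masks larger than
         * cap_max_amask_val() fall back to a domain-selective flush)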
1499          */
1500         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1501                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1502                                                 DMA_TLB_DSI_FLUSH);
1503         else
1504                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1505                                                 DMA_TLB_PSI_FLUSH);
1506
1507         /*
1508          * In caching mode, changes of pages from non-present to present require
1509          * flush. However, device IOTLB doesn't need to be flushed in this case.
1510          */
1511         if (!cap_caching_mode(iommu->cap) || !map)
1512                 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1513                                       addr, mask);
1514 }
1515
1516 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1517 {
1518         u32 pmen;
1519         unsigned long flags;
1520
1521         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1522         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1523         pmen &= ~DMA_PMEN_EPM;
1524         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1525
1526         /* wait for the protected region status bit to clear */
1527         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1528                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1529
1530         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1531 }
1532
1533 static void iommu_enable_translation(struct intel_iommu *iommu)
1534 {
1535         u32 sts;
1536         unsigned long flags;
1537
1538         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1539         iommu->gcmd |= DMA_GCMD_TE;
1540         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1541
1542         /* Make sure hardware complete it */
1543         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1544                       readl, (sts & DMA_GSTS_TES), sts);
1545
1546         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1547 }
1548
1549 static void iommu_disable_translation(struct intel_iommu *iommu)
1550 {
1551         u32 sts;
1552         unsigned long flag;
1553
1554         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1555         iommu->gcmd &= ~DMA_GCMD_TE;
1556         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1557
1558         /* Make sure hardware complete it */
1559         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1560                       readl, (!(sts & DMA_GSTS_TES)), sts);
1561
1562         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1563 }
1564
1565
1566 static int iommu_init_domains(struct intel_iommu *iommu)
1567 {
1568         u32 ndomains, nlongs;
1569         size_t size;
1570
1571         ndomains = cap_ndoms(iommu->cap);
1572         pr_debug("%s: Number of Domains supported <%d>\n",
1573                  iommu->name, ndomains);
1574         nlongs = BITS_TO_LONGS(ndomains);
1575
1576         spin_lock_init(&iommu->lock);
1577
1578         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1579         if (!iommu->domain_ids) {
1580                 pr_err("%s: Allocating domain id array failed\n",
1581                        iommu->name);
1582                 return -ENOMEM;
1583         }
1584
1585         size = ((ndomains >> 8) + 1) * sizeof(struct dmar_domain **);
1586         iommu->domains = kzalloc(size, GFP_KERNEL);
1587
1588         if (iommu->domains) {
1589                 size = 256 * sizeof(struct dmar_domain *);
1590                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1591         }
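             /*
              * iommu->domains is a two-level array: entry (did >> 8) points to
              * a 256-slot chunk and (did & 0xff) indexes the dmar_domain within
              * it.  Only the first chunk is preallocated here; the remaining
              * chunks are allocated on demand when a domain id in their range
              * is first used.
              */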
1592
1593         if (!iommu->domains || !iommu->domains[0]) {
1594                 pr_err("%s: Allocating domain array failed\n",
1595                        iommu->name);
1596                 kfree(iommu->domain_ids);
1597                 kfree(iommu->domains);
1598                 iommu->domain_ids = NULL;
1599                 iommu->domains    = NULL;
1600                 return -ENOMEM;
1601         }
1602
1603
1604
1605         /*
1606          * If Caching mode is set, then invalid translations are tagged
1607          * with domain-id 0, hence we need to pre-allocate it. We also
1608          * use domain-id 0 as a marker for non-allocated domain-id, so
1609          * make sure it is not used for a real domain.
1610          */
1611         set_bit(0, iommu->domain_ids);
1612
1613         return 0;
1614 }
1615
1616 static void disable_dmar_iommu(struct intel_iommu *iommu)
1617 {
1618         struct device_domain_info *info, *tmp;
1619         unsigned long flags;
1620
1621         if (!iommu->domains || !iommu->domain_ids)
1622                 return;
1623
1624         spin_lock_irqsave(&device_domain_lock, flags);
1625         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1626                 struct dmar_domain *domain;
1627
1628                 if (info->iommu != iommu)
1629                         continue;
1630
1631                 if (!info->dev || !info->domain)
1632                         continue;
1633
1634                 domain = info->domain;
1635
1636                 dmar_remove_one_dev_info(domain, info->dev);
1637
1638                 if (!domain_type_is_vm_or_si(domain))
1639                         domain_exit(domain);
1640         }
1641         spin_unlock_irqrestore(&device_domain_lock, flags);
1642
1643         if (iommu->gcmd & DMA_GCMD_TE)
1644                 iommu_disable_translation(iommu);
1645 }
1646
1647 static void free_dmar_iommu(struct intel_iommu *iommu)
1648 {
1649         if ((iommu->domains) && (iommu->domain_ids)) {
1650                 int elems = (cap_ndoms(iommu->cap) >> 8) + 1;
1651                 int i;
1652
1653                 for (i = 0; i < elems; i++)
1654                         kfree(iommu->domains[i]);
1655                 kfree(iommu->domains);
1656                 kfree(iommu->domain_ids);
1657                 iommu->domains = NULL;
1658                 iommu->domain_ids = NULL;
1659         }
1660
1661         g_iommus[iommu->seq_id] = NULL;
1662
1663         /* free context mapping */
1664         free_context_table(iommu);
1665 }
1666
1667 static struct dmar_domain *alloc_domain(int flags)
1668 {
1669         struct dmar_domain *domain;
1670
1671         domain = alloc_domain_mem();
1672         if (!domain)
1673                 return NULL;
1674
1675         memset(domain, 0, sizeof(*domain));
1676         domain->nid = -1;
1677         domain->flags = flags;
1678         INIT_LIST_HEAD(&domain->devices);
1679
1680         return domain;
1681 }
1682
1683 /* Must be called with device_domain_lock and iommu->lock held */
1684 static int domain_attach_iommu(struct dmar_domain *domain,
1685                                struct intel_iommu *iommu)
1686 {
1687         unsigned long ndomains;
1688         int num;
1689
1690         assert_spin_locked(&device_domain_lock);
1691         assert_spin_locked(&iommu->lock);
1692
1693         domain->iommu_refcnt[iommu->seq_id] += 1;
1694         domain->iommu_count += 1;
1695         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1696                 ndomains = cap_ndoms(iommu->cap);
1697                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1698
1699                 if (num >= ndomains) {
1700                         pr_err("%s: No free domain ids\n", iommu->name);
1701                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1702                         domain->iommu_count -= 1;
1703                         return -ENOSPC;
1704                 }
1705
1706                 set_bit(num, iommu->domain_ids);
1707                 set_iommu_domain(iommu, num, domain);
1708
1709                 domain->iommu_did[iommu->seq_id] = num;
1710                 domain->nid                      = iommu->node;
1711
1712                 domain_update_iommu_cap(domain);
1713         }
1714
1715         return 0;
1716 }
1717
1718 static int domain_detach_iommu(struct dmar_domain *domain,
1719                                struct intel_iommu *iommu)
1720 {
1721         int num, count = INT_MAX;
1722
1723         assert_spin_locked(&device_domain_lock);
1724         assert_spin_locked(&iommu->lock);
1725
1726         domain->iommu_refcnt[iommu->seq_id] -= 1;
1727         count = --domain->iommu_count;
1728         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1729                 num = domain->iommu_did[iommu->seq_id];
1730                 clear_bit(num, iommu->domain_ids);
1731                 set_iommu_domain(iommu, num, NULL);
1732
1733                 domain_update_iommu_cap(domain);
1734                 domain->iommu_did[iommu->seq_id] = 0;
1735         }
1736
1737         return count;
1738 }
1739
1740 static struct iova_domain reserved_iova_list;
1741 static struct lock_class_key reserved_rbtree_key;
1742
1743 static int dmar_init_reserved_ranges(void)
1744 {
1745         struct pci_dev *pdev = NULL;
1746         struct iova *iova;
1747         int i;
1748
1749         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1750                         DMA_32BIT_PFN);
1751
1752         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1753                 &reserved_rbtree_key);
1754
1755         /* IOAPIC ranges shouldn't be accessed by DMA */
1756         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1757                 IOVA_PFN(IOAPIC_RANGE_END));
1758         if (!iova) {
1759                 pr_err("Reserve IOAPIC range failed\n");
1760                 return -ENODEV;
1761         }
1762
1763         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1764         for_each_pci_dev(pdev) {
1765                 struct resource *r;
1766
1767                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1768                         r = &pdev->resource[i];
1769                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1770                                 continue;
1771                         iova = reserve_iova(&reserved_iova_list,
1772                                             IOVA_PFN(r->start),
1773                                             IOVA_PFN(r->end));
1774                         if (!iova) {
1775                                 pr_err("Reserve iova failed\n");
1776                                 return -ENODEV;
1777                         }
1778                 }
1779         }
1780         return 0;
1781 }
1782
1783 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1784 {
1785         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1786 }
1787
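     /*
      * Round the guest address width up to the next adjusted width the
      * page-table format can express, i.e. 12 offset bits plus a whole number
      * of 9-bit levels (21, 30, 39, 48, ...).  For example, gaw = 40 gives
      * r = 1 and an adjusted width of 48.
      */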
1788 static inline int guestwidth_to_adjustwidth(int gaw)
1789 {
1790         int agaw;
1791         int r = (gaw - 12) % 9;
1792
1793         if (r == 0)
1794                 agaw = gaw;
1795         else
1796                 agaw = gaw + 9 - r;
1797         if (agaw > 64)
1798                 agaw = 64;
1799         return agaw;
1800 }
1801
1802 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1803                        int guest_width)
1804 {
1805         int adjust_width, agaw;
1806         unsigned long sagaw;
1807
1808         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1809                         DMA_32BIT_PFN);
1810         domain_reserve_special_ranges(domain);
1811
1812         /* calculate AGAW */
1813         if (guest_width > cap_mgaw(iommu->cap))
1814                 guest_width = cap_mgaw(iommu->cap);
1815         domain->gaw = guest_width;
1816         adjust_width = guestwidth_to_adjustwidth(guest_width);
1817         agaw = width_to_agaw(adjust_width);
1818         sagaw = cap_sagaw(iommu->cap);
1819         if (!test_bit(agaw, &sagaw)) {
1820                 /* hardware doesn't support it, choose a bigger one */
1821                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1822                 agaw = find_next_bit(&sagaw, 5, agaw);
1823                 if (agaw >= 5)
1824                         return -ENODEV;
1825         }
1826         domain->agaw = agaw;
1827
1828         if (ecap_coherent(iommu->ecap))
1829                 domain->iommu_coherency = 1;
1830         else
1831                 domain->iommu_coherency = 0;
1832
1833         if (ecap_sc_support(iommu->ecap))
1834                 domain->iommu_snooping = 1;
1835         else
1836                 domain->iommu_snooping = 0;
1837
1838         if (intel_iommu_superpage)
1839                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1840         else
1841                 domain->iommu_superpage = 0;
1842
1843         domain->nid = iommu->node;
1844
1845         /* always allocate the top pgd */
1846         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1847         if (!domain->pgd)
1848                 return -ENOMEM;
1849         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1850         return 0;
1851 }
1852
1853 static void domain_exit(struct dmar_domain *domain)
1854 {
1855         struct page *freelist = NULL;
1856
1857         /* Domain 0 is reserved, so don't process it */
1858         if (!domain)
1859                 return;
1860
1861         /* Flush any lazy unmaps that may reference this domain */
1862         if (!intel_iommu_strict)
1863                 flush_unmaps_timeout(0);
1864
1865         /* Remove associated devices and clear attached or cached domains */
1866         rcu_read_lock();
1867         domain_remove_dev_info(domain);
1868         rcu_read_unlock();
1869
1870         /* destroy iovas */
1871         put_iova_domain(&domain->iovad);
1872
1873         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1874
1875         dma_free_pagelist(freelist);
1876
1877         free_domain_mem(domain);
1878 }
1879
1880 static int domain_context_mapping_one(struct dmar_domain *domain,
1881                                       struct intel_iommu *iommu,
1882                                       u8 bus, u8 devfn)
1883 {
1884         u16 did = domain->iommu_did[iommu->seq_id];
1885         int translation = CONTEXT_TT_MULTI_LEVEL;
1886         struct device_domain_info *info = NULL;
1887         struct context_entry *context;
1888         unsigned long flags;
1889         struct dma_pte *pgd;
1890         int ret, agaw;
1891
1892         WARN_ON(did == 0);
1893
1894         if (hw_pass_through && domain_type_is_si(domain))
1895                 translation = CONTEXT_TT_PASS_THROUGH;
1896
1897         pr_debug("Set context mapping for %02x:%02x.%d\n",
1898                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1899
1900         BUG_ON(!domain->pgd);
1901
1902         spin_lock_irqsave(&device_domain_lock, flags);
1903         spin_lock(&iommu->lock);
1904
1905         ret = -ENOMEM;
1906         context = iommu_context_addr(iommu, bus, devfn, 1);
1907         if (!context)
1908                 goto out_unlock;
1909
1910         ret = 0;
1911         if (context_present(context))
1912                 goto out_unlock;
1913
1914         pgd = domain->pgd;
1915
1916         context_clear_entry(context);
1917         context_set_domain_id(context, did);
1918
1919         /*
1920          * Skip top levels of the page tables for an IOMMU whose agaw is
1921          * smaller than the domain's.  Unnecessary for pass-through mode.
1922          */
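             /*
              * Example: a 4-level (48-bit) domain page table used with an
              * IOMMU that only supports 3-level (39-bit) tables descends one
              * level, so the context entry ends up pointing at a 3-level table.
              */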
1923         if (translation != CONTEXT_TT_PASS_THROUGH) {
1924                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1925                         ret = -ENOMEM;
1926                         pgd = phys_to_virt(dma_pte_addr(pgd));
1927                         if (!dma_pte_present(pgd))
1928                                 goto out_unlock;
1929                 }
1930
1931                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1932                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1933                                      CONTEXT_TT_MULTI_LEVEL;
1934
1935                 context_set_address_root(context, virt_to_phys(pgd));
1936                 context_set_address_width(context, iommu->agaw);
1937         } else {
1938                 /*
1939                  * In pass through mode, AW must be programmed to
1940                  * indicate the largest AGAW value supported by
1941                  * hardware. And ASR is ignored by hardware.
1942                  */
1943                 context_set_address_width(context, iommu->msagaw);
1944         }
1945
1946         context_set_translation_type(context, translation);
1947         context_set_fault_enable(context);
1948         context_set_present(context);
1949         domain_flush_cache(domain, context, sizeof(*context));
1950
1951         /*
1952          * It's a non-present to present mapping. If hardware doesn't cache
1953          * non-present entries we only need to flush the write-buffer. If it
1954          * _does_ cache non-present entries, then it does so in the special
1955          * domain #0, which we have to flush:
1956          */
1957         if (cap_caching_mode(iommu->cap)) {
1958                 iommu->flush.flush_context(iommu, 0,
1959                                            (((u16)bus) << 8) | devfn,
1960                                            DMA_CCMD_MASK_NOBIT,
1961                                            DMA_CCMD_DEVICE_INVL);
1962                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1963         } else {
1964                 iommu_flush_write_buffer(iommu);
1965         }
1966         iommu_enable_dev_iotlb(info);
1967
1968         ret = 0;
1969
1970 out_unlock:
1971         spin_unlock(&iommu->lock);
1972         spin_unlock_irqrestore(&device_domain_lock, flags);
1973
1974         return ret;
1975 }
1976
1977 struct domain_context_mapping_data {
1978         struct dmar_domain *domain;
1979         struct intel_iommu *iommu;
1980 };
1981
1982 static int domain_context_mapping_cb(struct pci_dev *pdev,
1983                                      u16 alias, void *opaque)
1984 {
1985         struct domain_context_mapping_data *data = opaque;
1986
1987         return domain_context_mapping_one(data->domain, data->iommu,
1988                                           PCI_BUS_NUM(alias), alias & 0xff);
1989 }
1990
1991 static int
1992 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1993 {
1994         struct intel_iommu *iommu;
1995         u8 bus, devfn;
1996         struct domain_context_mapping_data data;
1997
1998         iommu = device_to_iommu(dev, &bus, &devfn);
1999         if (!iommu)
2000                 return -ENODEV;
2001
2002         if (!dev_is_pci(dev))
2003                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2004
2005         data.domain = domain;
2006         data.iommu = iommu;
2007
2008         return pci_for_each_dma_alias(to_pci_dev(dev),
2009                                       &domain_context_mapping_cb, &data);
2010 }
2011
2012 static int domain_context_mapped_cb(struct pci_dev *pdev,
2013                                     u16 alias, void *opaque)
2014 {
2015         struct intel_iommu *iommu = opaque;
2016
2017         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2018 }
2019
2020 static int domain_context_mapped(struct device *dev)
2021 {
2022         struct intel_iommu *iommu;
2023         u8 bus, devfn;
2024
2025         iommu = device_to_iommu(dev, &bus, &devfn);
2026         if (!iommu)
2027                 return -ENODEV;
2028
2029         if (!dev_is_pci(dev))
2030                 return device_context_mapped(iommu, bus, devfn);
2031
2032         return !pci_for_each_dma_alias(to_pci_dev(dev),
2033                                        domain_context_mapped_cb, iommu);
2034 }
2035
2036 /* Returns a number of VTD pages, but aligned to MM page size */
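     /*
      * Example, with 4KiB MM pages: host_addr = 0x1800 and size = 0x1000 touch
      * two MM pages, so this returns 2 (VT-d pages).
      */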
2037 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2038                                             size_t size)
2039 {
2040         host_addr &= ~PAGE_MASK;
2041         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2042 }
2043
2044 /* Return largest possible superpage level for a given mapping */
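     /*
      * For instance, if both iov_pfn and phy_pfn are 512-page (2MiB) aligned,
      * at least 512 pages are being mapped and the hardware advertises one
      * level of superpage support, this returns 2 and the mapping can use
      * 2MiB pages.
      */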
2045 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2046                                           unsigned long iov_pfn,
2047                                           unsigned long phy_pfn,
2048                                           unsigned long pages)
2049 {
2050         int support, level = 1;
2051         unsigned long pfnmerge;
2052
2053         support = domain->iommu_superpage;
2054
2055         /* To use a large page, the virtual *and* physical addresses
2056            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2057            of them will mean we have to use smaller pages. So just
2058            merge them and check both at once. */
2059         pfnmerge = iov_pfn | phy_pfn;
2060
2061         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2062                 pages >>= VTD_STRIDE_SHIFT;
2063                 if (!pages)
2064                         break;
2065                 pfnmerge >>= VTD_STRIDE_SHIFT;
2066                 level++;
2067                 support--;
2068         }
2069         return level;
2070 }
2071
2072 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2073                             struct scatterlist *sg, unsigned long phys_pfn,
2074                             unsigned long nr_pages, int prot)
2075 {
2076         struct dma_pte *first_pte = NULL, *pte = NULL;
2077         phys_addr_t uninitialized_var(pteval);
2078         unsigned long sg_res = 0;
2079         unsigned int largepage_lvl = 0;
2080         unsigned long lvl_pages = 0;
2081
2082         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2083
2084         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2085                 return -EINVAL;
2086
2087         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2088
2089         if (!sg) {
2090                 sg_res = nr_pages;
2091                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2092         }
2093
2094         while (nr_pages > 0) {
2095                 uint64_t tmp;
2096
2097                 if (!sg_res) {
2098                         sg_res = aligned_nrpages(sg->offset, sg->length);
2099                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2100                         sg->dma_length = sg->length;
2101                         pteval = page_to_phys(sg_page(sg)) | prot;
2102                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2103                 }
2104
2105                 if (!pte) {
2106                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2107
2108                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2109                         if (!pte)
2110                                 return -ENOMEM;
2111                         /* It is a large page */
2112                         if (largepage_lvl > 1) {
2113                                 pteval |= DMA_PTE_LARGE_PAGE;
2114                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2115                                 /*
2116                                  * Ensure that old small page tables are
2117                                  * removed to make room for superpage,
2118                                  * if they exist.
2119                                  */
2120                                 dma_pte_free_pagetable(domain, iov_pfn,
2121                                                        iov_pfn + lvl_pages - 1);
2122                         } else {
2123                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2124                         }
2125
2126                 }
2127                 /* We don't need a lock here; nobody else
2128                  * touches the iova range
2129                  */
2130                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2131                 if (tmp) {
2132                         static int dumps = 5;
2133                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2134                                 iov_pfn, tmp, (unsigned long long)pteval);
2135                         if (dumps) {
2136                                 dumps--;
2137                                 debug_dma_dump_mappings(NULL);
2138                         }
2139                         WARN_ON(1);
2140                 }
2141
2142                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2143
2144                 BUG_ON(nr_pages < lvl_pages);
2145                 BUG_ON(sg_res < lvl_pages);
2146
2147                 nr_pages -= lvl_pages;
2148                 iov_pfn += lvl_pages;
2149                 phys_pfn += lvl_pages;
2150                 pteval += lvl_pages * VTD_PAGE_SIZE;
2151                 sg_res -= lvl_pages;
2152
2153                 /* If the next PTE would be the first in a new page, then we
2154                    need to flush the cache on the entries we've just written.
2155                    And then we'll need to recalculate 'pte', so clear it and
2156                    let it get set again in the if (!pte) block above.
2157
2158                    If we're done (!nr_pages) we need to flush the cache too.
2159
2160                    Also if we've been setting superpages, we may need to
2161                    recalculate 'pte' and switch back to smaller pages for the
2162                    end of the mapping, if the trailing size is not enough to
2163                    use another superpage (i.e. sg_res < lvl_pages). */
2164                 pte++;
2165                 if (!nr_pages || first_pte_in_page(pte) ||
2166                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2167                         domain_flush_cache(domain, first_pte,
2168                                            (void *)pte - (void *)first_pte);
2169                         pte = NULL;
2170                 }
2171
2172                 if (!sg_res && nr_pages)
2173                         sg = sg_next(sg);
2174         }
2175         return 0;
2176 }
2177
2178 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2179                                     struct scatterlist *sg, unsigned long nr_pages,
2180                                     int prot)
2181 {
2182         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2183 }
2184
2185 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2186                                      unsigned long phys_pfn, unsigned long nr_pages,
2187                                      int prot)
2188 {
2189         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2190 }
2191
2192 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2193 {
2194         if (!iommu)
2195                 return;
2196
2197         clear_context_table(iommu, bus, devfn);
2198         iommu->flush.flush_context(iommu, 0, 0, 0,
2199                                            DMA_CCMD_GLOBAL_INVL);
2200         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2201 }
2202
2203 static inline void unlink_domain_info(struct device_domain_info *info)
2204 {
2205         assert_spin_locked(&device_domain_lock);
2206         list_del(&info->link);
2207         list_del(&info->global);
2208         if (info->dev)
2209                 info->dev->archdata.iommu = NULL;
2210 }
2211
2212 static void domain_remove_dev_info(struct dmar_domain *domain)
2213 {
2214         struct device_domain_info *info, *tmp;
2215         unsigned long flags;
2216
2217         spin_lock_irqsave(&device_domain_lock, flags);
2218         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2219                 __dmar_remove_one_dev_info(domain, info->dev);
2220         spin_unlock_irqrestore(&device_domain_lock, flags);
2221 }
2222
2223 /*
2224  * find_domain
2225  * Note: we use struct device->archdata.iommu to store the info
2226  */
2227 static struct dmar_domain *find_domain(struct device *dev)
2228 {
2229         struct device_domain_info *info;
2230
2231         /* No lock here, assumes no domain exit in normal case */
2232         info = dev->archdata.iommu;
2233         if (info)
2234                 return info->domain;
2235         return NULL;
2236 }
2237
2238 static inline struct device_domain_info *
2239 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2240 {
2241         struct device_domain_info *info;
2242
2243         list_for_each_entry(info, &device_domain_list, global)
2244                 if (info->iommu->segment == segment && info->bus == bus &&
2245                     info->devfn == devfn)
2246                         return info;
2247
2248         return NULL;
2249 }
2250
2251 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2252                                                     int bus, int devfn,
2253                                                     struct device *dev,
2254                                                     struct dmar_domain *domain)
2255 {
2256         struct dmar_domain *found = NULL;
2257         struct device_domain_info *info;
2258         unsigned long flags;
2259         int ret;
2260
2261         info = alloc_devinfo_mem();
2262         if (!info)
2263                 return NULL;
2264
2265         info->bus = bus;
2266         info->devfn = devfn;
2267         info->dev = dev;
2268         info->domain = domain;
2269         info->iommu = iommu;
2270
2271         spin_lock_irqsave(&device_domain_lock, flags);
2272         if (dev)
2273                 found = find_domain(dev);
2274         else {
2275                 struct device_domain_info *info2;
2276                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2277                 if (info2)
2278                         found = info2->domain;
2279         }
2280         if (found) {
2281                 spin_unlock_irqrestore(&device_domain_lock, flags);
2282                 free_devinfo_mem(info);
2283                 /* Caller must free the original domain */
2284                 return found;
2285         }
2286
2287         spin_lock(&iommu->lock);
2288         ret = domain_attach_iommu(domain, iommu);
2289         spin_unlock(&iommu->lock);
2290
2291         if (ret) {
2292                 spin_unlock_irqrestore(&device_domain_lock, flags);
2293                 return NULL;
2294         }
2295
2296         list_add(&info->link, &domain->devices);
2297         list_add(&info->global, &device_domain_list);
2298         if (dev)
2299                 dev->archdata.iommu = info;
2300         spin_unlock_irqrestore(&device_domain_lock, flags);
2301
2302         if (dev && domain_context_mapping(domain, dev)) {
2303                 pr_err("Domain context map for %s failed\n", dev_name(dev));
2304                 dmar_remove_one_dev_info(domain, dev);
2305                 return NULL;
2306         }
2307
2308         return domain;
2309 }
2310
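     /*
      * Callback for pci_for_each_dma_alias(): each invocation overwrites
      * *opaque, so the caller ends up with the last alias in the walk.
      * get_domain_for_dev() uses that alias to look up or register a domain
      * shared by all devices reporting the same alias.
      */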
2311 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2312 {
2313         *(u16 *)opaque = alias;
2314         return 0;
2315 }
2316
2317 /* domain is initialized */
2318 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2319 {
2320         struct device_domain_info *info = NULL;
2321         struct dmar_domain *domain, *tmp;
2322         struct intel_iommu *iommu;
2323         unsigned long flags;
2324         u16 dma_alias;
2325         u8 bus, devfn;
2326
2327         domain = find_domain(dev);
2328         if (domain)
2329                 return domain;
2330
2331         iommu = device_to_iommu(dev, &bus, &devfn);
2332         if (!iommu)
2333                 return NULL;
2334
2335         if (dev_is_pci(dev)) {
2336                 struct pci_dev *pdev = to_pci_dev(dev);
2337
2338                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2339
2340                 spin_lock_irqsave(&device_domain_lock, flags);
2341                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2342                                                       PCI_BUS_NUM(dma_alias),
2343                                                       dma_alias & 0xff);
2344                 if (info) {
2345                         iommu = info->iommu;
2346                         domain = info->domain;
2347                 }
2348                 spin_unlock_irqrestore(&device_domain_lock, flags);
2349
2350                 /* DMA alias already has a domain, use it */
2351                 if (info)
2352                         goto found_domain;
2353         }
2354
2355         /* Allocate and initialize new domain for the device */
2356         domain = alloc_domain(0);
2357         if (!domain)
2358                 return NULL;
2359         if (domain_init(domain, iommu, gaw)) {
2360                 domain_exit(domain);
2361                 return NULL;
2362         }
2363
2364         /* register PCI DMA alias device */
2365         if (dev_is_pci(dev)) {
2366                 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2367                                                dma_alias & 0xff, NULL, domain);
2368
2369                 if (!tmp || tmp != domain) {
2370                         domain_exit(domain);
2371                         domain = tmp;
2372                 }
2373
2374                 if (!domain)
2375                         return NULL;
2376         }
2377
2378 found_domain:
2379         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2380
2381         if (!tmp || tmp != domain) {
2382                 domain_exit(domain);
2383                 domain = tmp;
2384         }
2385
2386         return domain;
2387 }
2388
2389 static int iommu_identity_mapping;
2390 #define IDENTMAP_ALL            1
2391 #define IDENTMAP_GFX            2
2392 #define IDENTMAP_AZALIA         4
2393
2394 static int iommu_domain_identity_map(struct dmar_domain *domain,
2395                                      unsigned long long start,
2396                                      unsigned long long end)
2397 {
2398         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2399         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2400
2401         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2402                           dma_to_mm_pfn(last_vpfn))) {
2403                 pr_err("Reserving iova failed\n");
2404                 return -ENOMEM;
2405         }
2406
2407         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2408         /*
2409          * RMRR range might have overlap with physical memory range,
2410          * clear it first
2411          */
2412         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2413
2414         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2415                                   last_vpfn - first_vpfn + 1,
2416                                   DMA_PTE_READ|DMA_PTE_WRITE);
2417 }
2418
2419 static int iommu_prepare_identity_map(struct device *dev,
2420                                       unsigned long long start,
2421                                       unsigned long long end)
2422 {
2423         struct dmar_domain *domain;
2424         int ret;
2425
2426         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2427         if (!domain)
2428                 return -ENOMEM;
2429
2430         /* For _hardware_ passthrough, don't bother. But for software
2431            passthrough, we do it anyway -- it may indicate a memory
2432            range which is reserved in E820 and therefore didn't get set
2433            up in the si_domain to start with */
2434         if (domain == si_domain && hw_pass_through) {
2435                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2436                         dev_name(dev), start, end);
2437                 return 0;
2438         }
2439
2440         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2441                 dev_name(dev), start, end);
2442
2443         if (end < start) {
2444                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2445                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2446                         dmi_get_system_info(DMI_BIOS_VENDOR),
2447                         dmi_get_system_info(DMI_BIOS_VERSION),
2448                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2449                 ret = -EIO;
2450                 goto error;
2451         }
2452
2453         if (end >> agaw_to_width(domain->agaw)) {
2454                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2455                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2456                      agaw_to_width(domain->agaw),
2457                      dmi_get_system_info(DMI_BIOS_VENDOR),
2458                      dmi_get_system_info(DMI_BIOS_VERSION),
2459                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2460                 ret = -EIO;
2461                 goto error;
2462         }
2463
2464         ret = iommu_domain_identity_map(domain, start, end);
2465         if (ret)
2466                 goto error;
2467
2468         return 0;
2469
2470  error:
2471         domain_exit(domain);
2472         return ret;
2473 }
2474
2475 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2476                                          struct device *dev)
2477 {
2478         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2479                 return 0;
2480         return iommu_prepare_identity_map(dev, rmrr->base_address,
2481                                           rmrr->end_address);
2482 }
2483
2484 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2485 static inline void iommu_prepare_isa(void)
2486 {
2487         struct pci_dev *pdev;
2488         int ret;
2489
2490         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2491         if (!pdev)
2492                 return;
2493
2494         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2495         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2496
2497         if (ret)
2498                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2499
2500         pci_dev_put(pdev);
2501 }
2502 #else
2503 static inline void iommu_prepare_isa(void)
2504 {
2505         return;
2506 }
2507 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2508
2509 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2510
2511 static int __init si_domain_init(int hw)
2512 {
2513         int nid, ret = 0;
2514
2515         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2516         if (!si_domain)
2517                 return -EFAULT;
2518
2519         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2520                 domain_exit(si_domain);
2521                 return -EFAULT;
2522         }
2523
2524         pr_debug("Identity mapping domain allocated\n");
2525
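             /*
              * With hardware pass-through the si_domain's page tables are
              * never walked, so there is no need to map physical memory
              * into it.
              */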
2526         if (hw)
2527                 return 0;
2528
2529         for_each_online_node(nid) {
2530                 unsigned long start_pfn, end_pfn;
2531                 int i;
2532
2533                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2534                         ret = iommu_domain_identity_map(si_domain,
2535                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2536                         if (ret)
2537                                 return ret;
2538                 }
2539         }
2540
2541         return 0;
2542 }
2543
2544 static int identity_mapping(struct device *dev)
2545 {
2546         struct device_domain_info *info;
2547
2548         if (likely(!iommu_identity_mapping))
2549                 return 0;
2550
2551         info = dev->archdata.iommu;
2552         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2553                 return (info->domain == si_domain);
2554
2555         return 0;
2556 }
2557
2558 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2559 {
2560         struct dmar_domain *ndomain;
2561         struct intel_iommu *iommu;
2562         u8 bus, devfn;
2563
2564         iommu = device_to_iommu(dev, &bus, &devfn);
2565         if (!iommu)
2566                 return -ENODEV;
2567
2568         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2569         if (ndomain != domain)
2570                 return -EBUSY;
2571
2572         return 0;
2573 }
2574
2575 static bool device_has_rmrr(struct device *dev)
2576 {
2577         struct dmar_rmrr_unit *rmrr;
2578         struct device *tmp;
2579         int i;
2580
2581         rcu_read_lock();
2582         for_each_rmrr_units(rmrr) {
2583                 /*
2584                  * Return TRUE if this RMRR contains the device that
2585                  * is passed in.
2586                  */
2587                 for_each_active_dev_scope(rmrr->devices,
2588                                           rmrr->devices_cnt, i, tmp)
2589                         if (tmp == dev) {
2590                                 rcu_read_unlock();
2591                                 return true;
2592                         }
2593         }
2594         rcu_read_unlock();
2595         return false;
2596 }
2597
2598 /*
2599  * There are a couple cases where we need to restrict the functionality of
2600  * devices associated with RMRRs.  The first is when evaluating a device for
2601  * identity mapping because problems exist when devices are moved in and out
2602  * of domains and their respective RMRR information is lost.  This means that
2603  * a device with associated RMRRs will never be in a "passthrough" domain.
2604  * The second is use of the device through the IOMMU API.  This interface
2605  * expects to have full control of the IOVA space for the device.  We cannot
2606  * satisfy both the requirement that RMRR access is maintained and have an
2607  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2608  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2609  * We therefore prevent devices associated with an RMRR from participating in
2610  * the IOMMU API, which eliminates them from device assignment.
2611  *
2612  * In both cases we assume that PCI USB devices with RMRRs have them largely
2613  * for historical reasons and that the RMRR space is not actively used post
2614  * boot.  This exclusion may change if vendors begin to abuse it.
2615  *
2616  * The same exception is made for graphics devices, with the requirement that
2617  * any use of the RMRR regions will be torn down before assigning the device
2618  * to a guest.
2619  */
2620 static bool device_is_rmrr_locked(struct device *dev)
2621 {
2622         if (!device_has_rmrr(dev))
2623                 return false;
2624
2625         if (dev_is_pci(dev)) {
2626                 struct pci_dev *pdev = to_pci_dev(dev);
2627
2628                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2629                         return false;
2630         }
2631
2632         return true;
2633 }
2634
2635 static int iommu_should_identity_map(struct device *dev, int startup)
2636 {
2637
2638         if (dev_is_pci(dev)) {
2639                 struct pci_dev *pdev = to_pci_dev(dev);
2640
2641                 if (device_is_rmrr_locked(dev))
2642                         return 0;
2643
2644                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2645                         return 1;
2646
2647                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2648                         return 1;
2649
2650                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2651                         return 0;
2652
2653                 /*
2654                  * We want to start off with all devices in the 1:1 domain, and
2655                  * take them out later if we find they can't access all of memory.
2656                  *
2657                  * However, we can't do this for PCI devices behind bridges,
2658                  * because all PCI devices behind the same bridge will end up
2659                  * with the same source-id on their transactions.
2660                  *
2661                  * Practically speaking, we can't change things around for these
2662                  * devices at run-time, because we can't be sure there'll be no
2663                  * DMA transactions in flight for any of their siblings.
2664                  *
2665                  * So PCI devices (unless they're on the root bus) as well as
2666                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2667                  * the 1:1 domain, just in _case_ one of their siblings turns out
2668                  * not to be able to map all of memory.
2669                  */
2670                 if (!pci_is_pcie(pdev)) {
2671                         if (!pci_is_root_bus(pdev->bus))
2672                                 return 0;
2673                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2674                                 return 0;
2675                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2676                         return 0;
2677         } else {
2678                 if (device_has_rmrr(dev))
2679                         return 0;
2680         }
2681
2682         /*
2683          * At boot time, we don't yet know if devices will be 64-bit capable.
2684          * Assume that they will — if they turn out not to be, then we can
2685          * take them out of the 1:1 domain later.
2686          */
2687         if (!startup) {
2688                 /*
2689                  * If the device's dma_mask is less than the system's memory
2690                  * size then this is not a candidate for identity mapping.
2691                  */
2692                 u64 dma_mask = *dev->dma_mask;
2693
2694                 if (dev->coherent_dma_mask &&
2695                     dev->coherent_dma_mask < dma_mask)
2696                         dma_mask = dev->coherent_dma_mask;
2697
2698                 return dma_mask >= dma_get_required_mask(dev);
2699         }
2700
2701         return 1;
2702 }
2703
2704 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2705 {
2706         int ret;
2707
2708         if (!iommu_should_identity_map(dev, 1))
2709                 return 0;
2710
2711         ret = domain_add_dev_info(si_domain, dev);
2712         if (!ret)
2713                 pr_info("%s identity mapping for device %s\n",
2714                         hw ? "Hardware" : "Software", dev_name(dev));
2715         else if (ret == -ENODEV)
2716                 /* device not associated with an iommu */
2717                 ret = 0;
2718
2719         return ret;
2720 }
2721
2722
2723 static int __init iommu_prepare_static_identity_mapping(int hw)
2724 {
2725         struct pci_dev *pdev = NULL;
2726         struct dmar_drhd_unit *drhd;
2727         struct intel_iommu *iommu;
2728         struct device *dev;
2729         int i;
2730         int ret = 0;
2731
2732         for_each_pci_dev(pdev) {
2733                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2734                 if (ret)
2735                         return ret;
2736         }
2737
2738         for_each_active_iommu(iommu, drhd)
2739                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2740                         struct acpi_device_physical_node *pn;
2741                         struct acpi_device *adev;
2742
2743                         if (dev->bus != &acpi_bus_type)
2744                                 continue;
2745
2746                         adev = to_acpi_device(dev);
2747                         mutex_lock(&adev->physical_node_lock);
2748                         list_for_each_entry(pn, &adev->physical_node_list, node) {
2749                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2750                                 if (ret)
2751                                         break;
2752                         }
2753                         mutex_unlock(&adev->physical_node_lock);
2754                         if (ret)
2755                                 return ret;
2756                 }
2757
2758         return 0;
2759 }
2760
2761 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2762 {
2763         /*
2764          * Start from a sane iommu hardware state.
2765          * If queued invalidation was already initialized by us
2766          * (for example, while enabling interrupt remapping), then
2767          * things are already rolling from a sane state.
2768          */
2769         if (!iommu->qi) {
2770                 /*
2771                  * Clear any previous faults.
2772                  */
2773                 dmar_fault(-1, iommu);
2774                 /*
2775                  * Disable queued invalidation if supported and already enabled
2776                  * before OS handover.
2777                  */
2778                 dmar_disable_qi(iommu);
2779         }
2780
2781         if (dmar_enable_qi(iommu)) {
2782                 /*
2783                  * Queued Invalidate not enabled, use Register Based Invalidate
2784                  */
2785                 iommu->flush.flush_context = __iommu_flush_context;
2786                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2787                 pr_info("%s: Using Register based invalidation\n",
2788                         iommu->name);
2789         } else {
2790                 iommu->flush.flush_context = qi_flush_context;
2791                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2792                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2793         }
2794 }
2795
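     /*
      * With the extended (ECS) root/context format each context entry is
      * twice the size, so a 4KiB table only covers 128 device functions:
      * devfns 0-127 come from the lower context-table pointer and 128-255
      * from the upper one.  That is why tbl[] uses two slots per bus
      * (tbl_idx = bus * 2) and the entry index is (devfn * 2) % 256.
      */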
2796 static int copy_context_table(struct intel_iommu *iommu,
2797                               struct root_entry *old_re,
2798                               struct context_entry **tbl,
2799                               int bus, bool ext)
2800 {
2801         struct context_entry *old_ce = NULL, *new_ce = NULL, ce;
2802         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2803         phys_addr_t old_ce_phys;
2804
2805         tbl_idx = ext ? bus * 2 : bus;
2806
2807         for (devfn = 0; devfn < 256; devfn++) {
2808                 /* First calculate the correct index */
2809                 idx = (ext ? devfn * 2 : devfn) % 256;
2810
2811                 if (idx == 0) {
2812                         /* First save what we may have and clean up */
2813                         if (new_ce) {
2814                                 tbl[tbl_idx] = new_ce;
2815                                 __iommu_flush_cache(iommu, new_ce,
2816                                                     VTD_PAGE_SIZE);
2817                                 pos = 1;
2818                         }
2819
2820                         if (old_ce)
2821                                 iounmap(old_ce);
2822
2823                         ret = 0;
2824                         if (devfn < 0x80)
2825                                 old_ce_phys = root_entry_lctp(old_re);
2826                         else
2827                                 old_ce_phys = root_entry_uctp(old_re);
2828
2829                         if (!old_ce_phys) {
2830                                 if (ext && devfn == 0) {
2831                                         /* No LCTP, try UCTP */
2832                                         devfn = 0x7f;
2833                                         continue;
2834                                 } else {
2835                                         goto out;
2836                                 }
2837                         }
2838
2839                         ret = -ENOMEM;
2840                         old_ce = ioremap_cache(old_ce_phys, PAGE_SIZE);
2841                         if (!old_ce)
2842                                 goto out;
2843
2844                         new_ce = alloc_pgtable_page(iommu->node);
2845                         if (!new_ce)
2846                                 goto out_unmap;
2847
2848                         ret = 0;
2849                 }
2850
2851                 /* Now copy the context entry */
2852                 ce = old_ce[idx];
2853
2854                 if (!__context_present(&ce))
2855                         continue;
2856
2857                 did = context_domain_id(&ce);
2858                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2859                         set_bit(did, iommu->domain_ids);
2860
2861                 /*
2862                  * We need a marker for copied context entries. This
2863                  * marker needs to work for the old format as well as
2864                  * for extended context entries.
2865                  *
2866                  * Bit 67 of the context entry is used. In the old
2867                  * format this bit is available to software, in the
2868                  * extended format it is the PGE bit, but PGE is ignored
2869                  * by HW if PASIDs are disabled (and thus still
2870                  * available).
2871                  *
2872                  * So disable PASIDs first and then mark the entry
2873                  * copied. This means that we don't copy PASID
2874                  * translations from the old kernel, but this is fine as
2875                  * faults there are not fatal.
2876                  */
2877                 context_clear_pasid_enable(&ce);
2878                 context_set_copied(&ce);
2879
2880                 new_ce[idx] = ce;
2881         }
2882
2883         tbl[tbl_idx + pos] = new_ce;
2884
2885         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2886
2887 out_unmap:
2888         iounmap(old_ce);
2889
2890 out:
2891         return ret;
2892 }
2893
2894 static int copy_translation_tables(struct intel_iommu *iommu)
2895 {
2896         struct context_entry **ctxt_tbls;
2897         struct root_entry *old_rt;
2898         phys_addr_t old_rt_phys;
2899         int ctxt_table_entries;
2900         unsigned long flags;
2901         u64 rtaddr_reg;
2902         int bus, ret;
2903         bool new_ext, ext;
2904
2905         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2906         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2907         new_ext    = !!ecap_ecs(iommu->ecap);
2908
2909         /*
2910          * The RTT bit can only be changed when translation is disabled,
2911          * but disabling translation means to open a window for data
2912          * corruption. So bail out and don't copy anything if we would
2913          * have to change the bit.
2914          */
2915         if (new_ext != ext)
2916                 return -EINVAL;
2917
2918         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2919         if (!old_rt_phys)
2920                 return -EINVAL;
2921
2922         old_rt = ioremap_cache(old_rt_phys, PAGE_SIZE);
2923         if (!old_rt)
2924                 return -ENOMEM;
2925
2926         /* This is too big for the stack - allocate it from slab */
2927         ctxt_table_entries = ext ? 512 : 256;
2928         ret = -ENOMEM;
2929         ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
2930         if (!ctxt_tbls)
2931                 goto out_unmap;
2932
2933         for (bus = 0; bus < 256; bus++) {
2934                 ret = copy_context_table(iommu, &old_rt[bus],
2935                                          ctxt_tbls, bus, ext);
2936                 if (ret) {
2937                         pr_err("%s: Failed to copy context table for bus %d\n",
2938                                 iommu->name, bus);
2939                         continue;
2940                 }
2941         }
2942
2943         spin_lock_irqsave(&iommu->lock, flags);
2944
2945         /* Context tables are copied, now write them to the root_entry table */
2946         for (bus = 0; bus < 256; bus++) {
2947                 int idx = ext ? bus * 2 : bus;
2948                 u64 val;
2949
2950                 if (ctxt_tbls[idx]) {
2951                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2952                         iommu->root_entry[bus].lo = val;
2953                 }
2954
2955                 if (!ext || !ctxt_tbls[idx + 1])
2956                         continue;
2957
2958                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2959                 iommu->root_entry[bus].hi = val;
2960         }
2961
2962         spin_unlock_irqrestore(&iommu->lock, flags);
2963
2964         kfree(ctxt_tbls);
2965
2966         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2967
2968         ret = 0;
2969
2970 out_unmap:
2971         iounmap(old_rt);
2972
2973         return ret;
2974 }
2975
2976 static int __init init_dmars(void)
2977 {
2978         struct dmar_drhd_unit *drhd;
2979         struct dmar_rmrr_unit *rmrr;
2980         bool copied_tables = false;
2981         struct device *dev;
2982         struct intel_iommu *iommu;
2983         int i, ret;
2984
2985         /*
2986          * for each drhd
2987          *    allocate root
2988          *    initialize and program root entry to not present
2989          * endfor
2990          */
2991         for_each_drhd_unit(drhd) {
2992                 /*
2993                  * Lock not needed as this is only incremented in the
2994                  * single-threaded kernel __init code path; all other
2995                  * accesses are read only.
2996                  */
2997                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
2998                         g_num_of_iommus++;
2999                         continue;
3000                 }
3001                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3002         }
3003
3004         /* Preallocate enough resources for IOMMU hot-addition */
3005         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3006                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3007
3008         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3009                         GFP_KERNEL);
3010         if (!g_iommus) {
3011                 pr_err("Allocating global iommu array failed\n");
3012                 ret = -ENOMEM;
3013                 goto error;
3014         }
3015
3016         deferred_flush = kzalloc(g_num_of_iommus *
3017                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
3018         if (!deferred_flush) {
3019                 ret = -ENOMEM;
3020                 goto free_g_iommus;
3021         }
3022
3023         for_each_active_iommu(iommu, drhd) {
3024                 g_iommus[iommu->seq_id] = iommu;
3025
3026                 intel_iommu_init_qi(iommu);
3027
3028                 ret = iommu_init_domains(iommu);
3029                 if (ret)
3030                         goto free_iommu;
3031
3032                 init_translation_status(iommu);
3033
3034                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3035                         iommu_disable_translation(iommu);
3036                         clear_translation_pre_enabled(iommu);
3037                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3038                                 iommu->name);
3039                 }
3040
3041                 /*
3042                  * TBD:
3043                  * We could share the same root and context tables
3044                  * among all IOMMUs. This needs to be split out later.
3045                  */
3046                 ret = iommu_alloc_root_entry(iommu);
3047                 if (ret)
3048                         goto free_iommu;
3049
3050                 if (translation_pre_enabled(iommu)) {
3051                         pr_info("Translation already enabled - trying to copy translation structures\n");
3052
3053                         ret = copy_translation_tables(iommu);
3054                         if (ret) {
3055                                 /*
3056                                  * We found the IOMMU with translation
3057                                  * enabled - but failed to copy over the
3058                                  * old root-entry table. Try to proceed
3059                                  * by disabling translation now and
3060                                  * allocating a clean root-entry table.
3061                                  * This might cause DMAR faults, but
3062                                  * probably the dump will still succeed.
3063                                  */
3064                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3065                                        iommu->name);
3066                                 iommu_disable_translation(iommu);
3067                                 clear_translation_pre_enabled(iommu);
3068                         } else {
3069                                 pr_info("Copied translation tables from previous kernel for %s\n",
3070                                         iommu->name);
3071                                 copied_tables = true;
3072                         }
3073                 }
3074
3075                 iommu_flush_write_buffer(iommu);
3076                 iommu_set_root_entry(iommu);
3077                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3078                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3079
3080                 if (!ecap_pass_through(iommu->ecap))
3081                         hw_pass_through = 0;
3082         }
3083
3084         if (iommu_pass_through)
3085                 iommu_identity_mapping |= IDENTMAP_ALL;
3086
3087 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3088         iommu_identity_mapping |= IDENTMAP_GFX;
3089 #endif
3090
3091         if (iommu_identity_mapping) {
3092                 ret = si_domain_init(hw_pass_through);
3093                 if (ret)
3094                         goto free_iommu;
3095         }
3096
3097         check_tylersburg_isoch();
3098
3099         /*
3100          * If we copied translations from a previous kernel in the kdump
3101          * case, we cannot assign the devices to domains now, as that
3102          * would eliminate the old mappings. So skip this part and defer
3103          * the assignment to device driver initialization time.
3104          */
3105         if (copied_tables)
3106                 goto domains_done;
3107
3108         /*
3109          * If pass-through is not set or not enabled, set up context entries
3110          * with identity mappings for RMRR, graphics and ISA devices, falling
3111          * back to the static identity mapping when iommu_identity_mapping is set.
3112          */
3113         if (iommu_identity_mapping) {
3114                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3115                 if (ret) {
3116                         pr_crit("Failed to setup IOMMU pass-through\n");
3117                         goto free_iommu;
3118                 }
3119         }
3120         /*
3121          * For each rmrr
3122          *   for each dev attached to rmrr
3123          *   do
3124          *     locate drhd for dev, alloc domain for dev
3125          *     allocate free domain
3126          *     allocate page table entries for rmrr
3127          *     if context not allocated for bus
3128          *           allocate and init context
3129          *           set present in root table for this bus
3130          *     init context with domain, translation etc
3131          *    endfor
3132          * endfor
3133          */
3134         pr_info("Setting RMRR:\n");
3135         for_each_rmrr_units(rmrr) {
3136                 /* Some BIOSes list non-existent devices in the DMAR table. */
3137                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3138                                           i, dev) {
3139                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3140                         if (ret)
3141                                 pr_err("Mapping reserved region failed\n");
3142                 }
3143         }
3144
3145         iommu_prepare_isa();
3146
3147 domains_done:
3148
3149         /*
3150          * for each drhd
3151          *   enable fault log
3152          *   global invalidate context cache
3153          *   global invalidate iotlb
3154          *   enable translation
3155          */
3156         for_each_iommu(iommu, drhd) {
3157                 if (drhd->ignored) {
3158                         /*
3159                          * we always have to disable PMRs or DMA may fail on
3160                          * this device
3161                          */
3162                         if (force_on)
3163                                 iommu_disable_protect_mem_regions(iommu);
3164                         continue;
3165                 }
3166
3167                 iommu_flush_write_buffer(iommu);
3168
3169                 ret = dmar_set_interrupt(iommu);
3170                 if (ret)
3171                         goto free_iommu;
3172
3173                 if (!translation_pre_enabled(iommu))
3174                         iommu_enable_translation(iommu);
3175
3176                 iommu_disable_protect_mem_regions(iommu);
3177         }
3178
3179         return 0;
3180
3181 free_iommu:
3182         for_each_active_iommu(iommu, drhd) {
3183                 disable_dmar_iommu(iommu);
3184                 free_dmar_iommu(iommu);
3185         }
3186         kfree(deferred_flush);
3187 free_g_iommus:
3188         kfree(g_iommus);
3189 error:
3190         return ret;
3191 }
3192
3193 /* This takes a number of _MM_ pages, not VTD pages */
3194 static struct iova *intel_alloc_iova(struct device *dev,
3195                                      struct dmar_domain *domain,
3196                                      unsigned long nrpages, uint64_t dma_mask)
3197 {
3198         struct iova *iova = NULL;
3199
3200         /* Restrict dma_mask to the width that the iommu can handle */
3201         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3202
3203         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3204                 /*
3205                  * First try to allocate an io virtual address in
3206                  * DMA_BIT_MASK(32) and if that fails then try allocating
3207                  * from higher range
3208                  */
3209                 iova = alloc_iova(&domain->iovad, nrpages,
3210                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
3211                 if (iova)
3212                         return iova;
3213         }
3214         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
3215         if (unlikely(!iova)) {
3216                 pr_err("Allocating %ld-page iova for %s failed\n",
3217                        nrpages, dev_name(dev));
3218                 return NULL;
3219         }
3220
3221         return iova;
3222 }
3223
3224 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
3225 {
3226         struct dmar_domain *domain;
3227
3228         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3229         if (!domain) {
3230                 pr_err("Allocating domain for %s failed\n",
3231                        dev_name(dev));
3232                 return NULL;
3233         }
3234
3235         return domain;
3236 }
3237
3238 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3239 {
3240         struct device_domain_info *info;
3241
3242         /* No locking here; we assume the domain does not go away in the normal case */
3243         info = dev->archdata.iommu;
3244         if (likely(info))
3245                 return info->domain;
3246
3247         return __get_valid_domain_for_dev(dev);
3248 }
3249
3250 /* Check if the device needs to go through the non-identity map and unmap process. */
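/*
 * Returns 1 if the device bypasses the IOMMU (dummy device or identity
 * mapped), 0 if DMA for it must go through the normal map/unmap path.
 */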
3251 static int iommu_no_mapping(struct device *dev)
3252 {
3253         int found;
3254
3255         if (iommu_dummy(dev))
3256                 return 1;
3257
3258         if (!iommu_identity_mapping)
3259                 return 0;
3260
3261         found = identity_mapping(dev);
3262         if (found) {
3263                 if (iommu_should_identity_map(dev, 0))
3264                         return 1;
3265                 else {
3266                         /*
3267                          * A 32 bit DMA device is removed from si_domain and
3268                          * falls back to non-identity mapping.
3269                          */
3270                         dmar_remove_one_dev_info(si_domain, dev);
3271                         pr_info("32bit %s uses non-identity mapping\n",
3272                                 dev_name(dev));
3273                         return 0;
3274                 }
3275         } else {
3276                 /*
3277                  * When a 64 bit DMA device is detached from a VM, it is put
3278                  * back into si_domain for identity mapping.
3279                  */
3280                 if (iommu_should_identity_map(dev, 0)) {
3281                         int ret;
3282                         ret = domain_add_dev_info(si_domain, dev);
3283                         if (!ret) {
3284                                 pr_info("64bit %s uses identity mapping\n",
3285                                         dev_name(dev));
3286                                 return 1;
3287                         }
3288                 }
3289         }
3290
3291         return 0;
3292 }
3293
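/*
 * Map a single physical range for DMA: find (or create) the device's domain,
 * allocate an IOVA below dma_mask, install the page-table entries and flush
 * the IOTLB or write buffer as required by caching mode.  Returns the DMA
 * address, or 0 on failure.
 */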
3294 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3295                                      size_t size, int dir, u64 dma_mask)
3296 {
3297         struct dmar_domain *domain;
3298         phys_addr_t start_paddr;
3299         struct iova *iova;
3300         int prot = 0;
3301         int ret;
3302         struct intel_iommu *iommu;
3303         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3304
3305         BUG_ON(dir == DMA_NONE);
3306
3307         if (iommu_no_mapping(dev))
3308                 return paddr;
3309
3310         domain = get_valid_domain_for_dev(dev);
3311         if (!domain)
3312                 return 0;
3313
3314         iommu = domain_get_iommu(domain);
3315         size = aligned_nrpages(paddr, size);
3316
3317         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3318         if (!iova)
3319                 goto error;
3320
3321         /*
3322          * Check if the DMAR supports zero-length reads on write-only
3323          * mappings.
3324          */
3325         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3326                         !cap_zlr(iommu->cap))
3327                 prot |= DMA_PTE_READ;
3328         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3329                 prot |= DMA_PTE_WRITE;
3330         /*
3331          * The range paddr .. paddr + size might cover partial pages, so we
3332          * map whole pages.  Note: if two parts of one page are mapped
3333          * separately, we might end up with two guest addresses mapping to
3334          * the same host paddr, but this is not a big problem.
3335          */
3336         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3337                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3338         if (ret)
3339                 goto error;
3340
3341         /* It's a non-present to present mapping; only flush the IOTLB in caching mode */
3342         if (cap_caching_mode(iommu->cap))
3343                 iommu_flush_iotlb_psi(iommu, domain,
3344                                       mm_to_dma_pfn(iova->pfn_lo),
3345                                       size, 0, 1);
3346         else
3347                 iommu_flush_write_buffer(iommu);
3348
3349         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3350         start_paddr += paddr & ~PAGE_MASK;
3351         return start_paddr;
3352
3353 error:
3354         if (iova)
3355                 __free_iova(&domain->iovad, iova);
3356         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3357                 dev_name(dev), size, (unsigned long long)paddr, dir);
3358         return 0;
3359 }
3360
3361 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3362                                  unsigned long offset, size_t size,
3363                                  enum dma_data_direction dir,
3364                                  struct dma_attrs *attrs)
3365 {
3366         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3367                                   dir, *dev->dma_mask);
3368 }
3369
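/*
 * Drain the per-IOMMU deferred-unmap queues: invalidate the IOTLB (globally,
 * or per-range in caching mode), then free the queued IOVAs and page lists.
 * Callers must hold async_umap_flush_lock.
 */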
3370 static void flush_unmaps(void)
3371 {
3372         int i, j;
3373
3374         timer_on = 0;
3375
3376         /* just flush them all */
3377         for (i = 0; i < g_num_of_iommus; i++) {
3378                 struct intel_iommu *iommu = g_iommus[i];
3379                 if (!iommu)
3380                         continue;
3381
3382                 if (!deferred_flush[i].next)
3383                         continue;
3384
3385                 /* In caching mode, global flushes make emulation expensive */
3386                 if (!cap_caching_mode(iommu->cap))
3387                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3388                                          DMA_TLB_GLOBAL_FLUSH);
3389                 for (j = 0; j < deferred_flush[i].next; j++) {
3390                         unsigned long mask;
3391                         struct iova *iova = deferred_flush[i].iova[j];
3392                         struct dmar_domain *domain = deferred_flush[i].domain[j];
3393
3394                         /* On real hardware multiple invalidations are expensive */
3395                         if (cap_caching_mode(iommu->cap))
3396                                 iommu_flush_iotlb_psi(iommu, domain,
3397                                         iova->pfn_lo, iova_size(iova),
3398                                         !deferred_flush[i].freelist[j], 0);
3399                         else {
3400                                 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3401                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3402                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3403                         }
3404                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3405                         if (deferred_flush[i].freelist[j])
3406                                 dma_free_pagelist(deferred_flush[i].freelist[j]);
3407                 }
3408                 deferred_flush[i].next = 0;
3409         }
3410
3411         list_size = 0;
3412 }
3413
3414 static void flush_unmaps_timeout(unsigned long data)
3415 {
3416         unsigned long flags;
3417
3418         spin_lock_irqsave(&async_umap_flush_lock, flags);
3419         flush_unmaps();
3420         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3421 }
3422
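/*
 * Queue an IOVA range for deferred invalidation and freeing.  When the queue
 * reaches HIGH_WATER_MARK it is drained immediately; otherwise a 10ms timer
 * makes sure flush_unmaps() eventually runs.
 */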
3423 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3424 {
3425         unsigned long flags;
3426         int next, iommu_id;
3427         struct intel_iommu *iommu;
3428
3429         spin_lock_irqsave(&async_umap_flush_lock, flags);
3430         if (list_size == HIGH_WATER_MARK)
3431                 flush_unmaps();
3432
3433         iommu = domain_get_iommu(dom);
3434         iommu_id = iommu->seq_id;
3435
3436         next = deferred_flush[iommu_id].next;
3437         deferred_flush[iommu_id].domain[next] = dom;
3438         deferred_flush[iommu_id].iova[next] = iova;
3439         deferred_flush[iommu_id].freelist[next] = freelist;
3440         deferred_flush[iommu_id].next++;
3441
3442         if (!timer_on) {
3443                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3444                 timer_on = 1;
3445         }
3446         list_size++;
3447         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3448 }
3449
3450 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3451 {
3452         struct dmar_domain *domain;
3453         unsigned long start_pfn, last_pfn;
3454         struct iova *iova;
3455         struct intel_iommu *iommu;
3456         struct page *freelist;
3457
3458         if (iommu_no_mapping(dev))
3459                 return;
3460
3461         domain = find_domain(dev);
3462         BUG_ON(!domain);
3463
3464         iommu = domain_get_iommu(domain);
3465
3466         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3467         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3468                       (unsigned long long)dev_addr))
3469                 return;
3470
3471         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3472         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3473
3474         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3475                  dev_name(dev), start_pfn, last_pfn);
3476
3477         freelist = domain_unmap(domain, start_pfn, last_pfn);
3478
3479         if (intel_iommu_strict) {
3480                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3481                                       last_pfn - start_pfn + 1, !freelist, 0);
3482                 /* free iova */
3483                 __free_iova(&domain->iovad, iova);
3484                 dma_free_pagelist(freelist);
3485         } else {
3486                 add_unmap(domain, iova, freelist);
3487                 /*
3488                  * Queue up the release of the unmap to save the 1/6th of
3489                  * the CPU otherwise used up by the iotlb flush operation.
3490                  */
3491         }
3492 }
3493
3494 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3495                              size_t size, enum dma_data_direction dir,
3496                              struct dma_attrs *attrs)
3497 {
3498         intel_unmap(dev, dev_addr);
3499 }
3500
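/*
 * Coherent allocation: when the device is translated by the IOMMU the
 * GFP_DMA/GFP_DMA32 zone restrictions are dropped; the IOVA for the mapping
 * is allocated below dev->coherent_dma_mask instead.
 */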
3501 static void *intel_alloc_coherent(struct device *dev, size_t size,
3502                                   dma_addr_t *dma_handle, gfp_t flags,
3503                                   struct dma_attrs *attrs)
3504 {
3505         struct page *page = NULL;
3506         int order;
3507
3508         size = PAGE_ALIGN(size);
3509         order = get_order(size);
3510
3511         if (!iommu_no_mapping(dev))
3512                 flags &= ~(GFP_DMA | GFP_DMA32);
3513         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3514                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3515                         flags |= GFP_DMA;
3516                 else
3517                         flags |= GFP_DMA32;
3518         }
3519
3520         if (flags & __GFP_WAIT) {
3521                 unsigned int count = size >> PAGE_SHIFT;
3522
3523                 page = dma_alloc_from_contiguous(dev, count, order);
3524                 if (page && iommu_no_mapping(dev) &&
3525                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3526                         dma_release_from_contiguous(dev, page, count);
3527                         page = NULL;
3528                 }
3529         }
3530
3531         if (!page)
3532                 page = alloc_pages(flags, order);
3533         if (!page)
3534                 return NULL;
3535         memset(page_address(page), 0, size);
3536
3537         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3538                                          DMA_BIDIRECTIONAL,
3539                                          dev->coherent_dma_mask);
3540         if (*dma_handle)
3541                 return page_address(page);
3542         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3543                 __free_pages(page, order);
3544
3545         return NULL;
3546 }
3547
3548 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3549                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3550 {
3551         int order;
3552         struct page *page = virt_to_page(vaddr);
3553
3554         size = PAGE_ALIGN(size);
3555         order = get_order(size);
3556
3557         intel_unmap(dev, dma_handle);
3558         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3559                 __free_pages(page, order);
3560 }
3561
3562 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3563                            int nelems, enum dma_data_direction dir,
3564                            struct dma_attrs *attrs)
3565 {
3566         intel_unmap(dev, sglist[0].dma_address);
3567 }
3568
3569 static int intel_nontranslate_map_sg(struct device *hddev,
3570         struct scatterlist *sglist, int nelems, int dir)
3571 {
3572         int i;
3573         struct scatterlist *sg;
3574
3575         for_each_sg(sglist, sg, nelems, i) {
3576                 BUG_ON(!sg_page(sg));
3577                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3578                 sg->dma_length = sg->length;
3579         }
3580         return nelems;
3581 }
3582
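/*
 * Map a scatterlist: allocate one IOVA range large enough for all segments
 * and map them contiguously in DMA address space.  Untranslated devices fall
 * back to intel_nontranslate_map_sg().
 */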
3583 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3584                         enum dma_data_direction dir, struct dma_attrs *attrs)
3585 {
3586         int i;
3587         struct dmar_domain *domain;
3588         size_t size = 0;
3589         int prot = 0;
3590         struct iova *iova = NULL;
3591         int ret;
3592         struct scatterlist *sg;
3593         unsigned long start_vpfn;
3594         struct intel_iommu *iommu;
3595
3596         BUG_ON(dir == DMA_NONE);
3597         if (iommu_no_mapping(dev))
3598                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3599
3600         domain = get_valid_domain_for_dev(dev);
3601         if (!domain)
3602                 return 0;
3603
3604         iommu = domain_get_iommu(domain);
3605
3606         for_each_sg(sglist, sg, nelems, i)
3607                 size += aligned_nrpages(sg->offset, sg->length);
3608
3609         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3610                                 *dev->dma_mask);
3611         if (!iova) {
3612                 sglist->dma_length = 0;
3613                 return 0;
3614         }
3615
3616         /*
3617          * Check if the DMAR supports zero-length reads on write-only
3618          * mappings.
3619          */
3620         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3621                         !cap_zlr(iommu->cap))
3622                 prot |= DMA_PTE_READ;
3623         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3624                 prot |= DMA_PTE_WRITE;
3625
3626         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3627
3628         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3629         if (unlikely(ret)) {
3630                 dma_pte_free_pagetable(domain, start_vpfn,
3631                                        start_vpfn + size - 1);
3632                 __free_iova(&domain->iovad, iova);
3633                 return 0;
3634         }
3635
3636         /* It's a non-present to present mapping; only flush the IOTLB in caching mode */
3637         if (cap_caching_mode(iommu->cap))
3638                 iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3639         else
3640                 iommu_flush_write_buffer(iommu);
3641
3642         return nelems;
3643 }
3644
3645 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3646 {
3647         return !dma_addr;
3648 }
3649
3650 struct dma_map_ops intel_dma_ops = {
3651         .alloc = intel_alloc_coherent,
3652         .free = intel_free_coherent,
3653         .map_sg = intel_map_sg,
3654         .unmap_sg = intel_unmap_sg,
3655         .map_page = intel_map_page,
3656         .unmap_page = intel_unmap_page,
3657         .mapping_error = intel_mapping_error,
3658 };
3659
3660 static inline int iommu_domain_cache_init(void)
3661 {
3662         int ret = 0;
3663
3664         iommu_domain_cache = kmem_cache_create("iommu_domain",
3665                                          sizeof(struct dmar_domain),
3666                                          0,
3667                                          SLAB_HWCACHE_ALIGN,
3668                                          NULL);
3670         if (!iommu_domain_cache) {
3671                 pr_err("Couldn't create iommu_domain cache\n");
3672                 ret = -ENOMEM;
3673         }
3674
3675         return ret;
3676 }
3677
3678 static inline int iommu_devinfo_cache_init(void)
3679 {
3680         int ret = 0;
3681
3682         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3683                                          sizeof(struct device_domain_info),
3684                                          0,
3685                                          SLAB_HWCACHE_ALIGN,
3686                                          NULL);
3687         if (!iommu_devinfo_cache) {
3688                 pr_err("Couldn't create devinfo cache\n");
3689                 ret = -ENOMEM;
3690         }
3691
3692         return ret;
3693 }
3694
3695 static int __init iommu_init_mempool(void)
3696 {
3697         int ret;
3698         ret = iommu_iova_cache_init();
3699         if (ret)
3700                 return ret;
3701
3702         ret = iommu_domain_cache_init();
3703         if (ret)
3704                 goto domain_error;
3705
3706         ret = iommu_devinfo_cache_init();
3707         if (!ret)
3708                 return ret;
3709
3710         kmem_cache_destroy(iommu_domain_cache);
3711 domain_error:
3712         iommu_iova_cache_destroy();
3713
3714         return -ENOMEM;
3715 }
3716
3717 static void __init iommu_exit_mempool(void)
3718 {
3719         kmem_cache_destroy(iommu_devinfo_cache);
3720         kmem_cache_destroy(iommu_domain_cache);
3721         iommu_iova_cache_destroy();
3722 }
3723
3724 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3725 {
3726         struct dmar_drhd_unit *drhd;
3727         u32 vtbar;
3728         int rc;
3729
3730         /* We know that this device on this chipset has its own IOMMU.
3731          * If we find it under a different IOMMU, then the BIOS is lying
3732          * to us. Hope that the IOMMU for this device is actually
3733          * disabled, and it needs no translation...
3734          */
3735         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3736         if (rc) {
3737                 /* "can't" happen */
3738                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3739                 return;
3740         }
3741         vtbar &= 0xffff0000;
3742
3743         /* We know that this iommu should be at offset 0xa000 from vtbar */
3744         drhd = dmar_find_matched_drhd_unit(pdev);
3745         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3746                             TAINT_FIRMWARE_WORKAROUND,
3747                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3748                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3749 }
3750 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3751
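/*
 * Mark DMAR units that can be skipped: units whose device scope contains no
 * devices at all, and (unless dmar_map_gfx is set) units that cover only
 * graphics devices, whose devices then get the dummy domain info.
 */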
3752 static void __init init_no_remapping_devices(void)
3753 {
3754         struct dmar_drhd_unit *drhd;
3755         struct device *dev;
3756         int i;
3757
3758         for_each_drhd_unit(drhd) {
3759                 if (!drhd->include_all) {
3760                         for_each_active_dev_scope(drhd->devices,
3761                                                   drhd->devices_cnt, i, dev)
3762                                 break;
3763                         /* ignore DMAR unit if no devices exist */
3764                         if (i == drhd->devices_cnt)
3765                                 drhd->ignored = 1;
3766                 }
3767         }
3768
3769         for_each_active_drhd_unit(drhd) {
3770                 if (drhd->include_all)
3771                         continue;
3772
3773                 for_each_active_dev_scope(drhd->devices,
3774                                           drhd->devices_cnt, i, dev)
3775                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3776                                 break;
3777                 if (i < drhd->devices_cnt)
3778                         continue;
3779
3780                 /* This IOMMU has *only* gfx devices. Either bypass it or
3781                    set the gfx_mapped flag, as appropriate */
3782                 if (dmar_map_gfx) {
3783                         intel_iommu_gfx_mapped = 1;
3784                 } else {
3785                         drhd->ignored = 1;
3786                         for_each_active_dev_scope(drhd->devices,
3787                                                   drhd->devices_cnt, i, dev)
3788                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3789                 }
3790         }
3791 }
3792
3793 #ifdef CONFIG_SUSPEND
3794 static int init_iommu_hw(void)
3795 {
3796         struct dmar_drhd_unit *drhd;
3797         struct intel_iommu *iommu = NULL;
3798
3799         for_each_active_iommu(iommu, drhd)
3800                 if (iommu->qi)
3801                         dmar_reenable_qi(iommu);
3802
3803         for_each_iommu(iommu, drhd) {
3804                 if (drhd->ignored) {
3805                         /*
3806                          * we always have to disable PMRs or DMA may fail on
3807                          * this device
3808                          */
3809                         if (force_on)
3810                                 iommu_disable_protect_mem_regions(iommu);
3811                         continue;
3812                 }
3813
3814                 iommu_flush_write_buffer(iommu);
3815
3816                 iommu_set_root_entry(iommu);
3817
3818                 iommu->flush.flush_context(iommu, 0, 0, 0,
3819                                            DMA_CCMD_GLOBAL_INVL);
3820                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3821                 iommu_enable_translation(iommu);
3822                 iommu_disable_protect_mem_regions(iommu);
3823         }
3824
3825         return 0;
3826 }
3827
3828 static void iommu_flush_all(void)
3829 {
3830         struct dmar_drhd_unit *drhd;
3831         struct intel_iommu *iommu;
3832
3833         for_each_active_iommu(iommu, drhd) {
3834                 iommu->flush.flush_context(iommu, 0, 0, 0,
3835                                            DMA_CCMD_GLOBAL_INVL);
3836                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3837                                          DMA_TLB_GLOBAL_FLUSH);
3838         }
3839 }
3840
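/*
 * Suspend: flush all caches, disable translation and save the fault-event
 * registers so iommu_resume() can restore them after init_iommu_hw().
 */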
3841 static int iommu_suspend(void)
3842 {
3843         struct dmar_drhd_unit *drhd;
3844         struct intel_iommu *iommu = NULL;
3845         unsigned long flag;
3846
3847         for_each_active_iommu(iommu, drhd) {
3848                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3849                                                  GFP_ATOMIC);
3850                 if (!iommu->iommu_state)
3851                         goto nomem;
3852         }
3853
3854         iommu_flush_all();
3855
3856         for_each_active_iommu(iommu, drhd) {
3857                 iommu_disable_translation(iommu);
3858
3859                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3860
3861                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3862                         readl(iommu->reg + DMAR_FECTL_REG);
3863                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3864                         readl(iommu->reg + DMAR_FEDATA_REG);
3865                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3866                         readl(iommu->reg + DMAR_FEADDR_REG);
3867                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3868                         readl(iommu->reg + DMAR_FEUADDR_REG);
3869
3870                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3871         }
3872         return 0;
3873
3874 nomem:
3875         for_each_active_iommu(iommu, drhd)
3876                 kfree(iommu->iommu_state);
3877
3878         return -ENOMEM;
3879 }
3880
3881 static void iommu_resume(void)
3882 {
3883         struct dmar_drhd_unit *drhd;
3884         struct intel_iommu *iommu = NULL;
3885         unsigned long flag;
3886
3887         if (init_iommu_hw()) {
3888                 if (force_on)
3889                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3890                 else
3891                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3892                 return;
3893         }
3894
3895         for_each_active_iommu(iommu, drhd) {
3896
3897                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3898
3899                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3900                         iommu->reg + DMAR_FECTL_REG);
3901                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3902                         iommu->reg + DMAR_FEDATA_REG);
3903                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3904                         iommu->reg + DMAR_FEADDR_REG);
3905                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3906                         iommu->reg + DMAR_FEUADDR_REG);
3907
3908                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3909         }
3910
3911         for_each_active_iommu(iommu, drhd)
3912                 kfree(iommu->iommu_state);
3913 }
3914
3915 static struct syscore_ops iommu_syscore_ops = {
3916         .resume         = iommu_resume,
3917         .suspend        = iommu_suspend,
3918 };
3919
3920 static void __init init_iommu_pm_ops(void)
3921 {
3922         register_syscore_ops(&iommu_syscore_ops);
3923 }
3924
3925 #else
3926 static inline void init_iommu_pm_ops(void) {}
3927 #endif  /* CONFIG_SUSPEND */
3928
3929
3930 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3931 {
3932         struct acpi_dmar_reserved_memory *rmrr;
3933         struct dmar_rmrr_unit *rmrru;
3934
3935         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3936         if (!rmrru)
3937                 return -ENOMEM;
3938
3939         rmrru->hdr = header;
3940         rmrr = (struct acpi_dmar_reserved_memory *)header;
3941         rmrru->base_address = rmrr->base_address;
3942         rmrru->end_address = rmrr->end_address;
3943         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3944                                 ((void *)rmrr) + rmrr->header.length,
3945                                 &rmrru->devices_cnt);
3946         if (rmrru->devices_cnt && rmrru->devices == NULL) {
3947                 kfree(rmrru);
3948                 return -ENOMEM;
3949         }
3950
3951         list_add(&rmrru->list, &dmar_rmrr_units);
3952
3953         return 0;
3954 }
3955
3956 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3957 {
3958         struct dmar_atsr_unit *atsru;
3959         struct acpi_dmar_atsr *tmp;
3960
3961         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3962                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3963                 if (atsr->segment != tmp->segment)
3964                         continue;
3965                 if (atsr->header.length != tmp->header.length)
3966                         continue;
3967                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3968                         return atsru;
3969         }
3970
3971         return NULL;
3972 }
3973
3974 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3975 {
3976         struct acpi_dmar_atsr *atsr;
3977         struct dmar_atsr_unit *atsru;
3978
3979         if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
3980                 return 0;
3981
3982         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3983         atsru = dmar_find_atsr(atsr);
3984         if (atsru)
3985                 return 0;
3986
3987         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3988         if (!atsru)
3989                 return -ENOMEM;
3990
3991         /*
3992          * If memory is allocated from slab by ACPI _DSM method, we need to
3993          * copy the memory content because the memory buffer will be freed
3994          * on return.
3995          */
3996         atsru->hdr = (void *)(atsru + 1);
3997         memcpy(atsru->hdr, hdr, hdr->length);
3998         atsru->include_all = atsr->flags & 0x1;
3999         if (!atsru->include_all) {
4000                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4001                                 (void *)atsr + atsr->header.length,
4002                                 &atsru->devices_cnt);
4003                 if (atsru->devices_cnt && atsru->devices == NULL) {
4004                         kfree(atsru);
4005                         return -ENOMEM;
4006                 }
4007         }
4008
4009         list_add_rcu(&atsru->list, &dmar_atsr_units);
4010
4011         return 0;
4012 }
4013
4014 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4015 {
4016         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4017         kfree(atsru);
4018 }
4019
4020 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4021 {
4022         struct acpi_dmar_atsr *atsr;
4023         struct dmar_atsr_unit *atsru;
4024
4025         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4026         atsru = dmar_find_atsr(atsr);
4027         if (atsru) {
4028                 list_del_rcu(&atsru->list);
4029                 synchronize_rcu();
4030                 intel_iommu_free_atsr(atsru);
4031         }
4032
4033         return 0;
4034 }
4035
4036 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4037 {
4038         int i;
4039         struct device *dev;
4040         struct acpi_dmar_atsr *atsr;
4041         struct dmar_atsr_unit *atsru;
4042
4043         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4044         atsru = dmar_find_atsr(atsr);
4045         if (!atsru)
4046                 return 0;
4047
4048         if (!atsru->include_all && atsru->devices && atsru->devices_cnt)
4049                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4050                                           i, dev)
4051                         return -EBUSY;
4052
4053         return 0;
4054 }
4055
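/*
 * Bring up a hot-added DMAR unit: verify it matches the capabilities the
 * running configuration relies on (pass-through, snooping, superpages), then
 * initialize domains, root entry, QI, interrupts and translation.
 */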
4056 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4057 {
4058         int sp, ret = 0;
4059         struct intel_iommu *iommu = dmaru->iommu;
4060
4061         if (g_iommus[iommu->seq_id])
4062                 return 0;
4063
4064         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4065                 pr_warn("%s: Doesn't support hardware pass through.\n",
4066                         iommu->name);
4067                 return -ENXIO;
4068         }
4069         if (!ecap_sc_support(iommu->ecap) &&
4070             domain_update_iommu_snooping(iommu)) {
4071                 pr_warn("%s: Doesn't support snooping.\n",
4072                         iommu->name);
4073                 return -ENXIO;
4074         }
4075         sp = domain_update_iommu_superpage(iommu) - 1;
4076         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4077                 pr_warn("%s: Doesn't support large page.\n",
4078                         iommu->name);
4079                 return -ENXIO;
4080         }
4081
4082         /*
4083          * Disable translation if already enabled prior to OS handover.
4084          */
4085         if (iommu->gcmd & DMA_GCMD_TE)
4086                 iommu_disable_translation(iommu);
4087
4088         g_iommus[iommu->seq_id] = iommu;
4089         ret = iommu_init_domains(iommu);
4090         if (ret == 0)
4091                 ret = iommu_alloc_root_entry(iommu);
4092         if (ret)
4093                 goto out;
4094
4095         if (dmaru->ignored) {
4096                 /*
4097                  * we always have to disable PMRs or DMA may fail on this device
4098                  */
4099                 if (force_on)
4100                         iommu_disable_protect_mem_regions(iommu);
4101                 return 0;
4102         }
4103
4104         intel_iommu_init_qi(iommu);
4105         iommu_flush_write_buffer(iommu);
4106         ret = dmar_set_interrupt(iommu);
4107         if (ret)
4108                 goto disable_iommu;
4109
4110         iommu_set_root_entry(iommu);
4111         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4112         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4113         iommu_enable_translation(iommu);
4114
4115         iommu_disable_protect_mem_regions(iommu);
4116         return 0;
4117
4118 disable_iommu:
4119         disable_dmar_iommu(iommu);
4120 out:
4121         free_dmar_iommu(iommu);
4122         return ret;
4123 }
4124
4125 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4126 {
4127         int ret = 0;
4128         struct intel_iommu *iommu = dmaru->iommu;
4129
4130         if (!intel_iommu_enabled)
4131                 return 0;
4132         if (iommu == NULL)
4133                 return -EINVAL;
4134
4135         if (insert) {
4136                 ret = intel_iommu_add(dmaru);
4137         } else {
4138                 disable_dmar_iommu(iommu);
4139                 free_dmar_iommu(iommu);
4140         }
4141
4142         return ret;
4143 }
4144
4145 static void intel_iommu_free_dmars(void)
4146 {
4147         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4148         struct dmar_atsr_unit *atsru, *atsr_n;
4149
4150         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4151                 list_del(&rmrru->list);
4152                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4153                 kfree(rmrru);
4154         }
4155
4156         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4157                 list_del(&atsru->list);
4158                 intel_iommu_free_atsr(atsru);
4159         }
4160 }
4161
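/*
 * Returns 1 if the device sits below a root port covered by an ATSR unit
 * (or by an include-all ATSR) in its PCI segment, 0 otherwise.
 */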
4162 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4163 {
4164         int i, ret = 1;
4165         struct pci_bus *bus;
4166         struct pci_dev *bridge = NULL;
4167         struct device *tmp;
4168         struct acpi_dmar_atsr *atsr;
4169         struct dmar_atsr_unit *atsru;
4170
4171         dev = pci_physfn(dev);
4172         for (bus = dev->bus; bus; bus = bus->parent) {
4173                 bridge = bus->self;
4174                 if (!bridge || !pci_is_pcie(bridge) ||
4175                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4176                         return 0;
4177                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4178                         break;
4179         }
4180         if (!bridge)
4181                 return 0;
4182
4183         rcu_read_lock();
4184         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4185                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4186                 if (atsr->segment != pci_domain_nr(dev->bus))
4187                         continue;
4188
4189                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4190                         if (tmp == &bridge->dev)
4191                                 goto out;
4192
4193                 if (atsru->include_all)
4194                         goto out;
4195         }
4196         ret = 0;
4197 out:
4198         rcu_read_unlock();
4199
4200         return ret;
4201 }
4202
4203 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4204 {
4205         int ret = 0;
4206         struct dmar_rmrr_unit *rmrru;
4207         struct dmar_atsr_unit *atsru;
4208         struct acpi_dmar_atsr *atsr;
4209         struct acpi_dmar_reserved_memory *rmrr;
4210
4211         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
4212                 return 0;
4213
4214         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4215                 rmrr = container_of(rmrru->hdr,
4216                                     struct acpi_dmar_reserved_memory, header);
4217                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4218                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4219                                 ((void *)rmrr) + rmrr->header.length,
4220                                 rmrr->segment, rmrru->devices,
4221                                 rmrru->devices_cnt);
4222                         if (ret < 0)
4223                                 return ret;
4224                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4225                         dmar_remove_dev_scope(info, rmrr->segment,
4226                                 rmrru->devices, rmrru->devices_cnt);
4227                 }
4228         }
4229
4230         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4231                 if (atsru->include_all)
4232                         continue;
4233
4234                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4235                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4236                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4237                                         (void *)atsr + atsr->header.length,
4238                                         atsr->segment, atsru->devices,
4239                                         atsru->devices_cnt);
4240                         if (ret > 0)
4241                                 break;
4242                         else if (ret < 0)
4243                                 return ret;
4244                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4245                         if (dmar_remove_dev_scope(info, atsr->segment,
4246                                         atsru->devices, atsru->devices_cnt))
4247                                 break;
4248                 }
4249         }
4250
4251         return 0;
4252 }
4253
4254 /*
4255  * Here we only respond to a device being unbound from its driver.
4256  *
4257  * A newly added device is not attached to its DMAR domain here yet. That
4258  * happens when the device is first mapped to an iova.
4259  */
4260 static int device_notifier(struct notifier_block *nb,
4261                                   unsigned long action, void *data)
4262 {
4263         struct device *dev = data;
4264         struct dmar_domain *domain;
4265
4266         if (iommu_dummy(dev))
4267                 return 0;
4268
4269         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4270                 return 0;
4271
4272         domain = find_domain(dev);
4273         if (!domain)
4274                 return 0;
4275
4276         dmar_remove_one_dev_info(domain, dev);
4277         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4278                 domain_exit(domain);
4279
4280         return 0;
4281 }
4282
4283 static struct notifier_block device_nb = {
4284         .notifier_call = device_notifier,
4285 };
4286
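/*
 * Keep the static identity (si) domain in sync with memory hotplug: newly
 * onlined ranges get an identity mapping, offlined ranges are unmapped and
 * their IOTLB entries invalidated on every IOMMU.
 */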
4287 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4288                                        unsigned long val, void *v)
4289 {
4290         struct memory_notify *mhp = v;
4291         unsigned long long start, end;
4292         unsigned long start_vpfn, last_vpfn;
4293
4294         switch (val) {
4295         case MEM_GOING_ONLINE:
4296                 start = mhp->start_pfn << PAGE_SHIFT;
4297                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4298                 if (iommu_domain_identity_map(si_domain, start, end)) {
4299                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4300                                 start, end);
4301                         return NOTIFY_BAD;
4302                 }
4303                 break;
4304
4305         case MEM_OFFLINE:
4306         case MEM_CANCEL_ONLINE:
4307                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4308                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4309                 while (start_vpfn <= last_vpfn) {
4310                         struct iova *iova;
4311                         struct dmar_drhd_unit *drhd;
4312                         struct intel_iommu *iommu;
4313                         struct page *freelist;
4314
4315                         iova = find_iova(&si_domain->iovad, start_vpfn);
4316                         if (iova == NULL) {
4317                                 pr_debug("Failed get IOVA for PFN %lx\n",
4318                                          start_vpfn);
4319                                 break;
4320                         }
4321
4322                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4323                                                      start_vpfn, last_vpfn);
4324                         if (iova == NULL) {
4325                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4326                                         start_vpfn, last_vpfn);
4327                                 return NOTIFY_BAD;
4328                         }
4329
4330                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4331                                                iova->pfn_hi);
4332
4333                         rcu_read_lock();
4334                         for_each_active_iommu(iommu, drhd)
4335                                 iommu_flush_iotlb_psi(iommu, si_domain,
4336                                         iova->pfn_lo, iova_size(iova),
4337                                         !freelist, 0);
4338                         rcu_read_unlock();
4339                         dma_free_pagelist(freelist);
4340
4341                         start_vpfn = iova->pfn_hi + 1;
4342                         free_iova_mem(iova);
4343                 }
4344                 break;
4345         }
4346
4347         return NOTIFY_OK;
4348 }
4349
4350 static struct notifier_block intel_iommu_memory_nb = {
4351         .notifier_call = intel_iommu_memory_notifier,
4352         .priority = 0
4353 };
4354
4355
4356 static ssize_t intel_iommu_show_version(struct device *dev,
4357                                         struct device_attribute *attr,
4358                                         char *buf)
4359 {
4360         struct intel_iommu *iommu = dev_get_drvdata(dev);
4361         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4362         return sprintf(buf, "%d:%d\n",
4363                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4364 }
4365 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4366
4367 static ssize_t intel_iommu_show_address(struct device *dev,
4368                                         struct device_attribute *attr,
4369                                         char *buf)
4370 {
4371         struct intel_iommu *iommu = dev_get_drvdata(dev);
4372         return sprintf(buf, "%llx\n", iommu->reg_phys);
4373 }
4374 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4375
4376 static ssize_t intel_iommu_show_cap(struct device *dev,
4377                                     struct device_attribute *attr,
4378                                     char *buf)
4379 {
4380         struct intel_iommu *iommu = dev_get_drvdata(dev);
4381         return sprintf(buf, "%llx\n", iommu->cap);
4382 }
4383 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4384
4385 static ssize_t intel_iommu_show_ecap(struct device *dev,
4386                                     struct device_attribute *attr,
4387                                     char *buf)
4388 {
4389         struct intel_iommu *iommu = dev_get_drvdata(dev);
4390         return sprintf(buf, "%llx\n", iommu->ecap);
4391 }
4392 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4393
4394 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4395                                       struct device_attribute *attr,
4396                                       char *buf)
4397 {
4398         struct intel_iommu *iommu = dev_get_drvdata(dev);
4399         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4400 }
4401 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4402
4403 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4404                                            struct device_attribute *attr,
4405                                            char *buf)
4406 {
4407         struct intel_iommu *iommu = dev_get_drvdata(dev);
4408         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4409                                                   cap_ndoms(iommu->cap)));
4410 }
4411 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4412
4413 static struct attribute *intel_iommu_attrs[] = {
4414         &dev_attr_version.attr,
4415         &dev_attr_address.attr,
4416         &dev_attr_cap.attr,
4417         &dev_attr_ecap.attr,
4418         &dev_attr_domains_supported.attr,
4419         &dev_attr_domains_used.attr,
4420         NULL,
4421 };
4422
4423 static struct attribute_group intel_iommu_group = {
4424         .name = "intel-iommu",
4425         .attrs = intel_iommu_attrs,
4426 };
4427
4428 const struct attribute_group *intel_iommu_groups[] = {
4429         &intel_iommu_group,
4430         NULL,
4431 };
4432
4433 int __init intel_iommu_init(void)
4434 {
4435         int ret = -ENODEV;
4436         struct dmar_drhd_unit *drhd;
4437         struct intel_iommu *iommu;
4438
4439         /* VT-d is required for a TXT/tboot launch, so enforce that */
4440         force_on = tboot_force_iommu();
4441
4442         if (iommu_init_mempool()) {
4443                 if (force_on)
4444                         panic("tboot: Failed to initialize iommu memory\n");
4445                 return -ENOMEM;
4446         }
4447
4448         down_write(&dmar_global_lock);
4449         if (dmar_table_init()) {
4450                 if (force_on)
4451                         panic("tboot: Failed to initialize DMAR table\n");
4452                 goto out_free_dmar;
4453         }
4454
4455         if (dmar_dev_scope_init() < 0) {
4456                 if (force_on)
4457                         panic("tboot: Failed to initialize DMAR device scope\n");
4458                 goto out_free_dmar;
4459         }
4460
4461         if (no_iommu || dmar_disabled)
4462                 goto out_free_dmar;
4463
4464         if (list_empty(&dmar_rmrr_units))
4465                 pr_info("No RMRR found\n");
4466
4467         if (list_empty(&dmar_atsr_units))
4468                 pr_info("No ATSR found\n");
4469
4470         if (dmar_init_reserved_ranges()) {
4471                 if (force_on)
4472                         panic("tboot: Failed to reserve iommu ranges\n");
4473                 goto out_free_reserved_range;
4474         }
4475
4476         init_no_remapping_devices();
4477
4478         ret = init_dmars();
4479         if (ret) {
4480                 if (force_on)
4481                         panic("tboot: Failed to initialize DMARs\n");
4482                 pr_err("Initialization failed\n");
4483                 goto out_free_reserved_range;
4484         }
4485         up_write(&dmar_global_lock);
4486         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4487
4488         init_timer(&unmap_timer);
4489 #ifdef CONFIG_SWIOTLB
4490         swiotlb = 0;
4491 #endif
4492         dma_ops = &intel_dma_ops;
4493
4494         init_iommu_pm_ops();
4495
4496         for_each_active_iommu(iommu, drhd)
4497                 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4498                                                        intel_iommu_groups,
4499                                                        "%s", iommu->name);
4500
4501         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4502         bus_register_notifier(&pci_bus_type, &device_nb);
4503         if (si_domain && !hw_pass_through)
4504                 register_memory_notifier(&intel_iommu_memory_nb);
4505
4506         intel_iommu_enabled = 1;
4507
4508         return 0;
4509
4510 out_free_reserved_range:
4511         put_iova_domain(&reserved_iova_list);
4512 out_free_dmar:
4513         intel_iommu_free_dmars();
4514         up_write(&dmar_global_lock);
4515         iommu_exit_mempool();
4516         return ret;
4517 }
4518
4519 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4520 {
4521         struct intel_iommu *iommu = opaque;
4522
4523         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4524         return 0;
4525 }
4526
4527 /*
4528  * NB - intel-iommu lacks any sort of reference counting for the users of
4529  * dependent devices.  If multiple endpoints have intersecting dependent
4530  * devices, unbinding the driver from any one of them will possibly leave
4531  * the others unable to operate.
4532  */
4533 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4534 {
4535         if (!iommu || !dev || !dev_is_pci(dev))
4536                 return;
4537
4538         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4539 }
4540
4541 static void __dmar_remove_one_dev_info(struct dmar_domain *domain,
4542                                        struct device *dev)
4543 {
4544         struct device_domain_info *info;
4545         struct intel_iommu *iommu;
4546         unsigned long flags;
4547         u8 bus, devfn;
4548
4549         assert_spin_locked(&device_domain_lock);
4550
4551         iommu = device_to_iommu(dev, &bus, &devfn);
4552         if (!iommu)
4553                 return;
4554
4555         info = dev->archdata.iommu;
4556
4557         if (WARN_ON(!info))
4558                 return;
4559
4560         unlink_domain_info(info);
4561
4562         iommu_disable_dev_iotlb(info);
4563         domain_context_clear(iommu, dev);
4564         free_devinfo_mem(info);
4565
4566         spin_lock_irqsave(&iommu->lock, flags);
4567         domain_detach_iommu(domain, iommu);
4568         spin_unlock_irqrestore(&iommu->lock, flags);
4569 }
4570
4571 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4572                                      struct device *dev)
4573 {
4574         unsigned long flags;
4575
4576         spin_lock_irqsave(&device_domain_lock, flags);
4577         __dmar_remove_one_dev_info(domain, dev);
4578         spin_unlock_irqrestore(&device_domain_lock, flags);
4579 }
4580
4581 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4582 {
4583         int adjust_width;
4584
4585         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4586                         DMA_32BIT_PFN);
4587         domain_reserve_special_ranges(domain);
4588
4589         /* calculate AGAW */
4590         domain->gaw = guest_width;
4591         adjust_width = guestwidth_to_adjustwidth(guest_width);
4592         domain->agaw = width_to_agaw(adjust_width);
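        /* e.g. the default 48-bit guest width needs no rounding and ends up
           with a 4-level page table (agaw 2). */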
4593
4594         domain->iommu_coherency = 0;
4595         domain->iommu_snooping = 0;
4596         domain->iommu_superpage = 0;
4597         domain->max_addr = 0;
4598
4599         /* always allocate the top pgd */
4600         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4601         if (!domain->pgd)
4602                 return -ENOMEM;
4603         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4604         return 0;
4605 }
4606
4607 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4608 {
4609         struct dmar_domain *dmar_domain;
4610         struct iommu_domain *domain;
4611
4612         if (type != IOMMU_DOMAIN_UNMANAGED)
4613                 return NULL;
4614
4615         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4616         if (!dmar_domain) {
4617                 pr_err("Can't allocate dmar_domain\n");
4618                 return NULL;
4619         }
4620         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4621                 pr_err("Domain initialization failed\n");
4622                 domain_exit(dmar_domain);
4623                 return NULL;
4624         }
4625         domain_update_iommu_cap(dmar_domain);
4626
4627         domain = &dmar_domain->domain;
4628         domain->geometry.aperture_start = 0;
4629         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4630         domain->geometry.force_aperture = true;
4631
4632         return domain;
4633 }
4634
4635 static void intel_iommu_domain_free(struct iommu_domain *domain)
4636 {
4637         domain_exit(to_dmar_domain(domain));
4638 }
4639
4640 static int intel_iommu_attach_device(struct iommu_domain *domain,
4641                                      struct device *dev)
4642 {
4643         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4644         struct intel_iommu *iommu;
4645         int addr_width;
4646         u8 bus, devfn;
4647
4648         if (device_is_rmrr_locked(dev)) {
4649                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4650                 return -EPERM;
4651         }
4652
4653         /* normally dev is not mapped */
4654         if (unlikely(domain_context_mapped(dev))) {
4655                 struct dmar_domain *old_domain;
4656
4657                 old_domain = find_domain(dev);
4658                 if (old_domain) {
4659                         rcu_read_lock();
4660                         dmar_remove_one_dev_info(old_domain, dev);
4661                         rcu_read_unlock();
4662
4663                         if (!domain_type_is_vm_or_si(old_domain) &&
4664                              list_empty(&old_domain->devices))
4665                                 domain_exit(old_domain);
4666                 }
4667         }
4668
4669         iommu = device_to_iommu(dev, &bus, &devfn);
4670         if (!iommu)
4671                 return -ENODEV;
4672
4673         /* check if this iommu agaw is sufficient for max mapped address */
4674         addr_width = agaw_to_width(iommu->agaw);
4675         if (addr_width > cap_mgaw(iommu->cap))
4676                 addr_width = cap_mgaw(iommu->cap);
4677
4678         if (dmar_domain->max_addr > (1LL << addr_width)) {
4679                 pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4680                        __func__, addr_width,
4681                        dmar_domain->max_addr);
4682                 return -EFAULT;
4683         }
4684         dmar_domain->gaw = addr_width;
4685
4686         /*
4687          * Knock out extra levels of page tables if necessary
4688          */
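        /* e.g. a domain built with a 4-level table that is attached to an
           IOMMU which only walks 3 levels sheds its unused top level here. */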
4689         while (iommu->agaw < dmar_domain->agaw) {
4690                 struct dma_pte *pte;
4691
4692                 pte = dmar_domain->pgd;
4693                 if (dma_pte_present(pte)) {
4694                         dmar_domain->pgd = (struct dma_pte *)
4695                                 phys_to_virt(dma_pte_addr(pte));
4696                         free_pgtable_page(pte);
4697                 }
4698                 dmar_domain->agaw--;
4699         }
4700
4701         return domain_add_dev_info(dmar_domain, dev);
4702 }
4703
4704 static void intel_iommu_detach_device(struct iommu_domain *domain,
4705                                       struct device *dev)
4706 {
4707         dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
4708 }
4709
4710 static int intel_iommu_map(struct iommu_domain *domain,
4711                            unsigned long iova, phys_addr_t hpa,
4712                            size_t size, int iommu_prot)
4713 {
4714         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4715         u64 max_addr;
4716         int prot = 0;
4717         int ret;
4718
4719         if (iommu_prot & IOMMU_READ)
4720                 prot |= DMA_PTE_READ;
4721         if (iommu_prot & IOMMU_WRITE)
4722                 prot |= DMA_PTE_WRITE;
4723         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4724                 prot |= DMA_PTE_SNP;
4725
4726         max_addr = iova + size;
4727         if (dmar_domain->max_addr < max_addr) {
4728                 u64 end;
4729
4730                 /* check if minimum agaw is sufficient for mapped address */
4731                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4732                 if (end < max_addr) {
4733                         pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4734                                __func__, dmar_domain->gaw,
4735                                max_addr);
4736                         return -EFAULT;
4737                 }
4738                 dmar_domain->max_addr = max_addr;
4739         }
4740         /* Round size up to the next multiple of PAGE_SIZE if it, together
4741            with the low bits of hpa, would spill onto the next page. */
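        /* e.g. hpa = 0x1ff0 with size = 0x20 straddles a page boundary, so
           two 4KiB page frames are mapped even though size < PAGE_SIZE. */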
4742         size = aligned_nrpages(hpa, size);
4743         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4744                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4745         return ret;
4746 }
4747
4748 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4749                                 unsigned long iova, size_t size)
4750 {
4751         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4752         struct page *freelist = NULL;
4753         struct intel_iommu *iommu;
4754         unsigned long start_pfn, last_pfn;
4755         unsigned int npages;
4756         int iommu_id, level = 0;
4757
4758         /* Cope with horrid API which requires us to unmap more than the
4759            size argument if it happens to be a large-page mapping. */
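        /* e.g. a 4KiB request that lands inside a 2MiB superpage mapping is
           widened here, and the full 2MiB is torn down and reported back. */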
4760         if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4761                 BUG();
4762
4763         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4764                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4765
4766         start_pfn = iova >> VTD_PAGE_SHIFT;
4767         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4768
4769         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4770
4771         npages = last_pfn - start_pfn + 1;
4772
4773         for_each_domain_iommu(iommu_id, dmar_domain) {
4774                 iommu = g_iommus[iommu_id];
4775
4776                 iommu_flush_iotlb_psi(iommu, dmar_domain,
4777                                       start_pfn, npages, !freelist, 0);
4778         }
4779
4780         dma_free_pagelist(freelist);
4781
4782         if (dmar_domain->max_addr == iova + size)
4783                 dmar_domain->max_addr = iova;
4784
4785         return size;
4786 }
4787
4788 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4789                                             dma_addr_t iova)
4790 {
4791         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4792         struct dma_pte *pte;
4793         int level = 0;
4794         u64 phys = 0;
4795
4796         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4797         if (pte)
4798                 phys = dma_pte_addr(pte);
4799
4800         return phys;
4801 }
4802
4803 static bool intel_iommu_capable(enum iommu_cap cap)
4804 {
4805         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4806                 return domain_update_iommu_snooping(NULL) == 1;
4807         if (cap == IOMMU_CAP_INTR_REMAP)
4808                 return irq_remapping_enabled == 1;
4809
4810         return false;
4811 }
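/*
 * These capabilities are reported through iommu_capable(); VFIO, for
 * instance, uses IOMMU_CAP_CACHE_COHERENCY to decide whether it can set
 * IOMMU_CACHE on the mappings it creates.
 */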
4812
4813 static int intel_iommu_add_device(struct device *dev)
4814 {
4815         struct intel_iommu *iommu;
4816         struct iommu_group *group;
4817         u8 bus, devfn;
4818
4819         iommu = device_to_iommu(dev, &bus, &devfn);
4820         if (!iommu)
4821                 return -ENODEV;
4822
4823         iommu_device_link(iommu->iommu_dev, dev);
4824
4825         group = iommu_group_get_for_dev(dev);
4826
4827         if (IS_ERR(group))
4828                 return PTR_ERR(group);
4829
4830         iommu_group_put(group);
4831         return 0;
4832 }
4833
4834 static void intel_iommu_remove_device(struct device *dev)
4835 {
4836         struct intel_iommu *iommu;
4837         u8 bus, devfn;
4838
4839         iommu = device_to_iommu(dev, &bus, &devfn);
4840         if (!iommu)
4841                 return;
4842
4843         iommu_group_remove_device(dev);
4844
4845         iommu_device_unlink(iommu->iommu_dev, dev);
4846 }
4847
4848 static const struct iommu_ops intel_iommu_ops = {
4849         .capable        = intel_iommu_capable,
4850         .domain_alloc   = intel_iommu_domain_alloc,
4851         .domain_free    = intel_iommu_domain_free,
4852         .attach_dev     = intel_iommu_attach_device,
4853         .detach_dev     = intel_iommu_detach_device,
4854         .map            = intel_iommu_map,
4855         .unmap          = intel_iommu_unmap,
4856         .map_sg         = default_iommu_map_sg,
4857         .iova_to_phys   = intel_iommu_iova_to_phys,
4858         .add_device     = intel_iommu_add_device,
4859         .remove_device  = intel_iommu_remove_device,
4860         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4861 };
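
/*
 * Illustrative sketch, not part of the original driver: roughly how a
 * kernel consumer reaches the callbacks above through the generic IOMMU
 * API from include/linux/iommu.h (already included by this file).  The
 * function name and the IOVA are made up for the example, and paddr is
 * assumed to be page-aligned; only the iommu_* calls themselves are real.
 */
static int __maybe_unused example_vtd_domain_usage(struct pci_dev *pdev,
                                                   phys_addr_t paddr)
{
        struct iommu_domain *domain;
        int ret;

        /* Ends up in intel_iommu_domain_alloc() for devices on the PCI bus. */
        domain = iommu_domain_alloc(&pci_bus_type);
        if (!domain)
                return -ENOMEM;

        /* intel_iommu_attach_device(): RMRR lock-out and AGAW checks. */
        ret = iommu_attach_device(domain, &pdev->dev);
        if (ret)
                goto out_free;

        /* intel_iommu_map() populates the second-level page tables. */
        ret = iommu_map(domain, 0x100000, paddr, VTD_PAGE_SIZE,
                        IOMMU_READ | IOMMU_WRITE);
        if (ret)
                goto out_detach;

        /* intel_iommu_iova_to_phys() walks the tables back to paddr. */
        WARN_ON(iommu_iova_to_phys(domain, 0x100000) != paddr);

        /* intel_iommu_unmap() tears the mapping down again. */
        iommu_unmap(domain, 0x100000, VTD_PAGE_SIZE);

out_detach:
        iommu_detach_device(domain, &pdev->dev);
out_free:
        iommu_domain_free(domain);
        return ret;
}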
4862
4863 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4864 {
4865         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4866         pr_info("Disabling IOMMU for graphics on this chipset\n");
4867         dmar_map_gfx = 0;
4868 }
4869
4870 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4871 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4872 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4873 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4874 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4875 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4876 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4877
4878 static void quirk_iommu_rwbf(struct pci_dev *dev)
4879 {
4880         /*
4881          * Mobile 4 Series Chipset neglects to set RWBF capability,
4882          * but needs it. Same seems to hold for the desktop versions.
4883          */
4884         pr_info("Forcing write-buffer flush capability\n");
4885         rwbf_quirk = 1;
4886 }
4887
4888 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4889 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4890 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4891 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4892 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4893 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4894 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4895
4896 #define GGC 0x52
4897 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4898 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4899 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4900 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4901 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4902 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4903 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4904 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
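/*
 * GGC is the graphics control word in the integrated-graphics host
 * bridge's config space.  As used by quirk_calpella_no_shadow_gtt()
 * below, the field masked here encodes how much stolen memory the BIOS
 * reserved for the GTT, and the *_VT values indicate that room for the
 * VT-d shadow GTT was included.
 */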
4905
4906 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4907 {
4908         unsigned short ggc;
4909
4910         if (pci_read_config_word(dev, GGC, &ggc))
4911                 return;
4912
4913         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4914                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4915                 dmar_map_gfx = 0;
4916         } else if (dmar_map_gfx) {
4917                 /* we have to ensure the gfx device is idle before we flush */
4918                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
4919                 intel_iommu_strict = 1;
4920         }
4921 }
4922 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4923 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4924 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4925 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4926
4927 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4928    ISOCH DMAR unit for the Azalia sound device, but not give it any
4929    TLB entries, which causes it to deadlock. Check for that.  We do
4930    this in a function called from init_dmars(), instead of in a PCI
4931    quirk, because we don't want to print the obnoxious "BIOS broken"
4932    message if VT-d is actually disabled.
4933 */
4934 static void __init check_tylersburg_isoch(void)
4935 {
4936         struct pci_dev *pdev;
4937         uint32_t vtisochctrl;
4938
4939         /* If there's no Azalia in the system anyway, forget it. */
4940         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4941         if (!pdev)
4942                 return;
4943         pci_dev_put(pdev);
4944
4945         /* System Management Registers. Might be hidden, in which case
4946            we can't do the sanity check. But that's OK, because the
4947            known-broken BIOSes _don't_ actually hide it, so far. */
4948         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4949         if (!pdev)
4950                 return;
4951
4952         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4953                 pci_dev_put(pdev);
4954                 return;
4955         }
4956
4957         pci_dev_put(pdev);
4958
4959         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4960         if (vtisochctrl & 1)
4961                 return;
4962
4963         /* Drop all bits other than the number of TLB entries */
4964         vtisochctrl &= 0x1c;
4965
4966         /* If we have the recommended number of TLB entries (16), fine. */
4967         if (vtisochctrl == 0x10)
4968                 return;
4969
4970         /* Zero TLB entries? You get to ride the short bus to school. */
4971         if (!vtisochctrl) {
4972                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4973                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4974                      dmi_get_system_info(DMI_BIOS_VENDOR),
4975                      dmi_get_system_info(DMI_BIOS_VERSION),
4976                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4977                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4978                 return;
4979         }
4980
4981         pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4982                 vtisochctrl);
4983 }