1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/timer.h>
37 #include <linux/io.h>
38 #include <linux/iova.h>
39 #include <linux/iommu.h>
40 #include <linux/intel-iommu.h>
41 #include <linux/syscore_ops.h>
42 #include <linux/tboot.h>
43 #include <linux/dmi.h>
44 #include <linux/pci-ats.h>
45 #include <linux/memblock.h>
46 #include <linux/dma-contiguous.h>
47 #include <linux/crash_dump.h>
48 #include <asm/irq_remapping.h>
49 #include <asm/cacheflush.h>
50 #include <asm/iommu.h>
51
52 #include "irq_remapping.h"
53
54 #define ROOT_SIZE               VTD_PAGE_SIZE
55 #define CONTEXT_SIZE            VTD_PAGE_SIZE
56
57 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
58 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
59 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
60 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61
62 #define IOAPIC_RANGE_START      (0xfee00000)
63 #define IOAPIC_RANGE_END        (0xfeefffff)
64 #define IOVA_START_ADDR         (0x1000)
65
66 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
67
68 #define MAX_AGAW_WIDTH 64
69 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70
71 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
72 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73
74 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
75    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
76 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
77                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
78 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
79
80 /* IO virtual address start page frame number */
81 #define IOVA_START_PFN          (1)
82
83 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
84 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
85 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
86
87 /* page table handling */
88 #define LEVEL_STRIDE            (9)
89 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
90
91 /*
92  * This bitmap is used to advertise the page sizes our hardware supports
93  * to the IOMMU core, which will then use this information to split
94  * physically contiguous memory regions it is mapping into page sizes
95  * that we support.
96  *
97  * Traditionally the IOMMU core just handed us the mappings directly,
98  * after making sure the size is an order of a 4KiB page and that the
99  * mapping has natural alignment.
100  *
101  * To retain this behavior, we currently advertise that we support
102  * all page sizes that are an order of 4KiB.
103  *
104  * If at some point we'd like to utilize the IOMMU core's new behavior,
105  * we could change this to advertise the real page sizes we support.
106  */
107 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
108
109 static inline int agaw_to_level(int agaw)
110 {
111         return agaw + 2;
112 }
113
114 static inline int agaw_to_width(int agaw)
115 {
116         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
117 }
118
119 static inline int width_to_agaw(int width)
120 {
121         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
122 }
123
124 static inline unsigned int level_to_offset_bits(int level)
125 {
126         return (level - 1) * LEVEL_STRIDE;
127 }
128
129 static inline int pfn_level_offset(unsigned long pfn, int level)
130 {
131         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
132 }
133
134 static inline unsigned long level_mask(int level)
135 {
136         return -1UL << level_to_offset_bits(level);
137 }
138
139 static inline unsigned long level_size(int level)
140 {
141         return 1UL << level_to_offset_bits(level);
142 }
143
144 static inline unsigned long align_to_level(unsigned long pfn, int level)
145 {
146         return (pfn + level_size(level) - 1) & level_mask(level);
147 }
148
149 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
150 {
151         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
152 }
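
/*
 * Worked example (editorial illustration, not used by the driver): with the
 * default 48-bit domain address width, width_to_agaw(48) = DIV_ROUND_UP(48 -
 * 30, 9) = 2, agaw_to_level(2) = 4 (a four-level page table) and
 * agaw_to_width(2) = 48.  At that width, pfn_level_offset(pfn, 4) =
 * (pfn >> 27) & 0x1ff selects the top-level slot and pfn_level_offset(pfn, 1)
 * = pfn & 0x1ff selects the leaf slot, nine index bits per level.
 */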
153
154 /* VT-d pages must never be larger than MM pages. Otherwise things
155    are never going to work. */
156 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
157 {
158         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
159 }
160
161 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
162 {
163         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
164 }
165 static inline unsigned long page_to_dma_pfn(struct page *pg)
166 {
167         return mm_to_dma_pfn(page_to_pfn(pg));
168 }
169 static inline unsigned long virt_to_dma_pfn(void *p)
170 {
171         return page_to_dma_pfn(virt_to_page(p));
172 }
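
/*
 * Editorial note: on x86, PAGE_SHIFT and VTD_PAGE_SHIFT are both 12, so the
 * dma_to_mm_pfn()/mm_to_dma_pfn() conversions above are identity today; the
 * shifts only matter if the CPU page size ever exceeds the 4KiB VT-d page
 * size.
 */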
173
174 /* global iommu list, set NULL for ignored DMAR units */
175 static struct intel_iommu **g_iommus;
176
177 static void __init check_tylersburg_isoch(void);
178 static int rwbf_quirk;
179
180 /*
181  * set to 1 to panic the kernel if VT-d cannot be enabled successfully
182  * (used when the kernel is launched with TXT)
183  */
184 static int force_on = 0;
185
186 /*
187  * 0: Present
188  * 1-11: Reserved
189  * 12-63: Context Ptr (12 - (haw-1))
190  * 64-127: Reserved
191  */
192 struct root_entry {
193         u64     lo;
194         u64     hi;
195 };
196 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
197
198 /*
199  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
200  * if marked present.
201  */
202 static phys_addr_t root_entry_lctp(struct root_entry *re)
203 {
204         if (!(re->lo & 1))
205                 return 0;
206
207         return re->lo & VTD_PAGE_MASK;
208 }
209
210 /*
211  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
212  * if marked present.
213  */
214 static phys_addr_t root_entry_uctp(struct root_entry *re)
215 {
216         if (!(re->hi & 1))
217                 return 0;
218
219         return re->hi & VTD_PAGE_MASK;
220 }
221 /*
222  * low 64 bits:
223  * 0: present
224  * 1: fault processing disable
225  * 2-3: translation type
226  * 12-63: address space root
227  * high 64 bits:
228  * 0-2: address width
229  * 3-6: avail (software-available bits)
230  * 8-23: domain id
231  */
232 struct context_entry {
233         u64 lo;
234         u64 hi;
235 };
236
237 static inline void context_clear_pasid_enable(struct context_entry *context)
238 {
239         context->lo &= ~(1ULL << 11);
240 }
241
242 static inline bool context_pasid_enabled(struct context_entry *context)
243 {
244         return !!(context->lo & (1ULL << 11));
245 }
246
247 static inline void context_set_copied(struct context_entry *context)
248 {
249         context->hi |= (1ull << 3);
250 }
251
252 static inline bool context_copied(struct context_entry *context)
253 {
254         return !!(context->hi & (1ULL << 3));
255 }
256
257 static inline bool __context_present(struct context_entry *context)
258 {
259         return (context->lo & 1);
260 }
261
262 static inline bool context_present(struct context_entry *context)
263 {
264         return context_pasid_enabled(context) ?
265              __context_present(context) :
266              __context_present(context) && !context_copied(context);
267 }
268
269 static inline void context_set_present(struct context_entry *context)
270 {
271         context->lo |= 1;
272 }
273
274 static inline void context_set_fault_enable(struct context_entry *context)
275 {
276         context->lo &= (((u64)-1) << 2) | 1;
277 }
278
279 static inline void context_set_translation_type(struct context_entry *context,
280                                                 unsigned long value)
281 {
282         context->lo &= (((u64)-1) << 4) | 3;
283         context->lo |= (value & 3) << 2;
284 }
285
286 static inline void context_set_address_root(struct context_entry *context,
287                                             unsigned long value)
288 {
289         context->lo &= ~VTD_PAGE_MASK;
290         context->lo |= value & VTD_PAGE_MASK;
291 }
292
293 static inline void context_set_address_width(struct context_entry *context,
294                                              unsigned long value)
295 {
296         context->hi |= value & 7;
297 }
298
299 static inline void context_set_domain_id(struct context_entry *context,
300                                          unsigned long value)
301 {
302         context->hi |= (value & ((1 << 16) - 1)) << 8;
303 }
304
305 static inline int context_domain_id(struct context_entry *c)
306 {
307         return (c->hi >> 8) & 0xffff;
308 }
309
310 static inline void context_clear_entry(struct context_entry *context)
311 {
312         context->lo = 0;
313         context->hi = 0;
314 }
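
/*
 * Illustrative sketch (editorial, not a call site in this file; the real
 * programming is done by domain_context_mapping_one() later in this file).
 * For a four-level (agaw 2) table rooted at pgd_phys with domain id did, the
 * context entry is composed roughly as:
 *
 *	context_set_domain_id(context, did);
 *	context_set_address_width(context, 2);
 *	context_set_address_root(context, pgd_phys);
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 */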
315
316 /*
317  * 0: readable
318  * 1: writable
319  * 2-6: reserved
320  * 7: super page
321  * 8-10: available
322  * 11: snoop behavior
323  * 12-63: Host physical address
324  */
325 struct dma_pte {
326         u64 val;
327 };
328
329 static inline void dma_clear_pte(struct dma_pte *pte)
330 {
331         pte->val = 0;
332 }
333
334 static inline u64 dma_pte_addr(struct dma_pte *pte)
335 {
336 #ifdef CONFIG_64BIT
337         return pte->val & VTD_PAGE_MASK;
338 #else
339         /* Must have a full atomic 64-bit read */
340         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
341 #endif
342 }
343
344 static inline bool dma_pte_present(struct dma_pte *pte)
345 {
346         return (pte->val & 3) != 0;
347 }
348
349 static inline bool dma_pte_superpage(struct dma_pte *pte)
350 {
351         return (pte->val & DMA_PTE_LARGE_PAGE);
352 }
353
354 static inline int first_pte_in_page(struct dma_pte *pte)
355 {
356         return !((unsigned long)pte & ~VTD_PAGE_MASK);
357 }
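
/*
 * Worked example (editorial illustration): a leaf PTE mapping host pfn
 * 0x12345 for read and write holds val = (0x12345 << VTD_PAGE_SHIFT) |
 * DMA_PTE_READ | DMA_PTE_WRITE = 0x12345003; dma_pte_addr() masks the low
 * bits back off and dma_pte_present() checks that at least one of the
 * read/write bits is set.
 */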
358
359 /*
360  * This domain is a statically configured identity-mapping domain.
361  *      1. This domain creates a static 1:1 mapping to all usable memory.
362  *      2. It maps to each iommu if successful.
363  *      3. Each iommu maps to this domain if successful.
364  */
365 static struct dmar_domain *si_domain;
366 static int hw_pass_through = 1;
367
368 /*
369  * Domain represents a virtual machine; more than one device
370  * across iommus may be owned by one domain, e.g. a kvm guest.
371  */
372 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
373
374 /* si_domain contains multiple devices */
375 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
376
377 #define for_each_domain_iommu(idx, domain)                      \
378         for (idx = 0; idx < g_num_of_iommus; idx++)             \
379                 if (domain->iommu_refcnt[idx])
380
381 struct dmar_domain {
382         int     nid;                    /* node id */
383
384         unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];
385                                         /* Refcount of devices per iommu */
386
387
388         u16             iommu_did[DMAR_UNITS_SUPPORTED];
389                                         /* Domain ids per IOMMU. Use u16 since
390                                          * domain ids are 16 bit wide according
391                                          * to VT-d spec, section 9.3 */
392
393         struct list_head devices;       /* all devices' list */
394         struct iova_domain iovad;       /* iova's that belong to this domain */
395
396         struct dma_pte  *pgd;           /* virtual address */
397         int             gaw;            /* max guest address width */
398
399         /* adjusted guest address width, 0 is level 2 30-bit */
400         int             agaw;
401
402         int             flags;          /* flags to find out type of domain */
403
404         int             iommu_coherency;/* indicate coherency of iommu access */
405         int             iommu_snooping; /* indicate snooping control feature*/
406         int             iommu_count;    /* reference count of iommu */
407         int             iommu_superpage;/* Level of superpages supported:
408                                            0 == 4KiB (no superpages), 1 == 2MiB,
409                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
410         u64             max_addr;       /* maximum mapped address */
411
412         struct iommu_domain domain;     /* generic domain data structure for
413                                            iommu core */
414 };
415
416 /* PCI domain-device relationship */
417 struct device_domain_info {
418         struct list_head link;  /* link to domain siblings */
419         struct list_head global; /* link to global list */
420         u8 bus;                 /* PCI bus number */
421         u8 devfn;               /* PCI devfn number */
422         u8 pasid_supported:3;
423         u8 pasid_enabled:1;
424         u8 pri_supported:1;
425         u8 pri_enabled:1;
426         u8 ats_supported:1;
427         u8 ats_enabled:1;
428         u8 ats_qdep;
429         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
430         struct intel_iommu *iommu; /* IOMMU used by this device */
431         struct dmar_domain *domain; /* pointer to domain */
432 };
433
434 struct dmar_rmrr_unit {
435         struct list_head list;          /* list of rmrr units   */
436         struct acpi_dmar_header *hdr;   /* ACPI header          */
437         u64     base_address;           /* reserved base address*/
438         u64     end_address;            /* reserved end address */
439         struct dmar_dev_scope *devices; /* target devices */
440         int     devices_cnt;            /* target device count */
441 };
442
443 struct dmar_atsr_unit {
444         struct list_head list;          /* list of ATSR units */
445         struct acpi_dmar_header *hdr;   /* ACPI header */
446         struct dmar_dev_scope *devices; /* target devices */
447         int devices_cnt;                /* target device count */
448         u8 include_all:1;               /* include all ports */
449 };
450
451 static LIST_HEAD(dmar_atsr_units);
452 static LIST_HEAD(dmar_rmrr_units);
453
454 #define for_each_rmrr_units(rmrr) \
455         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
456
457 static void flush_unmaps_timeout(unsigned long data);
458
459 static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
460
461 #define HIGH_WATER_MARK 250
462 struct deferred_flush_tables {
463         int next;
464         struct iova *iova[HIGH_WATER_MARK];
465         struct dmar_domain *domain[HIGH_WATER_MARK];
466         struct page *freelist[HIGH_WATER_MARK];
467 };
468
469 static struct deferred_flush_tables *deferred_flush;
470
471 /* number of registered intel_iommus; used to size per-iommu arrays */
472 static int g_num_of_iommus;
473
474 static DEFINE_SPINLOCK(async_umap_flush_lock);
475 static LIST_HEAD(unmaps_to_do);
476
477 static int timer_on;
478 static long list_size;
479
480 static void domain_exit(struct dmar_domain *domain);
481 static void domain_remove_dev_info(struct dmar_domain *domain);
482 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
483                                      struct device *dev);
484 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
485 static void domain_context_clear(struct intel_iommu *iommu,
486                                  struct device *dev);
487 static int domain_detach_iommu(struct dmar_domain *domain,
488                                struct intel_iommu *iommu);
489
490 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
491 int dmar_disabled = 0;
492 #else
493 int dmar_disabled = 1;
494 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
495
496 int intel_iommu_enabled = 0;
497 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
498
499 static int dmar_map_gfx = 1;
500 static int dmar_forcedac;
501 static int intel_iommu_strict;
502 static int intel_iommu_superpage = 1;
503 static int intel_iommu_ecs = 1;
504 static int intel_iommu_pasid28;
505 static int iommu_identity_mapping;
506
507 #define IDENTMAP_ALL            1
508 #define IDENTMAP_GFX            2
509 #define IDENTMAP_AZALIA         4
510
511 /* Broadwell and Skylake have broken ECS support — normal so-called "second
512  * level" translation of DMA requests-without-PASID doesn't actually happen
513  * unless you also set the NESTE bit in an extended context-entry. Which of
514  * course means that SVM doesn't work because it's trying to do nested
515  * translation of the physical addresses it finds in the process page tables,
516  * through the IOVA->phys mapping found in the "second level" page tables.
517  *
518  * The VT-d specification was retroactively changed to redefine the capability
519  * bits and pretend that Broadwell/Skylake never happened...
520  * but unfortunately the wrong bit was changed. It's ECS which is broken, but
521  * for some reason it was the PASID capability bit which was redefined (from
522  * bit 28 on BDW/SKL to bit 40 in future).
523  *
524  * So our test for ECS needs to eschew those implementations which set the old
525  * PASID capability bit 28, since those are the ones on which ECS is broken.
526  * Unless we are working around the 'pasid28' limitations, that is, by putting
527  * the device into passthrough mode for normal DMA and thus masking the bug.
528  */
529 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
530                             (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
531 /* PASID support is thus enabled if ECS is enabled and *either* of the old
532  * or new capability bits is set. */
533 #define pasid_enabled(iommu) (ecs_enabled(iommu) &&                     \
534                               (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
535
536 int intel_iommu_gfx_mapped;
537 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
538
539 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
540 static DEFINE_SPINLOCK(device_domain_lock);
541 static LIST_HEAD(device_domain_list);
542
543 static const struct iommu_ops intel_iommu_ops;
544
545 static bool translation_pre_enabled(struct intel_iommu *iommu)
546 {
547         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
548 }
549
550 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
551 {
552         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
553 }
554
555 static void init_translation_status(struct intel_iommu *iommu)
556 {
557         u32 gsts;
558
559         gsts = readl(iommu->reg + DMAR_GSTS_REG);
560         if (gsts & DMA_GSTS_TES)
561                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
562 }
563
564 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
565 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
566 {
567         return container_of(dom, struct dmar_domain, domain);
568 }
569
570 static int __init intel_iommu_setup(char *str)
571 {
572         if (!str)
573                 return -EINVAL;
574         while (*str) {
575                 if (!strncmp(str, "on", 2)) {
576                         dmar_disabled = 0;
577                         pr_info("IOMMU enabled\n");
578                 } else if (!strncmp(str, "off", 3)) {
579                         dmar_disabled = 1;
580                         pr_info("IOMMU disabled\n");
581                 } else if (!strncmp(str, "igfx_off", 8)) {
582                         dmar_map_gfx = 0;
583                         pr_info("Disable GFX device mapping\n");
584                 } else if (!strncmp(str, "forcedac", 8)) {
585                         pr_info("Forcing DAC for PCI devices\n");
586                         dmar_forcedac = 1;
587                 } else if (!strncmp(str, "strict", 6)) {
588                         pr_info("Disable batched IOTLB flush\n");
589                         intel_iommu_strict = 1;
590                 } else if (!strncmp(str, "sp_off", 6)) {
591                         pr_info("Disable supported super page\n");
592                         intel_iommu_superpage = 0;
593                 } else if (!strncmp(str, "ecs_off", 7)) {
594                         printk(KERN_INFO
595                                 "Intel-IOMMU: disable extended context table support\n");
596                         intel_iommu_ecs = 0;
597                 } else if (!strncmp(str, "pasid28", 7)) {
598                         printk(KERN_INFO
599                                 "Intel-IOMMU: enable pre-production PASID support\n");
600                         intel_iommu_pasid28 = 1;
601                         iommu_identity_mapping |= IDENTMAP_GFX;
602                 }
603
604                 str += strcspn(str, ",");
605                 while (*str == ',')
606                         str++;
607         }
608         return 0;
609 }
610 __setup("intel_iommu=", intel_iommu_setup);
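
/*
 * Example usage (editorial illustration): booting with
 *
 *	intel_iommu=on,strict,sp_off
 *
 * enables the IOMMU, disables batched IOTLB flushing and disables superpage
 * support.  Options are comma separated; tokens the loop above does not
 * recognize are silently skipped.
 */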
611
612 static struct kmem_cache *iommu_domain_cache;
613 static struct kmem_cache *iommu_devinfo_cache;
614
615 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
616 {
617         struct dmar_domain **domains;
618         int idx = did >> 8;
619
620         domains = iommu->domains[idx];
621         if (!domains)
622                 return NULL;
623
624         return domains[did & 0xff];
625 }
626
627 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
628                              struct dmar_domain *domain)
629 {
630         struct dmar_domain **domains;
631         int idx = did >> 8;
632
633         if (!iommu->domains[idx]) {
634                 size_t size = 256 * sizeof(struct dmar_domain *);
635                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
636         }
637
638         domains = iommu->domains[idx];
639         if (WARN_ON(!domains))
640                 return;
641         else
642                 domains[did & 0xff] = domain;
643 }
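
/*
 * Editorial note on the layout used above: iommu->domains is a two-level
 * table of 256-entry chunks, so domain id 0x1234 lives at
 * iommu->domains[0x12][0x34].  The chunk array itself is sized in
 * iommu_init_domains() and each second-level chunk is allocated lazily here.
 */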
644
645 static inline void *alloc_pgtable_page(int node)
646 {
647         struct page *page;
648         void *vaddr = NULL;
649
650         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
651         if (page)
652                 vaddr = page_address(page);
653         return vaddr;
654 }
655
656 static inline void free_pgtable_page(void *vaddr)
657 {
658         free_page((unsigned long)vaddr);
659 }
660
661 static inline void *alloc_domain_mem(void)
662 {
663         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
664 }
665
666 static void free_domain_mem(void *vaddr)
667 {
668         kmem_cache_free(iommu_domain_cache, vaddr);
669 }
670
671 static inline void *alloc_devinfo_mem(void)
672 {
673         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
674 }
675
676 static inline void free_devinfo_mem(void *vaddr)
677 {
678         kmem_cache_free(iommu_devinfo_cache, vaddr);
679 }
680
681 static inline int domain_type_is_vm(struct dmar_domain *domain)
682 {
683         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
684 }
685
686 static inline int domain_type_is_si(struct dmar_domain *domain)
687 {
688         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
689 }
690
691 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
692 {
693         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
694                                 DOMAIN_FLAG_STATIC_IDENTITY);
695 }
696
697 static inline int domain_pfn_supported(struct dmar_domain *domain,
698                                        unsigned long pfn)
699 {
700         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
701
702         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
703 }
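
/*
 * Worked example (editorial illustration): for the common agaw of 2,
 * agaw_to_width() gives 48 bits, so addr_width above is 48 - 12 = 36 and any
 * DMA pfn at or above 1 << 36 (i.e. addresses of 256TiB or more) is rejected
 * as outside the domain's addressable range.
 */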
704
705 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
706 {
707         unsigned long sagaw;
708         int agaw = -1;
709
710         sagaw = cap_sagaw(iommu->cap);
711         for (agaw = width_to_agaw(max_gaw);
712              agaw >= 0; agaw--) {
713                 if (test_bit(agaw, &sagaw))
714                         break;
715         }
716
717         return agaw;
718 }
719
720 /*
721  * Calculate max SAGAW for each iommu.
722  */
723 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
724 {
725         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
726 }
727
728 /*
729  * calculate agaw for each iommu.
730  * "SAGAW" may be different across iommus; use a default agaw, and
731  * fall back to a smaller supported agaw for iommus that don't support it.
732  */
733 int iommu_calculate_agaw(struct intel_iommu *iommu)
734 {
735         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
736 }
737
738 /* This function only returns a single iommu for a domain */
739 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
740 {
741         int iommu_id;
742
743         /* si_domain and vm domain should not get here. */
744         BUG_ON(domain_type_is_vm_or_si(domain));
745         for_each_domain_iommu(iommu_id, domain)
746                 break;
747
748         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
749                 return NULL;
750
751         return g_iommus[iommu_id];
752 }
753
754 static void domain_update_iommu_coherency(struct dmar_domain *domain)
755 {
756         struct dmar_drhd_unit *drhd;
757         struct intel_iommu *iommu;
758         bool found = false;
759         int i;
760
761         domain->iommu_coherency = 1;
762
763         for_each_domain_iommu(i, domain) {
764                 found = true;
765                 if (!ecap_coherent(g_iommus[i]->ecap)) {
766                         domain->iommu_coherency = 0;
767                         break;
768                 }
769         }
770         if (found)
771                 return;
772
773         /* No hardware attached; use lowest common denominator */
774         rcu_read_lock();
775         for_each_active_iommu(iommu, drhd) {
776                 if (!ecap_coherent(iommu->ecap)) {
777                         domain->iommu_coherency = 0;
778                         break;
779                 }
780         }
781         rcu_read_unlock();
782 }
783
784 static int domain_update_iommu_snooping(struct intel_iommu *skip)
785 {
786         struct dmar_drhd_unit *drhd;
787         struct intel_iommu *iommu;
788         int ret = 1;
789
790         rcu_read_lock();
791         for_each_active_iommu(iommu, drhd) {
792                 if (iommu != skip) {
793                         if (!ecap_sc_support(iommu->ecap)) {
794                                 ret = 0;
795                                 break;
796                         }
797                 }
798         }
799         rcu_read_unlock();
800
801         return ret;
802 }
803
804 static int domain_update_iommu_superpage(struct intel_iommu *skip)
805 {
806         struct dmar_drhd_unit *drhd;
807         struct intel_iommu *iommu;
808         int mask = 0xf;
809
810         if (!intel_iommu_superpage) {
811                 return 0;
812         }
813
814         /* set iommu_superpage to the smallest common denominator */
815         rcu_read_lock();
816         for_each_active_iommu(iommu, drhd) {
817                 if (iommu != skip) {
818                         mask &= cap_super_page_val(iommu->cap);
819                         if (!mask)
820                                 break;
821                 }
822         }
823         rcu_read_unlock();
824
825         return fls(mask);
826 }
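
/*
 * Worked example (editorial illustration): cap_super_page_val() yields a
 * 4-bit mask with bit 0 meaning 2MiB pages and bit 1 meaning 1GiB pages.  If
 * every active unit advertises only 2MiB support, the accumulated mask is 0x1
 * and fls(0x1) = 1, i.e. iommu_superpage ends up 1; if any unit supports no
 * superpages at all, the mask collapses to 0 and only 4KiB mappings are used.
 */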
827
828 /* Some capabilities may be different across iommus */
829 static void domain_update_iommu_cap(struct dmar_domain *domain)
830 {
831         domain_update_iommu_coherency(domain);
832         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
833         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
834 }
835
836 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
837                                                        u8 bus, u8 devfn, int alloc)
838 {
839         struct root_entry *root = &iommu->root_entry[bus];
840         struct context_entry *context;
841         u64 *entry;
842
843         entry = &root->lo;
844         if (ecs_enabled(iommu)) {
845                 if (devfn >= 0x80) {
846                         devfn -= 0x80;
847                         entry = &root->hi;
848                 }
849                 devfn *= 2;
850         }
851         if (*entry & 1)
852                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
853         else {
854                 unsigned long phy_addr;
855                 if (!alloc)
856                         return NULL;
857
858                 context = alloc_pgtable_page(iommu->node);
859                 if (!context)
860                         return NULL;
861
862                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
863                 phy_addr = virt_to_phys((void *)context);
864                 *entry = phy_addr | 1;
865                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
866         }
867         return &context[devfn];
868 }
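
/*
 * Editorial example of the extended (ECS) layout handled above: an extended
 * context entry is 256 bits, so a root entry only covers half a bus.
 * Devices 0x00-0x7f are reached through root->lo and 0x80-0xff through
 * root->hi, and the devfn is doubled because two 128-bit slots of the legacy
 * layout make up one extended entry; e.g. devfn 0x83 becomes index
 * (0x83 - 0x80) * 2 = 6 in the table pointed to by root->hi.
 */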
869
870 static int iommu_dummy(struct device *dev)
871 {
872         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
873 }
874
875 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
876 {
877         struct dmar_drhd_unit *drhd = NULL;
878         struct intel_iommu *iommu;
879         struct device *tmp;
880         struct pci_dev *ptmp, *pdev = NULL;
881         u16 segment = 0;
882         int i;
883
884         if (iommu_dummy(dev))
885                 return NULL;
886
887         if (dev_is_pci(dev)) {
888                 pdev = to_pci_dev(dev);
889                 segment = pci_domain_nr(pdev->bus);
890         } else if (has_acpi_companion(dev))
891                 dev = &ACPI_COMPANION(dev)->dev;
892
893         rcu_read_lock();
894         for_each_active_iommu(iommu, drhd) {
895                 if (pdev && segment != drhd->segment)
896                         continue;
897
898                 for_each_active_dev_scope(drhd->devices,
899                                           drhd->devices_cnt, i, tmp) {
900                         if (tmp == dev) {
901                                 *bus = drhd->devices[i].bus;
902                                 *devfn = drhd->devices[i].devfn;
903                                 goto out;
904                         }
905
906                         if (!pdev || !dev_is_pci(tmp))
907                                 continue;
908
909                         ptmp = to_pci_dev(tmp);
910                         if (ptmp->subordinate &&
911                             ptmp->subordinate->number <= pdev->bus->number &&
912                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
913                                 goto got_pdev;
914                 }
915
916                 if (pdev && drhd->include_all) {
917                 got_pdev:
918                         *bus = pdev->bus->number;
919                         *devfn = pdev->devfn;
920                         goto out;
921                 }
922         }
923         iommu = NULL;
924  out:
925         rcu_read_unlock();
926
927         return iommu;
928 }
929
930 static void domain_flush_cache(struct dmar_domain *domain,
931                                void *addr, int size)
932 {
933         if (!domain->iommu_coherency)
934                 clflush_cache_range(addr, size);
935 }
936
937 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
938 {
939         struct context_entry *context;
940         int ret = 0;
941         unsigned long flags;
942
943         spin_lock_irqsave(&iommu->lock, flags);
944         context = iommu_context_addr(iommu, bus, devfn, 0);
945         if (context)
946                 ret = context_present(context);
947         spin_unlock_irqrestore(&iommu->lock, flags);
948         return ret;
949 }
950
951 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
952 {
953         struct context_entry *context;
954         unsigned long flags;
955
956         spin_lock_irqsave(&iommu->lock, flags);
957         context = iommu_context_addr(iommu, bus, devfn, 0);
958         if (context) {
959                 context_clear_entry(context);
960                 __iommu_flush_cache(iommu, context, sizeof(*context));
961         }
962         spin_unlock_irqrestore(&iommu->lock, flags);
963 }
964
965 static void free_context_table(struct intel_iommu *iommu)
966 {
967         int i;
968         unsigned long flags;
969         struct context_entry *context;
970
971         spin_lock_irqsave(&iommu->lock, flags);
972         if (!iommu->root_entry) {
973                 goto out;
974         }
975         for (i = 0; i < ROOT_ENTRY_NR; i++) {
976                 context = iommu_context_addr(iommu, i, 0, 0);
977                 if (context)
978                         free_pgtable_page(context);
979
980                 if (!ecs_enabled(iommu))
981                         continue;
982
983                 context = iommu_context_addr(iommu, i, 0x80, 0);
984                 if (context)
985                         free_pgtable_page(context);
986
987         }
988         free_pgtable_page(iommu->root_entry);
989         iommu->root_entry = NULL;
990 out:
991         spin_unlock_irqrestore(&iommu->lock, flags);
992 }
993
994 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
995                                       unsigned long pfn, int *target_level)
996 {
997         struct dma_pte *parent, *pte = NULL;
998         int level = agaw_to_level(domain->agaw);
999         int offset;
1000
1001         BUG_ON(!domain->pgd);
1002
1003         if (!domain_pfn_supported(domain, pfn))
1004                 /* Address beyond IOMMU's addressing capabilities. */
1005                 return NULL;
1006
1007         parent = domain->pgd;
1008
1009         while (1) {
1010                 void *tmp_page;
1011
1012                 offset = pfn_level_offset(pfn, level);
1013                 pte = &parent[offset];
1014                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1015                         break;
1016                 if (level == *target_level)
1017                         break;
1018
1019                 if (!dma_pte_present(pte)) {
1020                         uint64_t pteval;
1021
1022                         tmp_page = alloc_pgtable_page(domain->nid);
1023
1024                         if (!tmp_page)
1025                                 return NULL;
1026
1027                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1028                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1029                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1030                                 /* Someone else set it while we were thinking; use theirs. */
1031                                 free_pgtable_page(tmp_page);
1032                         else
1033                                 domain_flush_cache(domain, pte, sizeof(*pte));
1034                 }
1035                 if (level == 1)
1036                         break;
1037
1038                 parent = phys_to_virt(dma_pte_addr(pte));
1039                 level--;
1040         }
1041
1042         if (!*target_level)
1043                 *target_level = level;
1044
1045         return pte;
1046 }
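
/*
 * Usage note (editorial): callers that are about to map pass the desired
 * leaf level in *target_level (1 for 4KiB, 2 for 2MiB, ...) and the walk
 * above allocates any missing intermediate tables on the way down; callers
 * that only want to look up an existing translation pass 0 and get back
 * whatever leaf or superpage entry is already there, with *target_level
 * updated to the level the walk stopped at.
 */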
1047
1048
1049 /* return address's pte at specific level */
1050 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1051                                          unsigned long pfn,
1052                                          int level, int *large_page)
1053 {
1054         struct dma_pte *parent, *pte = NULL;
1055         int total = agaw_to_level(domain->agaw);
1056         int offset;
1057
1058         parent = domain->pgd;
1059         while (level <= total) {
1060                 offset = pfn_level_offset(pfn, total);
1061                 pte = &parent[offset];
1062                 if (level == total)
1063                         return pte;
1064
1065                 if (!dma_pte_present(pte)) {
1066                         *large_page = total;
1067                         break;
1068                 }
1069
1070                 if (dma_pte_superpage(pte)) {
1071                         *large_page = total;
1072                         return pte;
1073                 }
1074
1075                 parent = phys_to_virt(dma_pte_addr(pte));
1076                 total--;
1077         }
1078         return NULL;
1079 }
1080
1081 /* clear last level pte; a tlb flush should follow */
1082 static void dma_pte_clear_range(struct dmar_domain *domain,
1083                                 unsigned long start_pfn,
1084                                 unsigned long last_pfn)
1085 {
1086         unsigned int large_page = 1;
1087         struct dma_pte *first_pte, *pte;
1088
1089         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1090         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1091         BUG_ON(start_pfn > last_pfn);
1092
1093         /* we don't need lock here; nobody else touches the iova range */
1094         do {
1095                 large_page = 1;
1096                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1097                 if (!pte) {
1098                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1099                         continue;
1100                 }
1101                 do {
1102                         dma_clear_pte(pte);
1103                         start_pfn += lvl_to_nr_pages(large_page);
1104                         pte++;
1105                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1106
1107                 domain_flush_cache(domain, first_pte,
1108                                    (void *)pte - (void *)first_pte);
1109
1110         } while (start_pfn && start_pfn <= last_pfn);
1111 }
1112
1113 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1114                                struct dma_pte *pte, unsigned long pfn,
1115                                unsigned long start_pfn, unsigned long last_pfn)
1116 {
1117         pfn = max(start_pfn, pfn);
1118         pte = &pte[pfn_level_offset(pfn, level)];
1119
1120         do {
1121                 unsigned long level_pfn;
1122                 struct dma_pte *level_pte;
1123
1124                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1125                         goto next;
1126
1127                 level_pfn = pfn & level_mask(level - 1);
1128                 level_pte = phys_to_virt(dma_pte_addr(pte));
1129
1130                 if (level > 2)
1131                         dma_pte_free_level(domain, level - 1, level_pte,
1132                                            level_pfn, start_pfn, last_pfn);
1133
1134                 /* If range covers entire pagetable, free it */
1135                 if (!(start_pfn > level_pfn ||
1136                       last_pfn < level_pfn + level_size(level) - 1)) {
1137                         dma_clear_pte(pte);
1138                         domain_flush_cache(domain, pte, sizeof(*pte));
1139                         free_pgtable_page(level_pte);
1140                 }
1141 next:
1142                 pfn += level_size(level);
1143         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1144 }
1145
1146 /* free page table pages. last level pte should already be cleared */
1147 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1148                                    unsigned long start_pfn,
1149                                    unsigned long last_pfn)
1150 {
1151         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1152         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1153         BUG_ON(start_pfn > last_pfn);
1154
1155         dma_pte_clear_range(domain, start_pfn, last_pfn);
1156
1157         /* We don't need lock here; nobody else touches the iova range */
1158         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1159                            domain->pgd, 0, start_pfn, last_pfn);
1160
1161         /* free pgd */
1162         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1163                 free_pgtable_page(domain->pgd);
1164                 domain->pgd = NULL;
1165         }
1166 }
1167
1168 /* When a page at a given level is being unlinked from its parent, we don't
1169    need to *modify* it at all. All we need to do is make a list of all the
1170    pages which can be freed just as soon as we've flushed the IOTLB and we
1171    know the hardware page-walk will no longer touch them.
1172    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1173    be freed. */
1174 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1175                                             int level, struct dma_pte *pte,
1176                                             struct page *freelist)
1177 {
1178         struct page *pg;
1179
1180         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1181         pg->freelist = freelist;
1182         freelist = pg;
1183
1184         if (level == 1)
1185                 return freelist;
1186
1187         pte = page_address(pg);
1188         do {
1189                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1190                         freelist = dma_pte_list_pagetables(domain, level - 1,
1191                                                            pte, freelist);
1192                 pte++;
1193         } while (!first_pte_in_page(pte));
1194
1195         return freelist;
1196 }
1197
1198 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1199                                         struct dma_pte *pte, unsigned long pfn,
1200                                         unsigned long start_pfn,
1201                                         unsigned long last_pfn,
1202                                         struct page *freelist)
1203 {
1204         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1205
1206         pfn = max(start_pfn, pfn);
1207         pte = &pte[pfn_level_offset(pfn, level)];
1208
1209         do {
1210                 unsigned long level_pfn;
1211
1212                 if (!dma_pte_present(pte))
1213                         goto next;
1214
1215                 level_pfn = pfn & level_mask(level);
1216
1217                 /* If range covers entire pagetable, free it */
1218                 if (start_pfn <= level_pfn &&
1219                     last_pfn >= level_pfn + level_size(level) - 1) {
1220                         /* These subordinate page tables are going away entirely. Don't
1221                            bother to clear them; we're just going to *free* them. */
1222                         if (level > 1 && !dma_pte_superpage(pte))
1223                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1224
1225                         dma_clear_pte(pte);
1226                         if (!first_pte)
1227                                 first_pte = pte;
1228                         last_pte = pte;
1229                 } else if (level > 1) {
1230                         /* Recurse down into a level that isn't *entirely* obsolete */
1231                         freelist = dma_pte_clear_level(domain, level - 1,
1232                                                        phys_to_virt(dma_pte_addr(pte)),
1233                                                        level_pfn, start_pfn, last_pfn,
1234                                                        freelist);
1235                 }
1236 next:
1237                 pfn += level_size(level);
1238         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1239
1240         if (first_pte)
1241                 domain_flush_cache(domain, first_pte,
1242                                    (void *)++last_pte - (void *)first_pte);
1243
1244         return freelist;
1245 }
1246
1247 /* We can't just free the pages because the IOMMU may still be walking
1248    the page tables, and may have cached the intermediate levels. The
1249    pages can only be freed after the IOTLB flush has been done. */
1250 static struct page *domain_unmap(struct dmar_domain *domain,
1251                                  unsigned long start_pfn,
1252                                  unsigned long last_pfn)
1253 {
1254         struct page *freelist = NULL;
1255
1256         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1257         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1258         BUG_ON(start_pfn > last_pfn);
1259
1260         /* we don't need lock here; nobody else touches the iova range */
1261         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1262                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1263
1264         /* free pgd */
1265         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1266                 struct page *pgd_page = virt_to_page(domain->pgd);
1267                 pgd_page->freelist = freelist;
1268                 freelist = pgd_page;
1269
1270                 domain->pgd = NULL;
1271         }
1272
1273         return freelist;
1274 }
1275
1276 static void dma_free_pagelist(struct page *freelist)
1277 {
1278         struct page *pg;
1279
1280         while ((pg = freelist)) {
1281                 freelist = pg->freelist;
1282                 free_pgtable_page(page_address(pg));
1283         }
1284 }
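
/*
 * Editorial sketch of how the two halves fit together (the real caller is
 * intel_unmap() further down this file): the page-table pages are unlinked
 * first, the IOTLB is flushed so the hardware can no longer walk them, and
 * only then is the freelist handed back to the page allocator.  Roughly,
 * with npages = last_pfn - start_pfn + 1:
 *
 *	freelist = domain_unmap(domain, start_pfn, last_pfn);
 *	iommu_flush_iotlb_psi(iommu, domain, start_pfn, npages, 0, 0);
 *	dma_free_pagelist(freelist);
 */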
1285
1286 /* iommu handling */
1287 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1288 {
1289         struct root_entry *root;
1290         unsigned long flags;
1291
1292         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1293         if (!root) {
1294                 pr_err("Allocating root entry for %s failed\n",
1295                         iommu->name);
1296                 return -ENOMEM;
1297         }
1298
1299         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1300
1301         spin_lock_irqsave(&iommu->lock, flags);
1302         iommu->root_entry = root;
1303         spin_unlock_irqrestore(&iommu->lock, flags);
1304
1305         return 0;
1306 }
1307
1308 static void iommu_set_root_entry(struct intel_iommu *iommu)
1309 {
1310         u64 addr;
1311         u32 sts;
1312         unsigned long flag;
1313
1314         addr = virt_to_phys(iommu->root_entry);
1315         if (ecs_enabled(iommu))
1316                 addr |= DMA_RTADDR_RTT;
1317
1318         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1319         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1320
1321         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1322
1323         /* Make sure hardware completes it */
1324         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1325                       readl, (sts & DMA_GSTS_RTPS), sts);
1326
1327         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1328 }
1329
1330 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1331 {
1332         u32 val;
1333         unsigned long flag;
1334
1335         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1336                 return;
1337
1338         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1339         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1340
1341         /* Make sure hardware completes it */
1342         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1343                       readl, (!(val & DMA_GSTS_WBFS)), val);
1344
1345         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1346 }
1347
1348 /* return value determines if we need a write buffer flush */
1349 static void __iommu_flush_context(struct intel_iommu *iommu,
1350                                   u16 did, u16 source_id, u8 function_mask,
1351                                   u64 type)
1352 {
1353         u64 val = 0;
1354         unsigned long flag;
1355
1356         switch (type) {
1357         case DMA_CCMD_GLOBAL_INVL:
1358                 val = DMA_CCMD_GLOBAL_INVL;
1359                 break;
1360         case DMA_CCMD_DOMAIN_INVL:
1361                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1362                 break;
1363         case DMA_CCMD_DEVICE_INVL:
1364                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1365                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1366                 break;
1367         default:
1368                 BUG();
1369         }
1370         val |= DMA_CCMD_ICC;
1371
1372         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1373         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1374
1375         /* Make sure hardware completes it */
1376         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1377                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1378
1379         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1380 }
1381
1382 /* return value determines if we need a write buffer flush */
1383 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1384                                 u64 addr, unsigned int size_order, u64 type)
1385 {
1386         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1387         u64 val = 0, val_iva = 0;
1388         unsigned long flag;
1389
1390         switch (type) {
1391         case DMA_TLB_GLOBAL_FLUSH:
1392                 /* global flush doesn't need to set IVA_REG */
1393                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1394                 break;
1395         case DMA_TLB_DSI_FLUSH:
1396                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1397                 break;
1398         case DMA_TLB_PSI_FLUSH:
1399                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1400                 /* IH bit is passed in as part of address */
1401                 val_iva = size_order | addr;
1402                 break;
1403         default:
1404                 BUG();
1405         }
1406         /* Note: set drain read/write */
1407 #if 0
1408         /*
1409          * This is probably only there to be extra safe.  Looks like we can
1410          * ignore it without any impact.
1411          */
1412         if (cap_read_drain(iommu->cap))
1413                 val |= DMA_TLB_READ_DRAIN;
1414 #endif
1415         if (cap_write_drain(iommu->cap))
1416                 val |= DMA_TLB_WRITE_DRAIN;
1417
1418         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1419         /* Note: Only uses first TLB reg currently */
1420         if (val_iva)
1421                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1422         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1423
1424         /* Make sure hardware completes it */
1425         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1426                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1427
1428         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1429
1430         /* check IOTLB invalidation granularity */
1431         if (DMA_TLB_IAIG(val) == 0)
1432                 pr_err("Flush IOTLB failed\n");
1433         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1434                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1435                         (unsigned long long)DMA_TLB_IIRG(type),
1436                         (unsigned long long)DMA_TLB_IAIG(val));
1437 }
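
/*
 * Worked example (editorial illustration): a page-selective flush of a
 * 64KiB region (16 pages) uses size_order = 4, so the IVA register is
 * written with the 64KiB-aligned base address OR'd with 4 in its low bits,
 * and the hardware invalidates 2^4 pages starting at that address.
 */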
1438
1439 static struct device_domain_info *
1440 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1441                          u8 bus, u8 devfn)
1442 {
1443         struct device_domain_info *info;
1444
1445         assert_spin_locked(&device_domain_lock);
1446
1447         if (!iommu->qi)
1448                 return NULL;
1449
1450         list_for_each_entry(info, &domain->devices, link)
1451                 if (info->iommu == iommu && info->bus == bus &&
1452                     info->devfn == devfn) {
1453                         if (info->ats_supported && info->dev)
1454                                 return info;
1455                         break;
1456                 }
1457
1458         return NULL;
1459 }
1460
1461 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1462 {
1463         struct pci_dev *pdev;
1464
1465         if (!info || !dev_is_pci(info->dev))
1466                 return;
1467
1468         pdev = to_pci_dev(info->dev);
1469
1470 #ifdef CONFIG_INTEL_IOMMU_SVM
1471         /* The PCIe spec, in its wisdom, declares that the behaviour of
1472            the device if you enable PASID support after ATS support is
1473            undefined. So always enable PASID support on devices which
1474            have it, even if we can't yet know if we're ever going to
1475            use it. */
1476         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1477                 info->pasid_enabled = 1;
1478
1479         if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1480                 info->pri_enabled = 1;
1481 #endif
1482         if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1483                 info->ats_enabled = 1;
1484                 info->ats_qdep = pci_ats_queue_depth(pdev);
1485         }
1486 }
1487
1488 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1489 {
1490         struct pci_dev *pdev;
1491
1492         if (!dev_is_pci(info->dev))
1493                 return;
1494
1495         pdev = to_pci_dev(info->dev);
1496
1497         if (info->ats_enabled) {
1498                 pci_disable_ats(pdev);
1499                 info->ats_enabled = 0;
1500         }
1501 #ifdef CONFIG_INTEL_IOMMU_SVM
1502         if (info->pri_enabled) {
1503                 pci_disable_pri(pdev);
1504                 info->pri_enabled = 0;
1505         }
1506         if (info->pasid_enabled) {
1507                 pci_disable_pasid(pdev);
1508                 info->pasid_enabled = 0;
1509         }
1510 #endif
1511 }
1512
1513 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1514                                   u64 addr, unsigned mask)
1515 {
1516         u16 sid, qdep;
1517         unsigned long flags;
1518         struct device_domain_info *info;
1519
1520         spin_lock_irqsave(&device_domain_lock, flags);
1521         list_for_each_entry(info, &domain->devices, link) {
1522                 if (!info->ats_enabled)
1523                         continue;
1524
1525                 sid = info->bus << 8 | info->devfn;
1526                 qdep = info->ats_qdep;
1527                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1528         }
1529         spin_unlock_irqrestore(&device_domain_lock, flags);
1530 }
1531
1532 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1533                                   struct dmar_domain *domain,
1534                                   unsigned long pfn, unsigned int pages,
1535                                   int ih, int map)
1536 {
1537         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1538         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1539         u16 did = domain->iommu_did[iommu->seq_id];
1540
1541         BUG_ON(pages == 0);
1542
1543         if (ih)
1544                 ih = 1 << 6;
1545         /*
1546          * Fall back to domain-selective flush if there is no PSI support or the size is
1547          * too big.
1548          * PSI requires page size to be 2 ^ x, and the base address is naturally
1549          * aligned to the size
1550          */
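        /*
         * For illustration: pages = 9 rounds up to 16, so mask = 4 and a PSI
         * invalidates the naturally aligned 16-page region containing addr.
         */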
1551         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1552                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1553                                                 DMA_TLB_DSI_FLUSH);
1554         else
1555                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1556                                                 DMA_TLB_PSI_FLUSH);
1557
1558         /*
1559          * In caching mode, changes of pages from non-present to present require
1560          * flush. However, device IOTLB doesn't need to be flushed in this case.
1561          */
1562         if (!cap_caching_mode(iommu->cap) || !map)
1563                 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1564                                       addr, mask);
1565 }
1566
1567 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1568 {
1569         u32 pmen;
1570         unsigned long flags;
1571
1572         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1573         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1574         pmen &= ~DMA_PMEN_EPM;
1575         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1576
1577         /* wait for the protected region status bit to clear */
1578         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1579                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1580
1581         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1582 }
1583
1584 static void iommu_enable_translation(struct intel_iommu *iommu)
1585 {
1586         u32 sts;
1587         unsigned long flags;
1588
1589         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1590         iommu->gcmd |= DMA_GCMD_TE;
1591         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1592
1593         /* Make sure hardware completes it */
1594         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1595                       readl, (sts & DMA_GSTS_TES), sts);
1596
1597         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1598 }
1599
1600 static void iommu_disable_translation(struct intel_iommu *iommu)
1601 {
1602         u32 sts;
1603         unsigned long flag;
1604
1605         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1606         iommu->gcmd &= ~DMA_GCMD_TE;
1607         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1608
1609         /* Make sure hardware completes it */
1610         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1611                       readl, (!(sts & DMA_GSTS_TES)), sts);
1612
1613         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1614 }
1615
1616
1617 static int iommu_init_domains(struct intel_iommu *iommu)
1618 {
1619         u32 ndomains, nlongs;
1620         size_t size;
1621
1622         ndomains = cap_ndoms(iommu->cap);
1623         pr_debug("%s: Number of Domains supported <%d>\n",
1624                  iommu->name, ndomains);
1625         nlongs = BITS_TO_LONGS(ndomains);
1626
1627         spin_lock_init(&iommu->lock);
1628
1629         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1630         if (!iommu->domain_ids) {
1631                 pr_err("%s: Allocating domain id array failed\n",
1632                        iommu->name);
1633                 return -ENOMEM;
1634         }
1635
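        /*
         * iommu->domains is a two-level table: (ndomains >> 8) + 1 first-level
         * slots, each pointing to an array of 256 domain pointers.  Only the
         * first second-level array is allocated here; the others are allocated
         * on demand when a domain-id in their range is first used.
         */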
1636         size = ((ndomains >> 8) + 1) * sizeof(struct dmar_domain **);
1637         iommu->domains = kzalloc(size, GFP_KERNEL);
1638
1639         if (iommu->domains) {
1640                 size = 256 * sizeof(struct dmar_domain *);
1641                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1642         }
1643
1644         if (!iommu->domains || !iommu->domains[0]) {
1645                 pr_err("%s: Allocating domain array failed\n",
1646                        iommu->name);
1647                 kfree(iommu->domain_ids);
1648                 kfree(iommu->domains);
1649                 iommu->domain_ids = NULL;
1650                 iommu->domains    = NULL;
1651                 return -ENOMEM;
1652         }
1653
1654
1655
1656         /*
1657          * If Caching mode is set, then invalid translations are tagged
1658          * with domain-id 0, hence we need to pre-allocate it. We also
1659          * use domain-id 0 as a marker for non-allocated domain-id, so
1660          * make sure it is not used for a real domain.
1661          */
1662         set_bit(0, iommu->domain_ids);
1663
1664         return 0;
1665 }
1666
1667 static void disable_dmar_iommu(struct intel_iommu *iommu)
1668 {
1669         struct device_domain_info *info, *tmp;
1670         unsigned long flags;
1671
1672         if (!iommu->domains || !iommu->domain_ids)
1673                 return;
1674
1675 again:
1676         spin_lock_irqsave(&device_domain_lock, flags);
1677         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1678                 struct dmar_domain *domain;
1679
1680                 if (info->iommu != iommu)
1681                         continue;
1682
1683                 if (!info->dev || !info->domain)
1684                         continue;
1685
1686                 domain = info->domain;
1687
1688                 __dmar_remove_one_dev_info(info);
1689
1690                 if (!domain_type_is_vm_or_si(domain)) {
1691                         /*
1692                          * The domain_exit() function can't be called under
1693                          * device_domain_lock, as it takes this lock itself.
1694                          * So release the lock here and re-run the loop
1695                          * afterwards.
1696                          */
1697                         spin_unlock_irqrestore(&device_domain_lock, flags);
1698                         domain_exit(domain);
1699                         goto again;
1700                 }
1701         }
1702         spin_unlock_irqrestore(&device_domain_lock, flags);
1703
1704         if (iommu->gcmd & DMA_GCMD_TE)
1705                 iommu_disable_translation(iommu);
1706 }
1707
1708 static void free_dmar_iommu(struct intel_iommu *iommu)
1709 {
1710         if ((iommu->domains) && (iommu->domain_ids)) {
1711                 int elems = (cap_ndoms(iommu->cap) >> 8) + 1;
1712                 int i;
1713
1714                 for (i = 0; i < elems; i++)
1715                         kfree(iommu->domains[i]);
1716                 kfree(iommu->domains);
1717                 kfree(iommu->domain_ids);
1718                 iommu->domains = NULL;
1719                 iommu->domain_ids = NULL;
1720         }
1721
1722         g_iommus[iommu->seq_id] = NULL;
1723
1724         /* free context mapping */
1725         free_context_table(iommu);
1726
1727 #ifdef CONFIG_INTEL_IOMMU_SVM
1728         if (pasid_enabled(iommu)) {
1729                 if (ecap_prs(iommu->ecap))
1730                         intel_svm_finish_prq(iommu);
1731                 intel_svm_free_pasid_tables(iommu);
1732         }
1733 #endif
1734 }
1735
1736 static struct dmar_domain *alloc_domain(int flags)
1737 {
1738         struct dmar_domain *domain;
1739
1740         domain = alloc_domain_mem();
1741         if (!domain)
1742                 return NULL;
1743
1744         memset(domain, 0, sizeof(*domain));
1745         domain->nid = -1;
1746         domain->flags = flags;
1747         INIT_LIST_HEAD(&domain->devices);
1748
1749         return domain;
1750 }
1751
1752 /* Must be called with device_domain_lock and iommu->lock held */
1753 static int domain_attach_iommu(struct dmar_domain *domain,
1754                                struct intel_iommu *iommu)
1755 {
1756         unsigned long ndomains;
1757         int num;
1758
1759         assert_spin_locked(&device_domain_lock);
1760         assert_spin_locked(&iommu->lock);
1761
1762         domain->iommu_refcnt[iommu->seq_id] += 1;
1763         domain->iommu_count += 1;
1764         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1765                 ndomains = cap_ndoms(iommu->cap);
1766                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1767
1768                 if (num >= ndomains) {
1769                         pr_err("%s: No free domain ids\n", iommu->name);
1770                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1771                         domain->iommu_count -= 1;
1772                         return -ENOSPC;
1773                 }
1774
1775                 set_bit(num, iommu->domain_ids);
1776                 set_iommu_domain(iommu, num, domain);
1777
1778                 domain->iommu_did[iommu->seq_id] = num;
1779                 domain->nid                      = iommu->node;
1780
1781                 domain_update_iommu_cap(domain);
1782         }
1783
1784         return 0;
1785 }
1786
1787 static int domain_detach_iommu(struct dmar_domain *domain,
1788                                struct intel_iommu *iommu)
1789 {
1790         int num, count = INT_MAX;
1791
1792         assert_spin_locked(&device_domain_lock);
1793         assert_spin_locked(&iommu->lock);
1794
1795         domain->iommu_refcnt[iommu->seq_id] -= 1;
1796         count = --domain->iommu_count;
1797         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1798                 num = domain->iommu_did[iommu->seq_id];
1799                 clear_bit(num, iommu->domain_ids);
1800                 set_iommu_domain(iommu, num, NULL);
1801
1802                 domain_update_iommu_cap(domain);
1803                 domain->iommu_did[iommu->seq_id] = 0;
1804         }
1805
1806         return count;
1807 }
1808
1809 static struct iova_domain reserved_iova_list;
1810 static struct lock_class_key reserved_rbtree_key;
1811
1812 static int dmar_init_reserved_ranges(void)
1813 {
1814         struct pci_dev *pdev = NULL;
1815         struct iova *iova;
1816         int i;
1817
1818         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1819                         DMA_32BIT_PFN);
1820
1821         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1822                 &reserved_rbtree_key);
1823
1824         /* IOAPIC ranges shouldn't be accessed by DMA */
1825         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1826                 IOVA_PFN(IOAPIC_RANGE_END));
1827         if (!iova) {
1828                 pr_err("Reserve IOAPIC range failed\n");
1829                 return -ENODEV;
1830         }
1831
1832         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1833         for_each_pci_dev(pdev) {
1834                 struct resource *r;
1835
1836                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1837                         r = &pdev->resource[i];
1838                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1839                                 continue;
1840                         iova = reserve_iova(&reserved_iova_list,
1841                                             IOVA_PFN(r->start),
1842                                             IOVA_PFN(r->end));
1843                         if (!iova) {
1844                                 pr_err("Reserve iova failed\n");
1845                                 return -ENODEV;
1846                         }
1847                 }
1848         }
1849         return 0;
1850 }
1851
1852 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1853 {
1854         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1855 }
1856
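/*
 * Round a guest address width up to the next width the page tables can
 * express: 12 bits of page offset plus a multiple of 9 bits per level,
 * capped at 64.  For illustration, gaw 39 stays 39, gaw 40 becomes 48.
 */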
1857 static inline int guestwidth_to_adjustwidth(int gaw)
1858 {
1859         int agaw;
1860         int r = (gaw - 12) % 9;
1861
1862         if (r == 0)
1863                 agaw = gaw;
1864         else
1865                 agaw = gaw + 9 - r;
1866         if (agaw > 64)
1867                 agaw = 64;
1868         return agaw;
1869 }
1870
1871 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1872                        int guest_width)
1873 {
1874         int adjust_width, agaw;
1875         unsigned long sagaw;
1876
1877         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1878                         DMA_32BIT_PFN);
1879         domain_reserve_special_ranges(domain);
1880
1881         /* calculate AGAW */
1882         if (guest_width > cap_mgaw(iommu->cap))
1883                 guest_width = cap_mgaw(iommu->cap);
1884         domain->gaw = guest_width;
1885         adjust_width = guestwidth_to_adjustwidth(guest_width);
1886         agaw = width_to_agaw(adjust_width);
1887         sagaw = cap_sagaw(iommu->cap);
1888         if (!test_bit(agaw, &sagaw)) {
1889                 /* hardware doesn't support it, choose a bigger one */
1890                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1891                 agaw = find_next_bit(&sagaw, 5, agaw);
1892                 if (agaw >= 5)
1893                         return -ENODEV;
1894         }
1895         domain->agaw = agaw;
1896
1897         if (ecap_coherent(iommu->ecap))
1898                 domain->iommu_coherency = 1;
1899         else
1900                 domain->iommu_coherency = 0;
1901
1902         if (ecap_sc_support(iommu->ecap))
1903                 domain->iommu_snooping = 1;
1904         else
1905                 domain->iommu_snooping = 0;
1906
1907         if (intel_iommu_superpage)
1908                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1909         else
1910                 domain->iommu_superpage = 0;
1911
1912         domain->nid = iommu->node;
1913
1914         /* always allocate the top pgd */
1915         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1916         if (!domain->pgd)
1917                 return -ENOMEM;
1918         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1919         return 0;
1920 }
1921
1922 static void domain_exit(struct dmar_domain *domain)
1923 {
1924         struct page *freelist = NULL;
1925
1926         /* Domain 0 is reserved, so don't process it */
1927         if (!domain)
1928                 return;
1929
1930         /* Flush any lazy unmaps that may reference this domain */
1931         if (!intel_iommu_strict)
1932                 flush_unmaps_timeout(0);
1933
1934         /* Remove associated devices and clear attached or cached domains */
1935         rcu_read_lock();
1936         domain_remove_dev_info(domain);
1937         rcu_read_unlock();
1938
1939         /* destroy iovas */
1940         put_iova_domain(&domain->iovad);
1941
1942         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1943
1944         dma_free_pagelist(freelist);
1945
1946         free_domain_mem(domain);
1947 }
1948
1949 static int domain_context_mapping_one(struct dmar_domain *domain,
1950                                       struct intel_iommu *iommu,
1951                                       u8 bus, u8 devfn)
1952 {
1953         u16 did = domain->iommu_did[iommu->seq_id];
1954         int translation = CONTEXT_TT_MULTI_LEVEL;
1955         struct device_domain_info *info = NULL;
1956         struct context_entry *context;
1957         unsigned long flags;
1958         struct dma_pte *pgd;
1959         int ret, agaw;
1960
1961         WARN_ON(did == 0);
1962
1963         if (hw_pass_through && domain_type_is_si(domain))
1964                 translation = CONTEXT_TT_PASS_THROUGH;
1965
1966         pr_debug("Set context mapping for %02x:%02x.%d\n",
1967                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1968
1969         BUG_ON(!domain->pgd);
1970
1971         spin_lock_irqsave(&device_domain_lock, flags);
1972         spin_lock(&iommu->lock);
1973
1974         ret = -ENOMEM;
1975         context = iommu_context_addr(iommu, bus, devfn, 1);
1976         if (!context)
1977                 goto out_unlock;
1978
1979         ret = 0;
1980         if (context_present(context))
1981                 goto out_unlock;
1982
1983         pgd = domain->pgd;
1984
1985         context_clear_entry(context);
1986         context_set_domain_id(context, did);
1987
1988         /*
1989          * Skip top levels of page tables for iommu which has less agaw
1990          * than default.  Unnecessary for PT mode.
1991          */
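        /*
         * For illustration: a domain built with agaw 2 (48-bit, 4-level table)
         * attached to an iommu whose agaw is 1 (39-bit) descends one level
         * here, so the address root below points at a 3-level table.
         */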
1992         if (translation != CONTEXT_TT_PASS_THROUGH) {
1993                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1994                         ret = -ENOMEM;
1995                         pgd = phys_to_virt(dma_pte_addr(pgd));
1996                         if (!dma_pte_present(pgd))
1997                                 goto out_unlock;
1998                 }
1999
2000                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2001                 if (info && info->ats_supported)
2002                         translation = CONTEXT_TT_DEV_IOTLB;
2003                 else
2004                         translation = CONTEXT_TT_MULTI_LEVEL;
2005
2006                 context_set_address_root(context, virt_to_phys(pgd));
2007                 context_set_address_width(context, iommu->agaw);
2008         } else {
2009                 /*
2010                  * In pass through mode, AW must be programmed to
2011                  * indicate the largest AGAW value supported by
2012                  * hardware. And ASR is ignored by hardware.
2013                  */
2014                 context_set_address_width(context, iommu->msagaw);
2015         }
2016
2017         context_set_translation_type(context, translation);
2018         context_set_fault_enable(context);
2019         context_set_present(context);
2020         domain_flush_cache(domain, context, sizeof(*context));
2021
2022         /*
2023          * It's a non-present to present mapping. If hardware doesn't cache
2024          * non-present entry we only need to flush the write-buffer. If the
2025          * non-present entries we only need to flush the write-buffer. If it
2026          * domain #0, which we have to flush:
2027          */
2028         if (cap_caching_mode(iommu->cap)) {
2029                 iommu->flush.flush_context(iommu, 0,
2030                                            (((u16)bus) << 8) | devfn,
2031                                            DMA_CCMD_MASK_NOBIT,
2032                                            DMA_CCMD_DEVICE_INVL);
2033                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2034         } else {
2035                 iommu_flush_write_buffer(iommu);
2036         }
2037         iommu_enable_dev_iotlb(info);
2038
2039         ret = 0;
2040
2041 out_unlock:
2042         spin_unlock(&iommu->lock);
2043         spin_unlock_irqrestore(&device_domain_lock, flags);
2044
2045         return ret;
2046 }
2047
2048 struct domain_context_mapping_data {
2049         struct dmar_domain *domain;
2050         struct intel_iommu *iommu;
2051 };
2052
2053 static int domain_context_mapping_cb(struct pci_dev *pdev,
2054                                      u16 alias, void *opaque)
2055 {
2056         struct domain_context_mapping_data *data = opaque;
2057
2058         return domain_context_mapping_one(data->domain, data->iommu,
2059                                           PCI_BUS_NUM(alias), alias & 0xff);
2060 }
2061
2062 static int
2063 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2064 {
2065         struct intel_iommu *iommu;
2066         u8 bus, devfn;
2067         struct domain_context_mapping_data data;
2068
2069         iommu = device_to_iommu(dev, &bus, &devfn);
2070         if (!iommu)
2071                 return -ENODEV;
2072
2073         if (!dev_is_pci(dev))
2074                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2075
2076         data.domain = domain;
2077         data.iommu = iommu;
2078
2079         return pci_for_each_dma_alias(to_pci_dev(dev),
2080                                       &domain_context_mapping_cb, &data);
2081 }
2082
2083 static int domain_context_mapped_cb(struct pci_dev *pdev,
2084                                     u16 alias, void *opaque)
2085 {
2086         struct intel_iommu *iommu = opaque;
2087
2088         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2089 }
2090
2091 static int domain_context_mapped(struct device *dev)
2092 {
2093         struct intel_iommu *iommu;
2094         u8 bus, devfn;
2095
2096         iommu = device_to_iommu(dev, &bus, &devfn);
2097         if (!iommu)
2098                 return -ENODEV;
2099
2100         if (!dev_is_pci(dev))
2101                 return device_context_mapped(iommu, bus, devfn);
2102
2103         return !pci_for_each_dma_alias(to_pci_dev(dev),
2104                                        domain_context_mapped_cb, iommu);
2105 }
2106
2107 /* Returns a number of VTD pages, but aligned to MM page size */
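/*
 * For illustration, with 4KiB MM pages: host_addr 0x1234 and size 0x100 stay
 * within one page, so this returns 1; the same size starting at offset 0xff0
 * crosses a page boundary and returns 2.
 */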
2108 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2109                                             size_t size)
2110 {
2111         host_addr &= ~PAGE_MASK;
2112         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2113 }
2114
2115 /* Return largest possible superpage level for a given mapping */
2116 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2117                                           unsigned long iov_pfn,
2118                                           unsigned long phy_pfn,
2119                                           unsigned long pages)
2120 {
2121         int support, level = 1;
2122         unsigned long pfnmerge;
2123
2124         support = domain->iommu_superpage;
2125
2126         /* To use a large page, the virtual *and* physical addresses
2127            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2128            of them will mean we have to use smaller pages. So just
2129            merge them and check both at once. */
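        /*
         * For illustration: iov_pfn 0x200 and phy_pfn 0x400 merge to 0x600,
         * whose low 9 bits are clear, so a 2MiB page (level 2) is possible
         * provided at least 512 pages are being mapped and the hardware
         * supports superpages.
         */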
2130         pfnmerge = iov_pfn | phy_pfn;
2131
2132         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2133                 pages >>= VTD_STRIDE_SHIFT;
2134                 if (!pages)
2135                         break;
2136                 pfnmerge >>= VTD_STRIDE_SHIFT;
2137                 level++;
2138                 support--;
2139         }
2140         return level;
2141 }
2142
2143 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2144                             struct scatterlist *sg, unsigned long phys_pfn,
2145                             unsigned long nr_pages, int prot)
2146 {
2147         struct dma_pte *first_pte = NULL, *pte = NULL;
2148         phys_addr_t uninitialized_var(pteval);
2149         unsigned long sg_res = 0;
2150         unsigned int largepage_lvl = 0;
2151         unsigned long lvl_pages = 0;
2152
2153         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2154
2155         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2156                 return -EINVAL;
2157
2158         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2159
2160         if (!sg) {
2161                 sg_res = nr_pages;
2162                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2163         }
2164
2165         while (nr_pages > 0) {
2166                 uint64_t tmp;
2167
2168                 if (!sg_res) {
2169                         sg_res = aligned_nrpages(sg->offset, sg->length);
2170                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2171                         sg->dma_length = sg->length;
2172                         pteval = page_to_phys(sg_page(sg)) | prot;
2173                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2174                 }
2175
2176                 if (!pte) {
2177                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2178
2179                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2180                         if (!pte)
2181                                 return -ENOMEM;
2182                         /* It is a large page */
2183                         if (largepage_lvl > 1) {
2184                                 unsigned long nr_superpages, end_pfn;
2185
2186                                 pteval |= DMA_PTE_LARGE_PAGE;
2187                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2188
2189                                 nr_superpages = sg_res / lvl_pages;
2190                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2191
2192                                 /*
2193                                  * Ensure that old small page tables are
2194                                  * removed to make room for superpage(s).
2195                                  */
2196                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn);
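                                /*
                                 * For illustration: sg_res = 1024 pages at a
                                 * 2MiB-aligned iov_pfn with largepage_lvl 2
                                 * gives lvl_pages = 512 and nr_superpages = 2,
                                 * so the 4KiB tables covering those 1024 PFNs
                                 * are freed before two 2MiB PTEs are written.
                                 */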
2197                         } else {
2198                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2199                         }
2200
2201                 }
2202                 /* We don't need a lock here; nobody else
2203                  * touches the iova range.
2204                  */
2205                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2206                 if (tmp) {
2207                         static int dumps = 5;
2208                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2209                                 iov_pfn, tmp, (unsigned long long)pteval);
2210                         if (dumps) {
2211                                 dumps--;
2212                                 debug_dma_dump_mappings(NULL);
2213                         }
2214                         WARN_ON(1);
2215                 }
2216
2217                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2218
2219                 BUG_ON(nr_pages < lvl_pages);
2220                 BUG_ON(sg_res < lvl_pages);
2221
2222                 nr_pages -= lvl_pages;
2223                 iov_pfn += lvl_pages;
2224                 phys_pfn += lvl_pages;
2225                 pteval += lvl_pages * VTD_PAGE_SIZE;
2226                 sg_res -= lvl_pages;
2227
2228                 /* If the next PTE would be the first in a new page, then we
2229                    need to flush the cache on the entries we've just written.
2230                    And then we'll need to recalculate 'pte', so clear it and
2231                    let it get set again in the if (!pte) block above.
2232
2233                    If we're done (!nr_pages) we need to flush the cache too.
2234
2235                    Also if we've been setting superpages, we may need to
2236                    recalculate 'pte' and switch back to smaller pages for the
2237                    end of the mapping, if the trailing size is not enough to
2238                    use another superpage (i.e. sg_res < lvl_pages). */
2239                 pte++;
2240                 if (!nr_pages || first_pte_in_page(pte) ||
2241                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2242                         domain_flush_cache(domain, first_pte,
2243                                            (void *)pte - (void *)first_pte);
2244                         pte = NULL;
2245                 }
2246
2247                 if (!sg_res && nr_pages)
2248                         sg = sg_next(sg);
2249         }
2250         return 0;
2251 }
2252
2253 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2254                                     struct scatterlist *sg, unsigned long nr_pages,
2255                                     int prot)
2256 {
2257         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2258 }
2259
2260 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2261                                      unsigned long phys_pfn, unsigned long nr_pages,
2262                                      int prot)
2263 {
2264         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2265 }
2266
2267 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2268 {
2269         if (!iommu)
2270                 return;
2271
2272         clear_context_table(iommu, bus, devfn);
2273         iommu->flush.flush_context(iommu, 0, 0, 0,
2274                                            DMA_CCMD_GLOBAL_INVL);
2275         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2276 }
2277
2278 static inline void unlink_domain_info(struct device_domain_info *info)
2279 {
2280         assert_spin_locked(&device_domain_lock);
2281         list_del(&info->link);
2282         list_del(&info->global);
2283         if (info->dev)
2284                 info->dev->archdata.iommu = NULL;
2285 }
2286
2287 static void domain_remove_dev_info(struct dmar_domain *domain)
2288 {
2289         struct device_domain_info *info, *tmp;
2290         unsigned long flags;
2291
2292         spin_lock_irqsave(&device_domain_lock, flags);
2293         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2294                 __dmar_remove_one_dev_info(info);
2295         spin_unlock_irqrestore(&device_domain_lock, flags);
2296 }
2297
2298 /*
2299  * find_domain
2300  * Note: we use struct device->archdata.iommu to store the info
2301  */
2302 static struct dmar_domain *find_domain(struct device *dev)
2303 {
2304         struct device_domain_info *info;
2305
2306         /* No lock here, assumes no domain exit in normal case */
2307         info = dev->archdata.iommu;
2308         if (info)
2309                 return info->domain;
2310         return NULL;
2311 }
2312
2313 static inline struct device_domain_info *
2314 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2315 {
2316         struct device_domain_info *info;
2317
2318         list_for_each_entry(info, &device_domain_list, global)
2319                 if (info->iommu->segment == segment && info->bus == bus &&
2320                     info->devfn == devfn)
2321                         return info;
2322
2323         return NULL;
2324 }
2325
2326 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2327                                                     int bus, int devfn,
2328                                                     struct device *dev,
2329                                                     struct dmar_domain *domain)
2330 {
2331         struct dmar_domain *found = NULL;
2332         struct device_domain_info *info;
2333         unsigned long flags;
2334         int ret;
2335
2336         info = alloc_devinfo_mem();
2337         if (!info)
2338                 return NULL;
2339
2340         info->bus = bus;
2341         info->devfn = devfn;
2342         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2343         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2344         info->ats_qdep = 0;
2345         info->dev = dev;
2346         info->domain = domain;
2347         info->iommu = iommu;
2348
2349         if (dev && dev_is_pci(dev)) {
2350                 struct pci_dev *pdev = to_pci_dev(info->dev);
2351
2352                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2353                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2354                     dmar_find_matched_atsr_unit(pdev))
2355                         info->ats_supported = 1;
2356
2357                 if (ecs_enabled(iommu)) {
2358                         if (pasid_enabled(iommu)) {
2359                                 int features = pci_pasid_features(pdev);
2360                                 if (features >= 0)
2361                                         info->pasid_supported = features | 1;
2362                         }
2363
2364                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2365                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2366                                 info->pri_supported = 1;
2367                 }
2368         }
2369
2370         spin_lock_irqsave(&device_domain_lock, flags);
2371         if (dev)
2372                 found = find_domain(dev);
2373
2374         if (!found) {
2375                 struct device_domain_info *info2;
2376                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2377                 if (info2) {
2378                         found      = info2->domain;
2379                         info2->dev = dev;
2380                 }
2381         }
2382
2383         if (found) {
2384                 spin_unlock_irqrestore(&device_domain_lock, flags);
2385                 free_devinfo_mem(info);
2386                 /* Caller must free the original domain */
2387                 return found;
2388         }
2389
2390         spin_lock(&iommu->lock);
2391         ret = domain_attach_iommu(domain, iommu);
2392         spin_unlock(&iommu->lock);
2393
2394         if (ret) {
2395                 spin_unlock_irqrestore(&device_domain_lock, flags);
2396                 free_devinfo_mem(info);
2397                 return NULL;
2398         }
2399
2400         list_add(&info->link, &domain->devices);
2401         list_add(&info->global, &device_domain_list);
2402         if (dev)
2403                 dev->archdata.iommu = info;
2404         spin_unlock_irqrestore(&device_domain_lock, flags);
2405
2406         if (dev && domain_context_mapping(domain, dev)) {
2407                 pr_err("Domain context map for %s failed\n", dev_name(dev));
2408                 dmar_remove_one_dev_info(domain, dev);
2409                 return NULL;
2410         }
2411
2412         return domain;
2413 }
2414
2415 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2416 {
2417         *(u16 *)opaque = alias;
2418         return 0;
2419 }
2420
2421 /* domain is initialized */
2422 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2423 {
2424         struct device_domain_info *info = NULL;
2425         struct dmar_domain *domain, *tmp;
2426         struct intel_iommu *iommu;
2427         u16 req_id, dma_alias;
2428         unsigned long flags;
2429         u8 bus, devfn;
2430
2431         domain = find_domain(dev);
2432         if (domain)
2433                 return domain;
2434
2435         iommu = device_to_iommu(dev, &bus, &devfn);
2436         if (!iommu)
2437                 return NULL;
2438
2439         req_id = ((u16)bus << 8) | devfn;
2440
2441         if (dev_is_pci(dev)) {
2442                 struct pci_dev *pdev = to_pci_dev(dev);
2443
2444                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2445
2446                 spin_lock_irqsave(&device_domain_lock, flags);
2447                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2448                                                       PCI_BUS_NUM(dma_alias),
2449                                                       dma_alias & 0xff);
2450                 if (info) {
2451                         iommu = info->iommu;
2452                         domain = info->domain;
2453                 }
2454                 spin_unlock_irqrestore(&device_domain_lock, flags);
2455
2456                 /* DMA alias already has a domain, use it */
2457                 if (info)
2458                         goto found_domain;
2459         }
2460
2461         /* Allocate and initialize new domain for the device */
2462         domain = alloc_domain(0);
2463         if (!domain)
2464                 return NULL;
2465         if (domain_init(domain, iommu, gaw)) {
2466                 domain_exit(domain);
2467                 return NULL;
2468         }
2469
2470         /* register PCI DMA alias device */
2471         if (req_id != dma_alias && dev_is_pci(dev)) {
2472                 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2473                                                dma_alias & 0xff, NULL, domain);
2474
2475                 if (!tmp || tmp != domain) {
2476                         domain_exit(domain);
2477                         domain = tmp;
2478                 }
2479
2480                 if (!domain)
2481                         return NULL;
2482         }
2483
2484 found_domain:
2485         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2486
2487         if (!tmp || tmp != domain) {
2488                 domain_exit(domain);
2489                 domain = tmp;
2490         }
2491
2492         return domain;
2493 }
2494
2495 static int iommu_domain_identity_map(struct dmar_domain *domain,
2496                                      unsigned long long start,
2497                                      unsigned long long end)
2498 {
2499         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2500         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2501
2502         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2503                           dma_to_mm_pfn(last_vpfn))) {
2504                 pr_err("Reserving iova failed\n");
2505                 return -ENOMEM;
2506         }
2507
2508         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2509         /*
2510          * The RMRR range might overlap with the physical memory range,
2511          * so clear it first.
2512          */
2513         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2514
2515         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2516                                   last_vpfn - first_vpfn + 1,
2517                                   DMA_PTE_READ|DMA_PTE_WRITE);
2518 }
2519
2520 static int domain_prepare_identity_map(struct device *dev,
2521                                        struct dmar_domain *domain,
2522                                        unsigned long long start,
2523                                        unsigned long long end)
2524 {
2525         /* For _hardware_ passthrough, don't bother. But for software
2526            passthrough, we do it anyway -- it may indicate a memory
2527            range which is reserved in E820, and so didn't get set
2528            up to start with in si_domain */
2529         if (domain == si_domain && hw_pass_through) {
2530                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2531                         dev_name(dev), start, end);
2532                 return 0;
2533         }
2534
2535         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2536                 dev_name(dev), start, end);
2537
2538         if (end < start) {
2539                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2540                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2541                         dmi_get_system_info(DMI_BIOS_VENDOR),
2542                         dmi_get_system_info(DMI_BIOS_VERSION),
2543                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2544                 return -EIO;
2545         }
2546
2547         if (end >> agaw_to_width(domain->agaw)) {
2548                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2549                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2550                      agaw_to_width(domain->agaw),
2551                      dmi_get_system_info(DMI_BIOS_VENDOR),
2552                      dmi_get_system_info(DMI_BIOS_VERSION),
2553                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2554                 return -EIO;
2555         }
2556
2557         return iommu_domain_identity_map(domain, start, end);
2558 }
2559
2560 static int iommu_prepare_identity_map(struct device *dev,
2561                                       unsigned long long start,
2562                                       unsigned long long end)
2563 {
2564         struct dmar_domain *domain;
2565         int ret;
2566
2567         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2568         if (!domain)
2569                 return -ENOMEM;
2570
2571         ret = domain_prepare_identity_map(dev, domain, start, end);
2572         if (ret)
2573                 domain_exit(domain);
2574
2575         return ret;
2576 }
2577
2578 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2579                                          struct device *dev)
2580 {
2581         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2582                 return 0;
2583         return iommu_prepare_identity_map(dev, rmrr->base_address,
2584                                           rmrr->end_address);
2585 }
2586
2587 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2588 static inline void iommu_prepare_isa(void)
2589 {
2590         struct pci_dev *pdev;
2591         int ret;
2592
2593         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2594         if (!pdev)
2595                 return;
2596
2597         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2598         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2599
2600         if (ret)
2601                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2602
2603         pci_dev_put(pdev);
2604 }
2605 #else
2606 static inline void iommu_prepare_isa(void)
2607 {
2608         return;
2609 }
2610 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2611
2612 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2613
2614 static int __init si_domain_init(int hw)
2615 {
2616         int nid, ret = 0;
2617
2618         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2619         if (!si_domain)
2620                 return -EFAULT;
2621
2622         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2623                 domain_exit(si_domain);
2624                 return -EFAULT;
2625         }
2626
2627         pr_debug("Identity mapping domain allocated\n");
2628
2629         if (hw)
2630                 return 0;
2631
2632         for_each_online_node(nid) {
2633                 unsigned long start_pfn, end_pfn;
2634                 int i;
2635
2636                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2637                         ret = iommu_domain_identity_map(si_domain,
2638                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2639                         if (ret)
2640                                 return ret;
2641                 }
2642         }
2643
2644         return 0;
2645 }
2646
2647 static int identity_mapping(struct device *dev)
2648 {
2649         struct device_domain_info *info;
2650
2651         if (likely(!iommu_identity_mapping))
2652                 return 0;
2653
2654         info = dev->archdata.iommu;
2655         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2656                 return (info->domain == si_domain);
2657
2658         return 0;
2659 }
2660
2661 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2662 {
2663         struct dmar_domain *ndomain;
2664         struct intel_iommu *iommu;
2665         u8 bus, devfn;
2666
2667         iommu = device_to_iommu(dev, &bus, &devfn);
2668         if (!iommu)
2669                 return -ENODEV;
2670
2671         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2672         if (ndomain != domain)
2673                 return -EBUSY;
2674
2675         return 0;
2676 }
2677
2678 static bool device_has_rmrr(struct device *dev)
2679 {
2680         struct dmar_rmrr_unit *rmrr;
2681         struct device *tmp;
2682         int i;
2683
2684         rcu_read_lock();
2685         for_each_rmrr_units(rmrr) {
2686                 /*
2687                  * Return TRUE if this RMRR contains the device that
2688                  * is passed in.
2689                  */
2690                 for_each_active_dev_scope(rmrr->devices,
2691                                           rmrr->devices_cnt, i, tmp)
2692                         if (tmp == dev) {
2693                                 rcu_read_unlock();
2694                                 return true;
2695                         }
2696         }
2697         rcu_read_unlock();
2698         return false;
2699 }
2700
2701 /*
2702  * There are a couple of cases where we need to restrict the functionality of
2703  * devices associated with RMRRs.  The first is when evaluating a device for
2704  * identity mapping because problems exist when devices are moved in and out
2705  * of domains and their respective RMRR information is lost.  This means that
2706  * a device with associated RMRRs will never be in a "passthrough" domain.
2707  * The second is use of the device through the IOMMU API.  This interface
2708  * expects to have full control of the IOVA space for the device.  We cannot
2709  * satisfy both the requirement that RMRR access is maintained and have an
2710  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2711  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2712  * We therefore prevent devices associated with an RMRR from participating in
2713  * the IOMMU API, which eliminates them from device assignment.
2714  *
2715  * In both cases we assume that PCI USB devices with RMRRs have them largely
2716  * for historical reasons and that the RMRR space is not actively used post
2717  * boot.  This exclusion may change if vendors begin to abuse it.
2718  *
2719  * The same exception is made for graphics devices, with the requirement that
2720  * any use of the RMRR regions will be torn down before assigning the device
2721  * to a guest.
2722  */
2723 static bool device_is_rmrr_locked(struct device *dev)
2724 {
2725         if (!device_has_rmrr(dev))
2726                 return false;
2727
2728         if (dev_is_pci(dev)) {
2729                 struct pci_dev *pdev = to_pci_dev(dev);
2730
2731                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2732                         return false;
2733         }
2734
2735         return true;
2736 }
2737
2738 static int iommu_should_identity_map(struct device *dev, int startup)
2739 {
2740
2741         if (dev_is_pci(dev)) {
2742                 struct pci_dev *pdev = to_pci_dev(dev);
2743
2744                 if (device_is_rmrr_locked(dev))
2745                         return 0;
2746
2747                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2748                         return 1;
2749
2750                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2751                         return 1;
2752
2753                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2754                         return 0;
2755
2756                 /*
2757                  * We want to start off with all devices in the 1:1 domain, and
2758                  * take them out later if we find they can't access all of memory.
2759                  *
2760                  * However, we can't do this for PCI devices behind bridges,
2761                  * because all PCI devices behind the same bridge will end up
2762                  * with the same source-id on their transactions.
2763                  *
2764                  * Practically speaking, we can't change things around for these
2765                  * devices at run-time, because we can't be sure there'll be no
2766                  * DMA transactions in flight for any of their siblings.
2767                  *
2768                  * So PCI devices (unless they're on the root bus) as well as
2769                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2770                  * the 1:1 domain, just in _case_ one of their siblings turns out
2771                  * not to be able to map all of memory.
2772                  */
2773                 if (!pci_is_pcie(pdev)) {
2774                         if (!pci_is_root_bus(pdev->bus))
2775                                 return 0;
2776                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2777                                 return 0;
2778                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2779                         return 0;
2780         } else {
2781                 if (device_has_rmrr(dev))
2782                         return 0;
2783         }
2784
2785         /*
2786          * At boot time, we don't yet know if devices will be 64-bit capable.
2787          * Assume that they will — if they turn out not to be, then we can
2788          * take them out of the 1:1 domain later.
2789          */
2790         if (!startup) {
2791                 /*
2792                  * If the device's dma_mask is less than the system's memory
2793                  * size then this is not a candidate for identity mapping.
2794                  */
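                /*
                 * For illustration, a device with a 32-bit dma_mask on a
                 * machine with more than 4GiB of RAM sees a required mask
                 * wider than 32 bits, so it is not identity mapped.
                 */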
2795                 u64 dma_mask = *dev->dma_mask;
2796
2797                 if (dev->coherent_dma_mask &&
2798                     dev->coherent_dma_mask < dma_mask)
2799                         dma_mask = dev->coherent_dma_mask;
2800
2801                 return dma_mask >= dma_get_required_mask(dev);
2802         }
2803
2804         return 1;
2805 }
2806
2807 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2808 {
2809         int ret;
2810
2811         if (!iommu_should_identity_map(dev, 1))
2812                 return 0;
2813
2814         ret = domain_add_dev_info(si_domain, dev);
2815         if (!ret)
2816                 pr_info("%s identity mapping for device %s\n",
2817                         hw ? "Hardware" : "Software", dev_name(dev));
2818         else if (ret == -ENODEV)
2819                 /* device not associated with an iommu */
2820                 ret = 0;
2821
2822         return ret;
2823 }
2824
2825
2826 static int __init iommu_prepare_static_identity_mapping(int hw)
2827 {
2828         struct pci_dev *pdev = NULL;
2829         struct dmar_drhd_unit *drhd;
2830         struct intel_iommu *iommu;
2831         struct device *dev;
2832         int i;
2833         int ret = 0;
2834
2835         for_each_pci_dev(pdev) {
2836                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2837                 if (ret)
2838                         return ret;
2839         }
2840
2841         for_each_active_iommu(iommu, drhd)
2842                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2843                         struct acpi_device_physical_node *pn;
2844                         struct acpi_device *adev;
2845
2846                         if (dev->bus != &acpi_bus_type)
2847                                 continue;
2848
2849                         adev = to_acpi_device(dev);
2850                         mutex_lock(&adev->physical_node_lock);
2851                         list_for_each_entry(pn, &adev->physical_node_list, node) {
2852                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2853                                 if (ret)
2854                                         break;
2855                         }
2856                         mutex_unlock(&adev->physical_node_lock);
2857                         if (ret)
2858                                 return ret;
2859                 }
2860
2861         return 0;
2862 }
2863
2864 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2865 {
2866         /*
2867          * Start from a sane iommu hardware state.
2868          * If queued invalidation was already initialized by us
2869          * (for example, while enabling interrupt remapping), then
2870          * things are already rolling from a sane state.
2871          */
2872         if (!iommu->qi) {
2873                 /*
2874                  * Clear any previous faults.
2875                  */
2876                 dmar_fault(-1, iommu);
2877                 /*
2878                  * Disable queued invalidation if supported and already enabled
2879                  * before OS handover.
2880                  */
2881                 dmar_disable_qi(iommu);
2882         }
2883
2884         if (dmar_enable_qi(iommu)) {
2885                 /*
2886                  * Queued Invalidate not enabled, use Register Based Invalidate
2887                  */
2888                 iommu->flush.flush_context = __iommu_flush_context;
2889                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2890                 pr_info("%s: Using Register based invalidation\n",
2891                         iommu->name);
2892         } else {
2893                 iommu->flush.flush_context = qi_flush_context;
2894                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2895                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2896         }
2897 }
2898
2899 static int copy_context_table(struct intel_iommu *iommu,
2900                               struct root_entry *old_re,
2901                               struct context_entry **tbl,
2902                               int bus, bool ext)
2903 {
2904         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2905         struct context_entry *new_ce = NULL, ce;
2906         struct context_entry *old_ce = NULL;
2907         struct root_entry re;
2908         phys_addr_t old_ce_phys;
2909
2910         tbl_idx = ext ? bus * 2 : bus;
2911         memcpy(&re, old_re, sizeof(re));
2912
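        /*
         * In extended (ECS) mode a context entry is twice as large, so one
         * 4KiB table only covers 128 devfns: 0x00-0x7f sit behind the lower
         * context-table pointer and 0x80-0xff behind the upper one.  Hence
         * the *2 in tbl_idx/idx and the two tbl[] slots per bus.
         */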
2913         for (devfn = 0; devfn < 256; devfn++) {
2914                 /* First calculate the correct index */
2915                 idx = (ext ? devfn * 2 : devfn) % 256;
2916
2917                 if (idx == 0) {
2918                         /* First save what we may have and clean up */
2919                         if (new_ce) {
2920                                 tbl[tbl_idx] = new_ce;
2921                                 __iommu_flush_cache(iommu, new_ce,
2922                                                     VTD_PAGE_SIZE);
2923                                 pos = 1;
2924                         }
2925
2926                         if (old_ce)
2927                                 iounmap(old_ce);
2928
2929                         ret = 0;
2930                         if (devfn < 0x80)
2931                                 old_ce_phys = root_entry_lctp(&re);
2932                         else
2933                                 old_ce_phys = root_entry_uctp(&re);
2934
2935                         if (!old_ce_phys) {
2936                                 if (ext && devfn == 0) {
2937                                         /* No LCTP, try UCTP */
2938                                         devfn = 0x7f;
2939                                         continue;
2940                                 } else {
2941                                         goto out;
2942                                 }
2943                         }
2944
2945                         ret = -ENOMEM;
2946                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2947                                         MEMREMAP_WB);
2948                         if (!old_ce)
2949                                 goto out;
2950
2951                         new_ce = alloc_pgtable_page(iommu->node);
2952                         if (!new_ce)
2953                                 goto out_unmap;
2954
2955                         ret = 0;
2956                 }
2957
2958                 /* Now copy the context entry */
2959                 memcpy(&ce, old_ce + idx, sizeof(ce));
2960
2961                 if (!__context_present(&ce))
2962                         continue;
2963
2964                 did = context_domain_id(&ce);
2965                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2966                         set_bit(did, iommu->domain_ids);
2967
2968                 /*
2969                  * We need a marker for copied context entries. This
2970                  * marker needs to work for the old format as well as
2971                  * for extended context entries.
2972                  *
2973                  * Bit 67 of the context entry is used. In the old
2974                  * format this bit is available to software, in the
2975                  * extended format it is the PGE bit, but PGE is ignored
2976                  * by HW if PASIDs are disabled (and thus still
2977                  * available).
2978                  *
2979                  * So disable PASIDs first and then mark the entry
2980                  * copied. This means that we don't copy PASID
2981                  * translations from the old kernel, but this is fine as
2982                  * faults there are not fatal.
2983                  */
2984                 context_clear_pasid_enable(&ce);
2985                 context_set_copied(&ce);
2986
2987                 new_ce[idx] = ce;
2988         }
2989
2990         tbl[tbl_idx + pos] = new_ce;
2991
2992         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2993
2994 out_unmap:
2995         memunmap(old_ce);
2996
2997 out:
2998         return ret;
2999 }
3000
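/*
 * Copy the translation structures left in place by the previous kernel:
 * read the old root table address from DMAR_RTADDR_REG, map it, duplicate
 * each per-bus context table via copy_context_table(), and then install
 * the copies into this kernel's root_entry table under iommu->lock.
 */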
3001 static int copy_translation_tables(struct intel_iommu *iommu)
3002 {
3003         struct context_entry **ctxt_tbls;
3004         struct root_entry *old_rt;
3005         phys_addr_t old_rt_phys;
3006         int ctxt_table_entries;
3007         unsigned long flags;
3008         u64 rtaddr_reg;
3009         int bus, ret;
3010         bool new_ext, ext;
3011
3012         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3013         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3014         new_ext    = !!ecap_ecs(iommu->ecap);
3015
3016         /*
3017          * The RTT bit can only be changed when translation is disabled,
3018          * but disabling translation means to open a window for data
3019          * corruption. So bail out and don't copy anything if we would
3020          * have to change the bit.
3021          */
3022         if (new_ext != ext)
3023                 return -EINVAL;
3024
3025         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3026         if (!old_rt_phys)
3027                 return -EINVAL;
3028
3029         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3030         if (!old_rt)
3031                 return -ENOMEM;
3032
3033         /* This is too big for the stack - allocate it from slab */
3034         ctxt_table_entries = ext ? 512 : 256;
3035         ret = -ENOMEM;
3036         ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
3037         if (!ctxt_tbls)
3038                 goto out_unmap;
3039
3040         for (bus = 0; bus < 256; bus++) {
3041                 ret = copy_context_table(iommu, &old_rt[bus],
3042                                          ctxt_tbls, bus, ext);
3043                 if (ret) {
3044                         pr_err("%s: Failed to copy context table for bus %d\n",
3045                                 iommu->name, bus);
3046                         continue;
3047                 }
3048         }
3049
3050         spin_lock_irqsave(&iommu->lock, flags);
3051
3052         /* Context tables are copied, now write them to the root_entry table */
3053         for (bus = 0; bus < 256; bus++) {
3054                 int idx = ext ? bus * 2 : bus;
3055                 u64 val;
3056
3057                 if (ctxt_tbls[idx]) {
3058                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3059                         iommu->root_entry[bus].lo = val;
3060                 }
3061
3062                 if (!ext || !ctxt_tbls[idx + 1])
3063                         continue;
3064
3065                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3066                 iommu->root_entry[bus].hi = val;
3067         }
3068
3069         spin_unlock_irqrestore(&iommu->lock, flags);
3070
3071         kfree(ctxt_tbls);
3072
3073         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3074
3075         ret = 0;
3076
3077 out_unmap:
3078         memunmap(old_rt);
3079
3080         return ret;
3081 }
3082
3083 static int __init init_dmars(void)
3084 {
3085         struct dmar_drhd_unit *drhd;
3086         struct dmar_rmrr_unit *rmrr;
3087         bool copied_tables = false;
3088         struct device *dev;
3089         struct intel_iommu *iommu;
3090         int i, ret;
3091
3092         /*
3093          * for each drhd
3094          *    allocate root
3095          *    initialize and program root entry to not present
3096          * endfor
3097          */
3098         for_each_drhd_unit(drhd) {
3099                 /*
3100                  * No lock needed: this is only incremented in the single-
3101                  * threaded kernel __init code path; all other accesses are
3102                  * read-only.
3103                  */
3104                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3105                         g_num_of_iommus++;
3106                         continue;
3107                 }
3108                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3109         }
3110
3111         /* Preallocate enough resources for IOMMU hot-addition */
3112         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3113                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3114
3115         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3116                         GFP_KERNEL);
3117         if (!g_iommus) {
3118                 pr_err("Allocating global iommu array failed\n");
3119                 ret = -ENOMEM;
3120                 goto error;
3121         }
3122
3123         deferred_flush = kzalloc(g_num_of_iommus *
3124                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
3125         if (!deferred_flush) {
3126                 ret = -ENOMEM;
3127                 goto free_g_iommus;
3128         }
3129
3130         for_each_active_iommu(iommu, drhd) {
3131                 g_iommus[iommu->seq_id] = iommu;
3132
3133                 intel_iommu_init_qi(iommu);
3134
3135                 ret = iommu_init_domains(iommu);
3136                 if (ret)
3137                         goto free_iommu;
3138
3139                 init_translation_status(iommu);
3140
3141                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3142                         iommu_disable_translation(iommu);
3143                         clear_translation_pre_enabled(iommu);
3144                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3145                                 iommu->name);
3146                 }
3147
3148                 /*
3149                  * TBD:
3150                  * we could share the same root & context tables
3151                  * among all IOMMUs. Need to split this later.
3152                  */
3153                 ret = iommu_alloc_root_entry(iommu);
3154                 if (ret)
3155                         goto free_iommu;
3156
3157                 if (translation_pre_enabled(iommu)) {
3158                         pr_info("Translation already enabled - trying to copy translation structures\n");
3159
3160                         ret = copy_translation_tables(iommu);
3161                         if (ret) {
3162                                 /*
3163                                  * We found the IOMMU with translation
3164                                  * enabled - but failed to copy over the
3165                                  * old root-entry table. Try to proceed
3166                                  * by disabling translation now and
3167                                  * allocating a clean root-entry table.
3168                                  * This might cause DMAR faults, but
3169                                  * probably the dump will still succeed.
3170                                  */
3171                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3172                                        iommu->name);
3173                                 iommu_disable_translation(iommu);
3174                                 clear_translation_pre_enabled(iommu);
3175                         } else {
3176                                 pr_info("Copied translation tables from previous kernel for %s\n",
3177                                         iommu->name);
3178                                 copied_tables = true;
3179                         }
3180                 }
3181
3182                 if (!ecap_pass_through(iommu->ecap))
3183                         hw_pass_through = 0;
3184 #ifdef CONFIG_INTEL_IOMMU_SVM
3185                 if (pasid_enabled(iommu))
3186                         intel_svm_alloc_pasid_tables(iommu);
3187 #endif
3188         }
3189
3190         /*
3191          * Now that qi is enabled on all iommus, set the root entry and flush
3192          * caches. This is required on some Intel X58 chipsets, otherwise the
3193          * flush_context function will loop forever and the boot hangs.
3194          */
3195         for_each_active_iommu(iommu, drhd) {
3196                 iommu_flush_write_buffer(iommu);
3197                 iommu_set_root_entry(iommu);
3198                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3199                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3200         }
3201
3202         if (iommu_pass_through)
3203                 iommu_identity_mapping |= IDENTMAP_ALL;
3204
3205 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3206         iommu_identity_mapping |= IDENTMAP_GFX;
3207 #endif
3208
3209         if (iommu_identity_mapping) {
3210                 ret = si_domain_init(hw_pass_through);
3211                 if (ret)
3212                         goto free_iommu;
3213         }
3214
3215         check_tylersburg_isoch();
3216
3217         /*
3218          * If we copied translations from a previous kernel in the kdump
3219          * case, we cannot assign the devices to domains now, as that
3220          * would eliminate the old mappings. So skip this part and defer
3221          * the assignment to device driver initialization time.
3222          */
3223         if (copied_tables)
3224                 goto domains_done;
3225
3226         /*
3227          * If pass-through is not set or not enabled, set up context entries for
3228          * identity mappings for RMRR, GFX and ISA devices, and possibly fall
3229          * back to static identity mapping if iommu_identity_mapping is set.
3230          */
3231         if (iommu_identity_mapping) {
3232                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3233                 if (ret) {
3234                         pr_crit("Failed to setup IOMMU pass-through\n");
3235                         goto free_iommu;
3236                 }
3237         }
3238         /*
3239          * For each rmrr
3240          *   for each dev attached to rmrr
3241          *   do
3242          *     locate drhd for dev, alloc domain for dev
3243          *     allocate free domain
3244          *     allocate page table entries for rmrr
3245          *     if context not allocated for bus
3246          *           allocate and init context
3247          *           set present in root table for this bus
3248          *     init context with domain, translation etc
3249          *   endfor
3250          * endfor
3251          */
3252         pr_info("Setting RMRR:\n");
3253         for_each_rmrr_units(rmrr) {
3254                 /* Some BIOSes list non-existent devices in the DMAR table. */
3255                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3256                                           i, dev) {
3257                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3258                         if (ret)
3259                                 pr_err("Mapping reserved region failed\n");
3260                 }
3261         }
3262
3263         iommu_prepare_isa();
3264
3265 domains_done:
3266
3267         /*
3268          * for each drhd
3269          *   enable fault log
3270          *   global invalidate context cache
3271          *   global invalidate iotlb
3272          *   enable translation
3273          */
3274         for_each_iommu(iommu, drhd) {
3275                 if (drhd->ignored) {
3276                         /*
3277                          * we always have to disable PMRs or DMA may fail on
3278                          * this device
3279                          */
3280                         if (force_on)
3281                                 iommu_disable_protect_mem_regions(iommu);
3282                         continue;
3283                 }
3284
3285                 iommu_flush_write_buffer(iommu);
3286
3287 #ifdef CONFIG_INTEL_IOMMU_SVM
3288                 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3289                         ret = intel_svm_enable_prq(iommu);
3290                         if (ret)
3291                                 goto free_iommu;
3292                 }
3293 #endif
3294                 ret = dmar_set_interrupt(iommu);
3295                 if (ret)
3296                         goto free_iommu;
3297
3298                 if (!translation_pre_enabled(iommu))
3299                         iommu_enable_translation(iommu);
3300
3301                 iommu_disable_protect_mem_regions(iommu);
3302         }
3303
3304         return 0;
3305
3306 free_iommu:
3307         for_each_active_iommu(iommu, drhd) {
3308                 disable_dmar_iommu(iommu);
3309                 free_dmar_iommu(iommu);
3310         }
3311         kfree(deferred_flush);
3312 free_g_iommus:
3313         kfree(g_iommus);
3314 error:
3315         return ret;
3316 }
3317
3318 /* This takes a number of _MM_ pages, not VTD pages */
3319 static struct iova *intel_alloc_iova(struct device *dev,
3320                                      struct dmar_domain *domain,
3321                                      unsigned long nrpages, uint64_t dma_mask)
3322 {
3323         struct iova *iova = NULL;
3324
3325         /* Restrict dma_mask to the width that the iommu can handle */
3326         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3327         /* Ensure we reserve the whole size-aligned region */
3328         nrpages = __roundup_pow_of_two(nrpages);
3329
3330         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3331                 /*
3332                  * First try to allocate an IO virtual address in
3333                  * DMA_BIT_MASK(32); if that fails, then try allocating
3334                  * from the higher range
3335                  */
3336                 iova = alloc_iova(&domain->iovad, nrpages,
3337                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
3338                 if (iova)
3339                         return iova;
3340         }
3341         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
3342         if (unlikely(!iova)) {
3343                 pr_err("Allocating %ld-page iova for %s failed\n",
3344                        nrpages, dev_name(dev));
3345                 return NULL;
3346         }
3347
3348         return iova;
3349 }
3350
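/*
 * Find or allocate a DMAR domain for @dev and set up identity mappings
 * for any RMRR regions whose device scope includes this device.
 */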
3351 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
3352 {
3353         struct dmar_rmrr_unit *rmrr;
3354         struct dmar_domain *domain;
3355         struct device *i_dev;
3356         int i, ret;
3357
3358         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3359         if (!domain) {
3360                 pr_err("Allocating domain for %s failed\n",
3361                        dev_name(dev));
3362                 return NULL;
3363         }
3364
3365         /* We have a new domain - setup possible RMRRs for the device */
3366         rcu_read_lock();
3367         for_each_rmrr_units(rmrr) {
3368                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3369                                           i, i_dev) {
3370                         if (i_dev != dev)
3371                                 continue;
3372
3373                         ret = domain_prepare_identity_map(dev, domain,
3374                                                           rmrr->base_address,
3375                                                           rmrr->end_address);
3376                         if (ret)
3377                                 dev_err(dev, "Mapping reserved region failed\n");
3378                 }
3379         }
3380         rcu_read_unlock();
3381
3382         return domain;
3383 }
3384
3385 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3386 {
3387         struct device_domain_info *info;
3388
3389         /* No lock here, assumes no domain exit in normal case */
3390         info = dev->archdata.iommu;
3391         if (likely(info))
3392                 return info->domain;
3393
3394         return __get_valid_domain_for_dev(dev);
3395 }
3396
3397 /* Check if the dev needs to go through the non-identity map and unmap process. */
3398 static int iommu_no_mapping(struct device *dev)
3399 {
3400         int found;
3401
3402         if (iommu_dummy(dev))
3403                 return 1;
3404
3405         if (!iommu_identity_mapping)
3406                 return 0;
3407
3408         found = identity_mapping(dev);
3409         if (found) {
3410                 if (iommu_should_identity_map(dev, 0))
3411                         return 1;
3412                 else {
3413                         /*
3414                          * The 32 bit DMA device is removed from si_domain and
3415                          * falls back to non-identity mapping.
3416                          */
3417                         dmar_remove_one_dev_info(si_domain, dev);
3418                         pr_info("32bit %s uses non-identity mapping\n",
3419                                 dev_name(dev));
3420                         return 0;
3421                 }
3422         } else {
3423                 /*
3424                          * When a 64 bit DMA device is detached from a VM, the
3425                          * device is put back into si_domain for identity mapping.
3426                  */
3427                 if (iommu_should_identity_map(dev, 0)) {
3428                         int ret;
3429                         ret = domain_add_dev_info(si_domain, dev);
3430                         if (!ret) {
3431                                 pr_info("64bit %s uses identity mapping\n",
3432                                         dev_name(dev));
3433                                 return 1;
3434                         }
3435                 }
3436         }
3437
3438         return 0;
3439 }
3440
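/*
 * Map @paddr/@size for @dev: allocate an IOVA range below @dma_mask, wire
 * it up in the domain's page tables with permissions derived from @dir,
 * flush the IOTLB (caching mode) or the write buffer, and return the
 * resulting DMA address, or 0 on failure.
 */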
3441 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3442                                      size_t size, int dir, u64 dma_mask)
3443 {
3444         struct dmar_domain *domain;
3445         phys_addr_t start_paddr;
3446         struct iova *iova;
3447         int prot = 0;
3448         int ret;
3449         struct intel_iommu *iommu;
3450         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3451
3452         BUG_ON(dir == DMA_NONE);
3453
3454         if (iommu_no_mapping(dev))
3455                 return paddr;
3456
3457         domain = get_valid_domain_for_dev(dev);
3458         if (!domain)
3459                 return 0;
3460
3461         iommu = domain_get_iommu(domain);
3462         size = aligned_nrpages(paddr, size);
3463
3464         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3465         if (!iova)
3466                 goto error;
3467
3468         /*
3469          * Check if DMAR supports zero-length reads on write only
3470          * mappings.
3471          */
3472         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3473                         !cap_zlr(iommu->cap))
3474                 prot |= DMA_PTE_READ;
3475         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3476                 prot |= DMA_PTE_WRITE;
3477         /*
3478          * paddr to (paddr + size) might span a partial page, so we should map
3479          * the whole page.  Note: if two parts of one page are mapped separately,
3480          * we might have two guest_addr mappings to the same host paddr, but this
3481          * is not a big problem.
3482          */
3483         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3484                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3485         if (ret)
3486                 goto error;
3487
3488         /* it's a non-present to present mapping. Only flush if caching mode */
3489         if (cap_caching_mode(iommu->cap))
3490                 iommu_flush_iotlb_psi(iommu, domain,
3491                                       mm_to_dma_pfn(iova->pfn_lo),
3492                                       size, 0, 1);
3493         else
3494                 iommu_flush_write_buffer(iommu);
3495
3496         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3497         start_paddr += paddr & ~PAGE_MASK;
3498         return start_paddr;
3499
3500 error:
3501         if (iova)
3502                 __free_iova(&domain->iovad, iova);
3503         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3504                 dev_name(dev), size, (unsigned long long)paddr, dir);
3505         return 0;
3506 }
3507
3508 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3509                                  unsigned long offset, size_t size,
3510                                  enum dma_data_direction dir,
3511                                  struct dma_attrs *attrs)
3512 {
3513         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3514                                   dir, *dev->dma_mask);
3515 }
3516
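/*
 * Drain the per-IOMMU deferred-unmap queues: flush the IOTLB (globally,
 * or per-IOVA range in caching mode), free the queued IOVAs and page
 * freelists, and reset each queue.  Called with async_umap_flush_lock held.
 */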
3517 static void flush_unmaps(void)
3518 {
3519         int i, j;
3520
3521         timer_on = 0;
3522
3523         /* just flush them all */
3524         for (i = 0; i < g_num_of_iommus; i++) {
3525                 struct intel_iommu *iommu = g_iommus[i];
3526                 if (!iommu)
3527                         continue;
3528
3529                 if (!deferred_flush[i].next)
3530                         continue;
3531
3532                 /* In caching mode, global flushes make emulation expensive */
3533                 if (!cap_caching_mode(iommu->cap))
3534                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3535                                          DMA_TLB_GLOBAL_FLUSH);
3536                 for (j = 0; j < deferred_flush[i].next; j++) {
3537                         unsigned long mask;
3538                         struct iova *iova = deferred_flush[i].iova[j];
3539                         struct dmar_domain *domain = deferred_flush[i].domain[j];
3540
3541                         /* On real hardware multiple invalidations are expensive */
3542                         if (cap_caching_mode(iommu->cap))
3543                                 iommu_flush_iotlb_psi(iommu, domain,
3544                                         iova->pfn_lo, iova_size(iova),
3545                                         !deferred_flush[i].freelist[j], 0);
3546                         else {
3547                                 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3548                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3549                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3550                         }
3551                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3552                         if (deferred_flush[i].freelist[j])
3553                                 dma_free_pagelist(deferred_flush[i].freelist[j]);
3554                 }
3555                 deferred_flush[i].next = 0;
3556         }
3557
3558         list_size = 0;
3559 }
3560
3561 static void flush_unmaps_timeout(unsigned long data)
3562 {
3563         unsigned long flags;
3564
3565         spin_lock_irqsave(&async_umap_flush_lock, flags);
3566         flush_unmaps();
3567         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3568 }
3569
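/*
 * Queue an IOVA (plus the page-table pages freed with it) on the owning
 * IOMMU's deferred-flush list; flush immediately once HIGH_WATER_MARK
 * entries are pending, otherwise arm a 10ms timer to batch the flush.
 */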
3570 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3571 {
3572         unsigned long flags;
3573         int next, iommu_id;
3574         struct intel_iommu *iommu;
3575
3576         spin_lock_irqsave(&async_umap_flush_lock, flags);
3577         if (list_size == HIGH_WATER_MARK)
3578                 flush_unmaps();
3579
3580         iommu = domain_get_iommu(dom);
3581         iommu_id = iommu->seq_id;
3582
3583         next = deferred_flush[iommu_id].next;
3584         deferred_flush[iommu_id].domain[next] = dom;
3585         deferred_flush[iommu_id].iova[next] = iova;
3586         deferred_flush[iommu_id].freelist[next] = freelist;
3587         deferred_flush[iommu_id].next++;
3588
3589         if (!timer_on) {
3590                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3591                 timer_on = 1;
3592         }
3593         list_size++;
3594         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3595 }
3596
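/*
 * Tear down the mapping at @dev_addr: look up the IOVA, unmap the covered
 * PFN range and either flush and free synchronously (intel_iommu_strict)
 * or defer the IOTLB flush and IOVA release via add_unmap().
 */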
3597 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3598 {
3599         struct dmar_domain *domain;
3600         unsigned long start_pfn, last_pfn;
3601         struct iova *iova;
3602         struct intel_iommu *iommu;
3603         struct page *freelist;
3604
3605         if (iommu_no_mapping(dev))
3606                 return;
3607
3608         domain = find_domain(dev);
3609         BUG_ON(!domain);
3610
3611         iommu = domain_get_iommu(domain);
3612
3613         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3614         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3615                       (unsigned long long)dev_addr))
3616                 return;
3617
3618         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3619         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3620
3621         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3622                  dev_name(dev), start_pfn, last_pfn);
3623
3624         freelist = domain_unmap(domain, start_pfn, last_pfn);
3625
3626         if (intel_iommu_strict) {
3627                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3628                                       last_pfn - start_pfn + 1, !freelist, 0);
3629                 /* free iova */
3630                 __free_iova(&domain->iovad, iova);
3631                 dma_free_pagelist(freelist);
3632         } else {
3633                 add_unmap(domain, iova, freelist);
3634                 /*
3635                  * queue up the release of the unmap to save the roughly 1/6th
3636                  * of the CPU time used up by the iotlb flush operation...
3637                  */
3638         }
3639 }
3640
3641 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3642                              size_t size, enum dma_data_direction dir,
3643                              struct dma_attrs *attrs)
3644 {
3645         intel_unmap(dev, dev_addr);
3646 }
3647
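/*
 * Allocate a zeroed, page-aligned coherent buffer, preferring CMA when the
 * gfp flags allow blocking, and map it DMA_BIDIRECTIONAL through
 * __intel_map_single() against the device's coherent DMA mask.
 */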
3648 static void *intel_alloc_coherent(struct device *dev, size_t size,
3649                                   dma_addr_t *dma_handle, gfp_t flags,
3650                                   struct dma_attrs *attrs)
3651 {
3652         struct page *page = NULL;
3653         int order;
3654
3655         size = PAGE_ALIGN(size);
3656         order = get_order(size);
3657
3658         if (!iommu_no_mapping(dev))
3659                 flags &= ~(GFP_DMA | GFP_DMA32);
3660         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3661                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3662                         flags |= GFP_DMA;
3663                 else
3664                         flags |= GFP_DMA32;
3665         }
3666
3667         if (gfpflags_allow_blocking(flags)) {
3668                 unsigned int count = size >> PAGE_SHIFT;
3669
3670                 page = dma_alloc_from_contiguous(dev, count, order);
3671                 if (page && iommu_no_mapping(dev) &&
3672                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3673                         dma_release_from_contiguous(dev, page, count);
3674                         page = NULL;
3675                 }
3676         }
3677
3678         if (!page)
3679                 page = alloc_pages(flags, order);
3680         if (!page)
3681                 return NULL;
3682         memset(page_address(page), 0, size);
3683
3684         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3685                                          DMA_BIDIRECTIONAL,
3686                                          dev->coherent_dma_mask);
3687         if (*dma_handle)
3688                 return page_address(page);
3689         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3690                 __free_pages(page, order);
3691
3692         return NULL;
3693 }
3694
3695 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3696                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3697 {
3698         int order;
3699         struct page *page = virt_to_page(vaddr);
3700
3701         size = PAGE_ALIGN(size);
3702         order = get_order(size);
3703
3704         intel_unmap(dev, dma_handle);
3705         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3706                 __free_pages(page, order);
3707 }
3708
3709 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3710                            int nelems, enum dma_data_direction dir,
3711                            struct dma_attrs *attrs)
3712 {
3713         intel_unmap(dev, sglist[0].dma_address);
3714 }
3715
3716 static int intel_nontranslate_map_sg(struct device *hddev,
3717         struct scatterlist *sglist, int nelems, int dir)
3718 {
3719         int i;
3720         struct scatterlist *sg;
3721
3722         for_each_sg(sglist, sg, nelems, i) {
3723                 BUG_ON(!sg_page(sg));
3724                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3725                 sg->dma_length = sg->length;
3726         }
3727         return nelems;
3728 }
3729
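/*
 * Map a scatterlist: add up the VT-d pages needed for all elements,
 * allocate one contiguous IOVA range for the whole list and map it with
 * domain_sg_mapping().  Returns the number of mapped elements, 0 on error.
 */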
3730 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3731                         enum dma_data_direction dir, struct dma_attrs *attrs)
3732 {
3733         int i;
3734         struct dmar_domain *domain;
3735         size_t size = 0;
3736         int prot = 0;
3737         struct iova *iova = NULL;
3738         int ret;
3739         struct scatterlist *sg;
3740         unsigned long start_vpfn;
3741         struct intel_iommu *iommu;
3742
3743         BUG_ON(dir == DMA_NONE);
3744         if (iommu_no_mapping(dev))
3745                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3746
3747         domain = get_valid_domain_for_dev(dev);
3748         if (!domain)
3749                 return 0;
3750
3751         iommu = domain_get_iommu(domain);
3752
3753         for_each_sg(sglist, sg, nelems, i)
3754                 size += aligned_nrpages(sg->offset, sg->length);
3755
3756         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3757                                 *dev->dma_mask);
3758         if (!iova) {
3759                 sglist->dma_length = 0;
3760                 return 0;
3761         }
3762
3763         /*
3764          * Check if DMAR supports zero-length reads on write only
3765          * mappings.
3766          */
3767         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3768                         !cap_zlr(iommu->cap))
3769                 prot |= DMA_PTE_READ;
3770         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3771                 prot |= DMA_PTE_WRITE;
3772
3773         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3774
3775         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3776         if (unlikely(ret)) {
3777                 dma_pte_free_pagetable(domain, start_vpfn,
3778                                        start_vpfn + size - 1);
3779                 __free_iova(&domain->iovad, iova);
3780                 return 0;
3781         }
3782
3783         /* it's a non-present to present mapping. Only flush if caching mode */
3784         if (cap_caching_mode(iommu->cap))
3785                 iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3786         else
3787                 iommu_flush_write_buffer(iommu);
3788
3789         return nelems;
3790 }
3791
3792 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3793 {
3794         return !dma_addr;
3795 }
3796
3797 struct dma_map_ops intel_dma_ops = {
3798         .alloc = intel_alloc_coherent,
3799         .free = intel_free_coherent,
3800         .map_sg = intel_map_sg,
3801         .unmap_sg = intel_unmap_sg,
3802         .map_page = intel_map_page,
3803         .unmap_page = intel_unmap_page,
3804         .mapping_error = intel_mapping_error,
3805 };
3806
3807 static inline int iommu_domain_cache_init(void)
3808 {
3809         int ret = 0;
3810
3811         iommu_domain_cache = kmem_cache_create("iommu_domain",
3812                                          sizeof(struct dmar_domain),
3813                                          0,
3814                                          SLAB_HWCACHE_ALIGN,
3816                                          NULL);
3817         if (!iommu_domain_cache) {
3818                 pr_err("Couldn't create iommu_domain cache\n");
3819                 ret = -ENOMEM;
3820         }
3821
3822         return ret;
3823 }
3824
3825 static inline int iommu_devinfo_cache_init(void)
3826 {
3827         int ret = 0;
3828
3829         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3830                                          sizeof(struct device_domain_info),
3831                                          0,
3832                                          SLAB_HWCACHE_ALIGN,
3833                                          NULL);
3834         if (!iommu_devinfo_cache) {
3835                 pr_err("Couldn't create devinfo cache\n");
3836                 ret = -ENOMEM;
3837         }
3838
3839         return ret;
3840 }
3841
3842 static int __init iommu_init_mempool(void)
3843 {
3844         int ret;
3845         ret = iova_cache_get();
3846         if (ret)
3847                 return ret;
3848
3849         ret = iommu_domain_cache_init();
3850         if (ret)
3851                 goto domain_error;
3852
3853         ret = iommu_devinfo_cache_init();
3854         if (!ret)
3855                 return ret;
3856
3857         kmem_cache_destroy(iommu_domain_cache);
3858 domain_error:
3859         iova_cache_put();
3860
3861         return -ENOMEM;
3862 }
3863
3864 static void __init iommu_exit_mempool(void)
3865 {
3866         kmem_cache_destroy(iommu_devinfo_cache);
3867         kmem_cache_destroy(iommu_domain_cache);
3868         iova_cache_put();
3869 }
3870
3871 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3872 {
3873         struct dmar_drhd_unit *drhd;
3874         u32 vtbar;
3875         int rc;
3876
3877         /* We know that this device on this chipset has its own IOMMU.
3878          * If we find it under a different IOMMU, then the BIOS is lying
3879          * to us. Hope that the IOMMU for this device is actually
3880          * disabled, and it needs no translation...
3881          */
3882         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3883         if (rc) {
3884                 /* "can't" happen */
3885                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3886                 return;
3887         }
3888         vtbar &= 0xffff0000;
3889
3890                 /* We know that this IOMMU should be at offset 0xa000 from vtbar */
3891         drhd = dmar_find_matched_drhd_unit(pdev);
3892         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3893                             TAINT_FIRMWARE_WORKAROUND,
3894                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3895                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3896 }
3897 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3898
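/*
 * Mark DRHD units that should be bypassed: units whose device scope
 * contains no active devices, and graphics-only units when dmar_map_gfx
 * is clear (their devices then get the dummy domain info).
 */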
3899 static void __init init_no_remapping_devices(void)
3900 {
3901         struct dmar_drhd_unit *drhd;
3902         struct device *dev;
3903         int i;
3904
3905         for_each_drhd_unit(drhd) {
3906                 if (!drhd->include_all) {
3907                         for_each_active_dev_scope(drhd->devices,
3908                                                   drhd->devices_cnt, i, dev)
3909                                 break;
3910                         /* ignore DMAR unit if no devices exist */
3911                         if (i == drhd->devices_cnt)
3912                                 drhd->ignored = 1;
3913                 }
3914         }
3915
3916         for_each_active_drhd_unit(drhd) {
3917                 if (drhd->include_all)
3918                         continue;
3919
3920                 for_each_active_dev_scope(drhd->devices,
3921                                           drhd->devices_cnt, i, dev)
3922                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3923                                 break;
3924                 if (i < drhd->devices_cnt)
3925                         continue;
3926
3927                 /* This IOMMU has *only* gfx devices. Either bypass it or
3928                    set the gfx_mapped flag, as appropriate */
3929                 if (dmar_map_gfx) {
3930                         intel_iommu_gfx_mapped = 1;
3931                 } else {
3932                         drhd->ignored = 1;
3933                         for_each_active_dev_scope(drhd->devices,
3934                                                   drhd->devices_cnt, i, dev)
3935                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3936                 }
3937         }
3938 }
3939
3940 #ifdef CONFIG_SUSPEND
3941 static int init_iommu_hw(void)
3942 {
3943         struct dmar_drhd_unit *drhd;
3944         struct intel_iommu *iommu = NULL;
3945
3946         for_each_active_iommu(iommu, drhd)
3947                 if (iommu->qi)
3948                         dmar_reenable_qi(iommu);
3949
3950         for_each_iommu(iommu, drhd) {
3951                 if (drhd->ignored) {
3952                         /*
3953                          * we always have to disable PMRs or DMA may fail on
3954                          * this device
3955                          */
3956                         if (force_on)
3957                                 iommu_disable_protect_mem_regions(iommu);
3958                         continue;
3959                 }
3960         
3961
3962
3963                 iommu_set_root_entry(iommu);
3964
3965                 iommu->flush.flush_context(iommu, 0, 0, 0,
3966                                            DMA_CCMD_GLOBAL_INVL);
3967                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3968                 iommu_enable_translation(iommu);
3969                 iommu_disable_protect_mem_regions(iommu);
3970         }
3971
3972         return 0;
3973 }
3974
3975 static void iommu_flush_all(void)
3976 {
3977         struct dmar_drhd_unit *drhd;
3978         struct intel_iommu *iommu;
3979
3980         for_each_active_iommu(iommu, drhd) {
3981                 iommu->flush.flush_context(iommu, 0, 0, 0,
3982                                            DMA_CCMD_GLOBAL_INVL);
3983                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3984                                          DMA_TLB_GLOBAL_FLUSH);
3985         }
3986 }
3987
3988 static int iommu_suspend(void)
3989 {
3990         struct dmar_drhd_unit *drhd;
3991         struct intel_iommu *iommu = NULL;
3992         unsigned long flag;
3993
3994         for_each_active_iommu(iommu, drhd) {
3995                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3996                                                  GFP_ATOMIC);
3997                 if (!iommu->iommu_state)
3998                         goto nomem;
3999         }
4000
4001         iommu_flush_all();
4002
4003         for_each_active_iommu(iommu, drhd) {
4004                 iommu_disable_translation(iommu);
4005
4006                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4007
4008                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4009                         readl(iommu->reg + DMAR_FECTL_REG);
4010                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4011                         readl(iommu->reg + DMAR_FEDATA_REG);
4012                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4013                         readl(iommu->reg + DMAR_FEADDR_REG);
4014                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4015                         readl(iommu->reg + DMAR_FEUADDR_REG);
4016
4017                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4018         }
4019         return 0;
4020
4021 nomem:
4022         for_each_active_iommu(iommu, drhd)
4023                 kfree(iommu->iommu_state);
4024
4025         return -ENOMEM;
4026 }
4027
4028 static void iommu_resume(void)
4029 {
4030         struct dmar_drhd_unit *drhd;
4031         struct intel_iommu *iommu = NULL;
4032         unsigned long flag;
4033
4034         if (init_iommu_hw()) {
4035                 if (force_on)
4036                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4037                 else
4038                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4039                 return;
4040         }
4041
4042         for_each_active_iommu(iommu, drhd) {
4043
4044                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4045
4046                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4047                         iommu->reg + DMAR_FECTL_REG);
4048                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4049                         iommu->reg + DMAR_FEDATA_REG);
4050                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4051                         iommu->reg + DMAR_FEADDR_REG);
4052                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4053                         iommu->reg + DMAR_FEUADDR_REG);
4054
4055                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4056         }
4057
4058         for_each_active_iommu(iommu, drhd)
4059                 kfree(iommu->iommu_state);
4060 }
4061
4062 static struct syscore_ops iommu_syscore_ops = {
4063         .resume         = iommu_resume,
4064         .suspend        = iommu_suspend,
4065 };
4066
4067 static void __init init_iommu_pm_ops(void)
4068 {
4069         register_syscore_ops(&iommu_syscore_ops);
4070 }
4071
4072 #else
4073 static inline void init_iommu_pm_ops(void) {}
4074 #endif  /* CONFIG_SUSPEND */
4075
4076
4077 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4078 {
4079         struct acpi_dmar_reserved_memory *rmrr;
4080         struct dmar_rmrr_unit *rmrru;
4081
4082         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4083         if (!rmrru)
4084                 return -ENOMEM;
4085
4086         rmrru->hdr = header;
4087         rmrr = (struct acpi_dmar_reserved_memory *)header;
4088         rmrru->base_address = rmrr->base_address;
4089         rmrru->end_address = rmrr->end_address;
4090         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4091                                 ((void *)rmrr) + rmrr->header.length,
4092                                 &rmrru->devices_cnt);
4093         if (rmrru->devices_cnt && rmrru->devices == NULL) {
4094                 kfree(rmrru);
4095                 return -ENOMEM;
4096         }
4097
4098         list_add(&rmrru->list, &dmar_rmrr_units);
4099
4100         return 0;
4101 }
4102
4103 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4104 {
4105         struct dmar_atsr_unit *atsru;
4106         struct acpi_dmar_atsr *tmp;
4107
4108         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4109                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4110                 if (atsr->segment != tmp->segment)
4111                         continue;
4112                 if (atsr->header.length != tmp->header.length)
4113                         continue;
4114                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4115                         return atsru;
4116         }
4117
4118         return NULL;
4119 }
4120
4121 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4122 {
4123         struct acpi_dmar_atsr *atsr;
4124         struct dmar_atsr_unit *atsru;
4125
4126         if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
4127                 return 0;
4128
4129         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4130         atsru = dmar_find_atsr(atsr);
4131         if (atsru)
4132                 return 0;
4133
4134         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4135         if (!atsru)
4136                 return -ENOMEM;
4137
4138         /*
4139          * If memory is allocated from slab by ACPI _DSM method, we need to
4140          * copy the memory content because the memory buffer will be freed
4141          * on return.
4142          */
4143         atsru->hdr = (void *)(atsru + 1);
4144         memcpy(atsru->hdr, hdr, hdr->length);
4145         atsru->include_all = atsr->flags & 0x1;
4146         if (!atsru->include_all) {
4147                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4148                                 (void *)atsr + atsr->header.length,
4149                                 &atsru->devices_cnt);
4150                 if (atsru->devices_cnt && atsru->devices == NULL) {
4151                         kfree(atsru);
4152                         return -ENOMEM;
4153                 }
4154         }
4155
4156         list_add_rcu(&atsru->list, &dmar_atsr_units);
4157
4158         return 0;
4159 }
4160
4161 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4162 {
4163         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4164         kfree(atsru);
4165 }
4166
4167 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4168 {
4169         struct acpi_dmar_atsr *atsr;
4170         struct dmar_atsr_unit *atsru;
4171
4172         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4173         atsru = dmar_find_atsr(atsr);
4174         if (atsru) {
4175                 list_del_rcu(&atsru->list);
4176                 synchronize_rcu();
4177                 intel_iommu_free_atsr(atsru);
4178         }
4179
4180         return 0;
4181 }
4182
4183 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4184 {
4185         int i;
4186         struct device *dev;
4187         struct acpi_dmar_atsr *atsr;
4188         struct dmar_atsr_unit *atsru;
4189
4190         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4191         atsru = dmar_find_atsr(atsr);
4192         if (!atsru)
4193                 return 0;
4194
4195         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4196                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4197                                           i, dev)
4198                         return -EBUSY;
4199         }
4200
4201         return 0;
4202 }
4203
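/*
 * Bring up a hot-added DMAR unit: check its capabilities against the
 * running configuration, allocate domains and a root entry, then enable
 * QI, fault reporting and translation much like init_dmars() does at boot.
 */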
4204 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4205 {
4206         int sp, ret = 0;
4207         struct intel_iommu *iommu = dmaru->iommu;
4208
4209         if (g_iommus[iommu->seq_id])
4210                 return 0;
4211
4212         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4213                 pr_warn("%s: Doesn't support hardware pass through.\n",
4214                         iommu->name);
4215                 return -ENXIO;
4216         }
4217         if (!ecap_sc_support(iommu->ecap) &&
4218             domain_update_iommu_snooping(iommu)) {
4219                 pr_warn("%s: Doesn't support snooping.\n",
4220                         iommu->name);
4221                 return -ENXIO;
4222         }
4223         sp = domain_update_iommu_superpage(iommu) - 1;
4224         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4225                 pr_warn("%s: Doesn't support large page.\n",
4226                         iommu->name);
4227                 return -ENXIO;
4228         }
4229
4230         /*
4231          * Disable translation if already enabled prior to OS handover.
4232          */
4233         if (iommu->gcmd & DMA_GCMD_TE)
4234                 iommu_disable_translation(iommu);
4235
4236         g_iommus[iommu->seq_id] = iommu;
4237         ret = iommu_init_domains(iommu);
4238         if (ret == 0)
4239                 ret = iommu_alloc_root_entry(iommu);
4240         if (ret)
4241                 goto out;
4242
4243 #ifdef CONFIG_INTEL_IOMMU_SVM
4244         if (pasid_enabled(iommu))
4245                 intel_svm_alloc_pasid_tables(iommu);
4246 #endif
4247
4248         if (dmaru->ignored) {
4249                 /*
4250                  * we always have to disable PMRs or DMA may fail on this device
4251                  */
4252                 if (force_on)
4253                         iommu_disable_protect_mem_regions(iommu);
4254                 return 0;
4255         }
4256
4257         intel_iommu_init_qi(iommu);
4258         iommu_flush_write_buffer(iommu);
4259
4260 #ifdef CONFIG_INTEL_IOMMU_SVM
4261         if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4262                 ret = intel_svm_enable_prq(iommu);
4263                 if (ret)
4264                         goto disable_iommu;
4265         }
4266 #endif
4267         ret = dmar_set_interrupt(iommu);
4268         if (ret)
4269                 goto disable_iommu;
4270
4271         iommu_set_root_entry(iommu);
4272         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4273         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4274         iommu_enable_translation(iommu);
4275
4276         iommu_disable_protect_mem_regions(iommu);
4277         return 0;
4278
4279 disable_iommu:
4280         disable_dmar_iommu(iommu);
4281 out:
4282         free_dmar_iommu(iommu);
4283         return ret;
4284 }
4285
4286 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4287 {
4288         int ret = 0;
4289         struct intel_iommu *iommu = dmaru->iommu;
4290
4291         if (!intel_iommu_enabled)
4292                 return 0;
4293         if (iommu == NULL)
4294                 return -EINVAL;
4295
4296         if (insert) {
4297                 ret = intel_iommu_add(dmaru);
4298         } else {
4299                 disable_dmar_iommu(iommu);
4300                 free_dmar_iommu(iommu);
4301         }
4302
4303         return ret;
4304 }
4305
4306 static void intel_iommu_free_dmars(void)
4307 {
4308         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4309         struct dmar_atsr_unit *atsru, *atsr_n;
4310
4311         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4312                 list_del(&rmrru->list);
4313                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4314                 kfree(rmrru);
4315         }
4316
4317         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4318                 list_del(&atsru->list);
4319                 intel_iommu_free_atsr(atsru);
4320         }
4321 }
4322
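/*
 * Return 1 if ATS is allowed for @dev: integrated (root bus) devices
 * always qualify; otherwise walk up to the PCIe root port and check
 * whether it is listed in (or covered by an include-all) ATSR.
 */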
4323 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4324 {
4325         int i, ret = 1;
4326         struct pci_bus *bus;
4327         struct pci_dev *bridge = NULL;
4328         struct device *tmp;
4329         struct acpi_dmar_atsr *atsr;
4330         struct dmar_atsr_unit *atsru;
4331
4332         dev = pci_physfn(dev);
4333         for (bus = dev->bus; bus; bus = bus->parent) {
4334                 bridge = bus->self;
4335                 /* If it's an integrated device, allow ATS */
4336                 if (!bridge)
4337                         return 1;
4338                 /* Connected via non-PCIe: no ATS */
4339                 if (!pci_is_pcie(bridge) ||
4340                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4341                         return 0;
4342                 /* If we found the root port, look it up in the ATSR */
4343                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4344                         break;
4345         }
4346
4347         rcu_read_lock();
4348         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4349                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4350                 if (atsr->segment != pci_domain_nr(dev->bus))
4351                         continue;
4352
4353                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4354                         if (tmp == &bridge->dev)
4355                                 goto out;
4356
4357                 if (atsru->include_all)
4358                         goto out;
4359         }
4360         ret = 0;
4361 out:
4362         rcu_read_unlock();
4363
4364         return ret;
4365 }
4366
4367 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4368 {
4369         int ret = 0;
4370         struct dmar_rmrr_unit *rmrru;
4371         struct dmar_atsr_unit *atsru;
4372         struct acpi_dmar_atsr *atsr;
4373         struct acpi_dmar_reserved_memory *rmrr;
4374
4375         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
4376                 return 0;
4377
4378         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4379                 rmrr = container_of(rmrru->hdr,
4380                                     struct acpi_dmar_reserved_memory, header);
4381                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4382                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4383                                 ((void *)rmrr) + rmrr->header.length,
4384                                 rmrr->segment, rmrru->devices,
4385                                 rmrru->devices_cnt);
4386                         if (ret < 0)
4387                                 return ret;
4388                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4389                         dmar_remove_dev_scope(info, rmrr->segment,
4390                                 rmrru->devices, rmrru->devices_cnt);
4391                 }
4392         }
4393
4394         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4395                 if (atsru->include_all)
4396                         continue;
4397
4398                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4399                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4400                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4401                                         (void *)atsr + atsr->header.length,
4402                                         atsr->segment, atsru->devices,
4403                                         atsru->devices_cnt);
4404                         if (ret > 0)
4405                                 break;
4406                         else if (ret < 0)
4407                                 return ret;
4408                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4409                         if (dmar_remove_dev_scope(info, atsr->segment,
4410                                         atsru->devices, atsru->devices_cnt))
4411                                 break;
4412                 }
4413         }
4414
4415         return 0;
4416 }
4417
4418 /*
4419  * Here we only respond to the action of a device being unbound from its driver.
4420  *
4421  * An added device is not attached to its DMAR domain here yet. That will happen
4422  * when the device is mapped to an iova.
4423  */
4424 static int device_notifier(struct notifier_block *nb,
4425                                   unsigned long action, void *data)
4426 {
4427         struct device *dev = data;
4428         struct dmar_domain *domain;
4429
4430         if (iommu_dummy(dev))
4431                 return 0;
4432
4433         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4434                 return 0;
4435
4436         domain = find_domain(dev);
4437         if (!domain)
4438                 return 0;
4439
4440         dmar_remove_one_dev_info(domain, dev);
4441         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4442                 domain_exit(domain);
4443
4444         return 0;
4445 }
4446
4447 static struct notifier_block device_nb = {
4448         .notifier_call = device_notifier,
4449 };
4450
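/*
 * Keep the static identity (si) domain in sync with memory hotplug:
 * extend the identity map when memory goes online, and unmap and flush
 * the corresponding IOVA range when it goes offline again.
 */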
4451 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4452                                        unsigned long val, void *v)
4453 {
4454         struct memory_notify *mhp = v;
4455         unsigned long long start, end;
4456         unsigned long start_vpfn, last_vpfn;
4457
4458         switch (val) {
4459         case MEM_GOING_ONLINE:
4460                 start = mhp->start_pfn << PAGE_SHIFT;
4461                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4462                 if (iommu_domain_identity_map(si_domain, start, end)) {
4463                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4464                                 start, end);
4465                         return NOTIFY_BAD;
4466                 }
4467                 break;
4468
4469         case MEM_OFFLINE:
4470         case MEM_CANCEL_ONLINE:
4471                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4472                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4473                 while (start_vpfn <= last_vpfn) {
4474                         struct iova *iova;
4475                         struct dmar_drhd_unit *drhd;
4476                         struct intel_iommu *iommu;
4477                         struct page *freelist;
4478
4479                         iova = find_iova(&si_domain->iovad, start_vpfn);
4480                         if (iova == NULL) {
4481                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4482                                          start_vpfn);
4483                                 break;
4484                         }
4485
4486                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4487                                                      start_vpfn, last_vpfn);
4488                         if (iova == NULL) {
4489                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4490                                         start_vpfn, last_vpfn);
4491                                 return NOTIFY_BAD;
4492                         }
4493
4494                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4495                                                iova->pfn_hi);
4496
4497                         rcu_read_lock();
4498                         for_each_active_iommu(iommu, drhd)
4499                                 iommu_flush_iotlb_psi(iommu, si_domain,
4500                                         iova->pfn_lo, iova_size(iova),
4501                                         !freelist, 0);
4502                         rcu_read_unlock();
4503                         dma_free_pagelist(freelist);
4504
4505                         start_vpfn = iova->pfn_hi + 1;
4506                         free_iova_mem(iova);
4507                 }
4508                 break;
4509         }
4510
4511         return NOTIFY_OK;
4512 }
4513
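/*
 * NB: this notifier only matters for the static identity (si) domain;
 * intel_iommu_init() below registers it only when si_domain is set up
 * and hardware pass-through is not in use.
 */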
4514 static struct notifier_block intel_iommu_memory_nb = {
4515         .notifier_call = intel_iommu_memory_notifier,
4516         .priority = 0
4517 };
4518
4519
4520 static ssize_t intel_iommu_show_version(struct device *dev,
4521                                         struct device_attribute *attr,
4522                                         char *buf)
4523 {
4524         struct intel_iommu *iommu = dev_get_drvdata(dev);
4525         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4526         return sprintf(buf, "%d:%d\n",
4527                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4528 }
4529 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4530
4531 static ssize_t intel_iommu_show_address(struct device *dev,
4532                                         struct device_attribute *attr,
4533                                         char *buf)
4534 {
4535         struct intel_iommu *iommu = dev_get_drvdata(dev);
4536         return sprintf(buf, "%llx\n", iommu->reg_phys);
4537 }
4538 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4539
4540 static ssize_t intel_iommu_show_cap(struct device *dev,
4541                                     struct device_attribute *attr,
4542                                     char *buf)
4543 {
4544         struct intel_iommu *iommu = dev_get_drvdata(dev);
4545         return sprintf(buf, "%llx\n", iommu->cap);
4546 }
4547 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4548
4549 static ssize_t intel_iommu_show_ecap(struct device *dev,
4550                                     struct device_attribute *attr,
4551                                     char *buf)
4552 {
4553         struct intel_iommu *iommu = dev_get_drvdata(dev);
4554         return sprintf(buf, "%llx\n", iommu->ecap);
4555 }
4556 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4557
4558 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4559                                       struct device_attribute *attr,
4560                                       char *buf)
4561 {
4562         struct intel_iommu *iommu = dev_get_drvdata(dev);
4563         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4564 }
4565 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4566
4567 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4568                                            struct device_attribute *attr,
4569                                            char *buf)
4570 {
4571         struct intel_iommu *iommu = dev_get_drvdata(dev);
4572         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4573                                                   cap_ndoms(iommu->cap)));
4574 }
4575 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4576
4577 static struct attribute *intel_iommu_attrs[] = {
4578         &dev_attr_version.attr,
4579         &dev_attr_address.attr,
4580         &dev_attr_cap.attr,
4581         &dev_attr_ecap.attr,
4582         &dev_attr_domains_supported.attr,
4583         &dev_attr_domains_used.attr,
4584         NULL,
4585 };
4586
4587 static struct attribute_group intel_iommu_group = {
4588         .name = "intel-iommu",
4589         .attrs = intel_iommu_attrs,
4590 };
4591
4592 const struct attribute_group *intel_iommu_groups[] = {
4593         &intel_iommu_group,
4594         NULL,
4595 };
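/*
 * intel_iommu_init() passes this group array to iommu_device_create(),
 * so these attributes typically show up in sysfs under the per-unit
 * iommu class device (e.g. .../iommu/dmar0/intel-iommu/version).
 */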
4596
4597 int __init intel_iommu_init(void)
4598 {
4599         int ret = -ENODEV;
4600         struct dmar_drhd_unit *drhd;
4601         struct intel_iommu *iommu;
4602
4603         /* VT-d is required for a TXT/tboot launch, so enforce that */
4604         force_on = tboot_force_iommu();
4605
4606         if (iommu_init_mempool()) {
4607                 if (force_on)
4608                         panic("tboot: Failed to initialize iommu memory\n");
4609                 return -ENOMEM;
4610         }
4611
4612         down_write(&dmar_global_lock);
4613         if (dmar_table_init()) {
4614                 if (force_on)
4615                         panic("tboot: Failed to initialize DMAR table\n");
4616                 goto out_free_dmar;
4617         }
4618
4619         if (dmar_dev_scope_init() < 0) {
4620                 if (force_on)
4621                         panic("tboot: Failed to initialize DMAR device scope\n");
4622                 goto out_free_dmar;
4623         }
4624
4625         if (no_iommu || dmar_disabled)
4626                 goto out_free_dmar;
4627
4628         if (list_empty(&dmar_rmrr_units))
4629                 pr_info("No RMRR found\n");
4630
4631         if (list_empty(&dmar_atsr_units))
4632                 pr_info("No ATSR found\n");
4633
4634         if (dmar_init_reserved_ranges()) {
4635                 if (force_on)
4636                         panic("tboot: Failed to reserve iommu ranges\n");
4637                 goto out_free_reserved_range;
4638         }
4639
4640         init_no_remapping_devices();
4641
4642         ret = init_dmars();
4643         if (ret) {
4644                 if (force_on)
4645                         panic("tboot: Failed to initialize DMARs\n");
4646                 pr_err("Initialization failed\n");
4647                 goto out_free_reserved_range;
4648         }
4649         up_write(&dmar_global_lock);
4650         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4651
4652         init_timer(&unmap_timer);
4653 #ifdef CONFIG_SWIOTLB
4654         swiotlb = 0;
4655 #endif
4656         dma_ops = &intel_dma_ops;
4657
4658         init_iommu_pm_ops();
4659
4660         for_each_active_iommu(iommu, drhd)
4661                 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4662                                                        intel_iommu_groups,
4663                                                        "%s", iommu->name);
4664
4665         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4666         bus_register_notifier(&pci_bus_type, &device_nb);
4667         if (si_domain && !hw_pass_through)
4668                 register_memory_notifier(&intel_iommu_memory_nb);
4669
4670         intel_iommu_enabled = 1;
4671
4672         return 0;
4673
4674 out_free_reserved_range:
4675         put_iova_domain(&reserved_iova_list);
4676 out_free_dmar:
4677         intel_iommu_free_dmars();
4678         up_write(&dmar_global_lock);
4679         iommu_exit_mempool();
4680         return ret;
4681 }
4682
4683 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4684 {
4685         struct intel_iommu *iommu = opaque;
4686
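        /* The 16-bit DMA alias packs the bus number in the upper byte and
           devfn in the lower byte, hence PCI_BUS_NUM(alias) and alias & 0xff. */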
4687         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4688         return 0;
4689 }
4690
4691 /*
4692  * NB - intel-iommu lacks any sort of reference counting for the users of
4693  * dependent devices.  If multiple endpoints have intersecting dependent
4694  * devices, unbinding the driver from any one of them will possibly leave
4695  * the others unable to operate.
4696  */
4697 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4698 {
4699         if (!iommu || !dev || !dev_is_pci(dev))
4700                 return;
4701
4702         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4703 }
4704
4705 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4706 {
4707         struct intel_iommu *iommu;
4708         unsigned long flags;
4709
4710         assert_spin_locked(&device_domain_lock);
4711
4712         if (WARN_ON(!info))
4713                 return;
4714
4715         iommu = info->iommu;
4716
4717         if (info->dev) {
4718                 iommu_disable_dev_iotlb(info);
4719                 domain_context_clear(iommu, info->dev);
4720         }
4721
4722         unlink_domain_info(info);
4723
4724         spin_lock_irqsave(&iommu->lock, flags);
4725         domain_detach_iommu(info->domain, iommu);
4726         spin_unlock_irqrestore(&iommu->lock, flags);
4727
4728         free_devinfo_mem(info);
4729 }
4730
4731 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4732                                      struct device *dev)
4733 {
4734         struct device_domain_info *info;
4735         unsigned long flags;
4736
4737         spin_lock_irqsave(&device_domain_lock, flags);
4738         info = dev->archdata.iommu;
4739         __dmar_remove_one_dev_info(info);
4740         spin_unlock_irqrestore(&device_domain_lock, flags);
4741 }
4742
4743 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4744 {
4745         int adjust_width;
4746
4747         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4748                         DMA_32BIT_PFN);
4749         domain_reserve_special_ranges(domain);
4750
4751         /* calculate AGAW */
4752         domain->gaw = guest_width;
4753         adjust_width = guestwidth_to_adjustwidth(guest_width);
4754         domain->agaw = width_to_agaw(adjust_width);
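        /*
         * Worked example, assuming the usual 48-bit guest width: 48 - 12 is
         * a multiple of the 9-bit level stride, so adjust_width stays 48 and
         * width_to_agaw() returns 2, i.e. a 4-level page table.
         */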
4755
4756         domain->iommu_coherency = 0;
4757         domain->iommu_snooping = 0;
4758         domain->iommu_superpage = 0;
4759         domain->max_addr = 0;
4760
4761         /* always allocate the top pgd */
4762         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4763         if (!domain->pgd)
4764                 return -ENOMEM;
4765         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4766         return 0;
4767 }
4768
4769 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4770 {
4771         struct dmar_domain *dmar_domain;
4772         struct iommu_domain *domain;
4773
4774         if (type != IOMMU_DOMAIN_UNMANAGED)
4775                 return NULL;
4776
4777         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4778         if (!dmar_domain) {
4779                 pr_err("Can't allocate dmar_domain\n");
4780                 return NULL;
4781         }
4782         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4783                 pr_err("Domain initialization failed\n");
4784                 domain_exit(dmar_domain);
4785                 return NULL;
4786         }
4787         domain_update_iommu_cap(dmar_domain);
4788
4789         domain = &dmar_domain->domain;
4790         domain->geometry.aperture_start = 0;
4791         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
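        /* e.g. with the default 48-bit width, aperture_end is (1ULL << 48) - 1 */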
4792         domain->geometry.force_aperture = true;
4793
4794         return domain;
4795 }
4796
4797 static void intel_iommu_domain_free(struct iommu_domain *domain)
4798 {
4799         domain_exit(to_dmar_domain(domain));
4800 }
4801
4802 static int intel_iommu_attach_device(struct iommu_domain *domain,
4803                                      struct device *dev)
4804 {
4805         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4806         struct intel_iommu *iommu;
4807         int addr_width;
4808         u8 bus, devfn;
4809
4810         if (device_is_rmrr_locked(dev)) {
4811                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4812                 return -EPERM;
4813         }
4814
4815         /* normally dev is not mapped */
4816         if (unlikely(domain_context_mapped(dev))) {
4817                 struct dmar_domain *old_domain;
4818
4819                 old_domain = find_domain(dev);
4820                 if (old_domain) {
4821                         rcu_read_lock();
4822                         dmar_remove_one_dev_info(old_domain, dev);
4823                         rcu_read_unlock();
4824
4825                         if (!domain_type_is_vm_or_si(old_domain) &&
4826                              list_empty(&old_domain->devices))
4827                                 domain_exit(old_domain);
4828                 }
4829         }
4830
4831         iommu = device_to_iommu(dev, &bus, &devfn);
4832         if (!iommu)
4833                 return -ENODEV;
4834
4835         /* check if this iommu agaw is sufficient for max mapped address */
4836         addr_width = agaw_to_width(iommu->agaw);
4837         if (addr_width > cap_mgaw(iommu->cap))
4838                 addr_width = cap_mgaw(iommu->cap);
4839
4840         if (dmar_domain->max_addr > (1LL << addr_width)) {
4841                 pr_err("%s: iommu width (%d) is not "
4842                        "sufficient for the mapped address (%llx)\n",
4843                        __func__, addr_width, dmar_domain->max_addr);
4844                 return -EFAULT;
4845         }
4846         dmar_domain->gaw = addr_width;
4847
4848         /*
4849          * Knock out extra levels of page tables if necessary
4850          */
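        /*
         * The width check against cap_mgaw() above means everything mapped so
         * far fits below the IOMMU's address width, so only entry 0 of each
         * surplus top-level table can be populated; descending into it and
         * freeing the old top level is therefore safe.
         */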
4851         while (iommu->agaw < dmar_domain->agaw) {
4852                 struct dma_pte *pte;
4853
4854                 pte = dmar_domain->pgd;
4855                 if (dma_pte_present(pte)) {
4856                         dmar_domain->pgd = (struct dma_pte *)
4857                                 phys_to_virt(dma_pte_addr(pte));
4858                         free_pgtable_page(pte);
4859                 }
4860                 dmar_domain->agaw--;
4861         }
4862
4863         return domain_add_dev_info(dmar_domain, dev);
4864 }
4865
4866 static void intel_iommu_detach_device(struct iommu_domain *domain,
4867                                       struct device *dev)
4868 {
4869         dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
4870 }
4871
4872 static int intel_iommu_map(struct iommu_domain *domain,
4873                            unsigned long iova, phys_addr_t hpa,
4874                            size_t size, int iommu_prot)
4875 {
4876         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4877         u64 max_addr;
4878         int prot = 0;
4879         int ret;
4880
4881         if (iommu_prot & IOMMU_READ)
4882                 prot |= DMA_PTE_READ;
4883         if (iommu_prot & IOMMU_WRITE)
4884                 prot |= DMA_PTE_WRITE;
4885         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4886                 prot |= DMA_PTE_SNP;
4887
4888         max_addr = iova + size;
4889         if (dmar_domain->max_addr < max_addr) {
4890                 u64 end;
4891
4892                 /* check if minimum agaw is sufficient for mapped address */
4893                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4894                 if (end < max_addr) {
4895                         pr_err("%s: iommu width (%d) is not "
4896                                "sufficient for the mapped address (%llx)\n",
4897                                __func__, dmar_domain->gaw, max_addr);
4898                         return -EFAULT;
4899                 }
4900                 dmar_domain->max_addr = max_addr;
4901         }
4902         /* Round up size to next multiple of PAGE_SIZE, if it and
4903            the low bits of hpa would take us onto the next page */
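        /* e.g. hpa with low bits 0x800 and size 0x1000 crosses a page
           boundary, so aligned_nrpages() yields two 4KiB pages here */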
4904         size = aligned_nrpages(hpa, size);
4905         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4906                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4907         return ret;
4908 }
4909
4910 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4911                                 unsigned long iova, size_t size)
4912 {
4913         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4914         struct page *freelist = NULL;
4915         struct intel_iommu *iommu;
4916         unsigned long start_pfn, last_pfn;
4917         unsigned int npages;
4918         int iommu_id, level = 0;
4919
4920         /* Cope with horrid API which requires us to unmap more than the
4921            size argument if it happens to be a large-page mapping. */
4922         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4923
4924         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4925                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
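        /* With the 9-bit level stride this means 2MiB for a level-2 (large
           page) PTE and 1GiB for a level-3 PTE. */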
4926
4927         start_pfn = iova >> VTD_PAGE_SHIFT;
4928         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4929
4930         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4931
4932         npages = last_pfn - start_pfn + 1;
4933
4934         for_each_domain_iommu(iommu_id, dmar_domain) {
4935                 iommu = g_iommus[iommu_id];
4936
4937                 iommu_flush_iotlb_psi(iommu, dmar_domain,
4938                                       start_pfn, npages, !freelist, 0);
4939         }
4940
4941         dma_free_pagelist(freelist);
4942
4943         if (dmar_domain->max_addr == iova + size)
4944                 dmar_domain->max_addr = iova;
4945
4946         return size;
4947 }
4948
4949 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4950                                             dma_addr_t iova)
4951 {
4952         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4953         struct dma_pte *pte;
4954         int level = 0;
4955         u64 phys = 0;
4956
4957         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4958         if (pte)
4959                 phys = dma_pte_addr(pte);
4960
4961         return phys;
4962 }
4963
4964 static bool intel_iommu_capable(enum iommu_cap cap)
4965 {
4966         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4967                 return domain_update_iommu_snooping(NULL) == 1;
4968         if (cap == IOMMU_CAP_INTR_REMAP)
4969                 return irq_remapping_enabled == 1;
4970
4971         return false;
4972 }
4973
4974 static int intel_iommu_add_device(struct device *dev)
4975 {
4976         struct intel_iommu *iommu;
4977         struct iommu_group *group;
4978         u8 bus, devfn;
4979
4980         iommu = device_to_iommu(dev, &bus, &devfn);
4981         if (!iommu)
4982                 return -ENODEV;
4983
4984         iommu_device_link(iommu->iommu_dev, dev);
4985
4986         group = iommu_group_get_for_dev(dev);
4987
4988         if (IS_ERR(group))
4989                 return PTR_ERR(group);
4990
4991         iommu_group_put(group);
4992         return 0;
4993 }
4994
4995 static void intel_iommu_remove_device(struct device *dev)
4996 {
4997         struct intel_iommu *iommu;
4998         u8 bus, devfn;
4999
5000         iommu = device_to_iommu(dev, &bus, &devfn);
5001         if (!iommu)
5002                 return;
5003
5004         iommu_group_remove_device(dev);
5005
5006         iommu_device_unlink(iommu->iommu_dev, dev);
5007 }
5008
5009 #ifdef CONFIG_INTEL_IOMMU_SVM
5010 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5011 {
5012         struct device_domain_info *info;
5013         struct context_entry *context;
5014         struct dmar_domain *domain;
5015         unsigned long flags;
5016         u64 ctx_lo;
5017         int ret;
5018
5019         domain = get_valid_domain_for_dev(sdev->dev);
5020         if (!domain)
5021                 return -EINVAL;
5022
5023         spin_lock_irqsave(&device_domain_lock, flags);
5024         spin_lock(&iommu->lock);
5025
5026         ret = -EINVAL;
5027         info = sdev->dev->archdata.iommu;
5028         if (!info || !info->pasid_supported)
5029                 goto out;
5030
5031         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5032         if (WARN_ON(!context))
5033                 goto out;
5034
5035         ctx_lo = context[0].lo;
5036
5037         sdev->did = domain->iommu_did[iommu->seq_id];
5038         sdev->sid = PCI_DEVID(info->bus, info->devfn);
5039
5040         if (!(ctx_lo & CONTEXT_PASIDE)) {
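                /* context[1] is the upper 128 bits of the 256-bit extended
                   context entry, where the PASID state table and PASID table
                   pointers programmed below live. */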
5041                 context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5042                 context[1].lo = (u64)virt_to_phys(iommu->pasid_table) | ecap_pss(iommu->ecap);
5043                 wmb();
5044                 /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5045                  * extended to permit requests-with-PASID if the PASIDE bit
5046                  * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
5047                  * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
5048                  * are unconditionally blocked. Which makes less sense.
5049                  * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5050                  * "guest mode" translation types depending on whether ATS
5051                  * is available or not. Annoyingly, we can't use the new
5052                  * modes *unless* PASIDE is set. */
5053                 if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5054                         ctx_lo &= ~CONTEXT_TT_MASK;
5055                         if (info->ats_supported)
5056                                 ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5057                         else
5058                                 ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5059                 }
5060                 ctx_lo |= CONTEXT_PASIDE;
5061                 if (iommu->pasid_state_table)
5062                         ctx_lo |= CONTEXT_DINVE;
5063                 if (info->pri_supported)
5064                         ctx_lo |= CONTEXT_PRS;
5065                 context[0].lo = ctx_lo;
5066                 wmb();
5067                 iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5068                                            DMA_CCMD_MASK_NOBIT,
5069                                            DMA_CCMD_DEVICE_INVL);
5070         }
5071
5072         /* Enable PASID support in the device, if it wasn't already */
5073         if (!info->pasid_enabled)
5074                 iommu_enable_dev_iotlb(info);
5075
5076         if (info->ats_enabled) {
5077                 sdev->dev_iotlb = 1;
5078                 sdev->qdep = info->ats_qdep;
5079                 if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5080                         sdev->qdep = 0;
5081         }
5082         ret = 0;
5083
5084  out:
5085         spin_unlock(&iommu->lock);
5086         spin_unlock_irqrestore(&device_domain_lock, flags);
5087
5088         return ret;
5089 }
5090
5091 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5092 {
5093         struct intel_iommu *iommu;
5094         u8 bus, devfn;
5095
5096         if (iommu_dummy(dev)) {
5097                 dev_warn(dev,
5098                          "No IOMMU translation for device; cannot enable SVM\n");
5099                 return NULL;
5100         }
5101
5102         iommu = device_to_iommu(dev, &bus, &devfn);
5103         if (!iommu) {
5104                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5105                 return NULL;
5106         }
5107
5108         if (!iommu->pasid_table) {
5109                 dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
5110                 return NULL;
5111         }
5112
5113         return iommu;
5114 }
5115 #endif /* CONFIG_INTEL_IOMMU_SVM */
5116
5117 static const struct iommu_ops intel_iommu_ops = {
5118         .capable        = intel_iommu_capable,
5119         .domain_alloc   = intel_iommu_domain_alloc,
5120         .domain_free    = intel_iommu_domain_free,
5121         .attach_dev     = intel_iommu_attach_device,
5122         .detach_dev     = intel_iommu_detach_device,
5123         .map            = intel_iommu_map,
5124         .unmap          = intel_iommu_unmap,
5125         .map_sg         = default_iommu_map_sg,
5126         .iova_to_phys   = intel_iommu_iova_to_phys,
5127         .add_device     = intel_iommu_add_device,
5128         .remove_device  = intel_iommu_remove_device,
5129         .device_group   = pci_device_group,
5130         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
5131 };
5132
5133 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5134 {
5135         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5136         pr_info("Disabling IOMMU for graphics on this chipset\n");
5137         dmar_map_gfx = 0;
5138 }
5139
5140 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5141 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5142 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5143 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5144 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5145 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5146 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5147
5148 static void quirk_iommu_rwbf(struct pci_dev *dev)
5149 {
5150         /*
5151          * Mobile 4 Series Chipset neglects to set RWBF capability,
5152          * but needs it. Same seems to hold for the desktop versions.
5153          */
5154         pr_info("Forcing write-buffer flush capability\n");
5155         rwbf_quirk = 1;
5156 }
5157
5158 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5159 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5160 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5161 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5162 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5163 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5164 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5165
5166 #define GGC 0x52
5167 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5168 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5169 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5170 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5171 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5172 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5173 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5174 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5175
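/*
 * Per the masks above, bits 11:8 of the GGC config word encode the GTT
 * graphics memory size; only the encodings with bit 11 set (the *_VT
 * variants) indicate that the BIOS reserved space for a VT-d shadow GTT.
 */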
5176 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5177 {
5178         unsigned short ggc;
5179
5180         if (pci_read_config_word(dev, GGC, &ggc))
5181                 return;
5182
5183         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5184                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5185                 dmar_map_gfx = 0;
5186         } else if (dmar_map_gfx) {
5187                 /* we have to ensure the gfx device is idle before we flush */
5188                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5189                 intel_iommu_strict = 1;
5190         }
5191 }
5192 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5193 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5194 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5195 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5196
5197 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5198    ISOCH DMAR unit for the Azalia sound device, but not give it any
5199    TLB entries, which causes it to deadlock. Check for that.  We do
5200    this in a function called from init_dmars(), instead of in a PCI
5201    quirk, because we don't want to print the obnoxious "BIOS broken"
5202    message if VT-d is actually disabled.
5203 */
5204 static void __init check_tylersburg_isoch(void)
5205 {
5206         struct pci_dev *pdev;
5207         uint32_t vtisochctrl;
5208
5209         /* If there's no Azalia in the system anyway, forget it. */
5210         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5211         if (!pdev)
5212                 return;
5213         pci_dev_put(pdev);
5214
5215         /* System Management Registers. Might be hidden, in which case
5216            we can't do the sanity check. But that's OK, because the
5217            known-broken BIOSes _don't_ actually hide it, so far. */
5218         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5219         if (!pdev)
5220                 return;
5221
5222         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5223                 pci_dev_put(pdev);
5224                 return;
5225         }
5226
5227         pci_dev_put(pdev);
5228
5229         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5230         if (vtisochctrl & 1)
5231                 return;
5232
5233         /* Drop all bits other than the number of TLB entries */
5234         vtisochctrl &= 0x1c;
5235
5236         /* If we have the recommended number of TLB entries (16), fine. */
5237         if (vtisochctrl == 0x10)
5238                 return;
5239
5240         /* Zero TLB entries? You get to ride the short bus to school. */
5241         if (!vtisochctrl) {
5242                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5243                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5244                      dmi_get_system_info(DMI_BIOS_VENDOR),
5245                      dmi_get_system_info(DMI_BIOS_VERSION),
5246                      dmi_get_system_info(DMI_PRODUCT_VERSION));
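                /* Setting IDENTMAP_AZALIA makes the rest of this driver give
                   the Azalia device a static identity (1:1) mapping as a
                   workaround. */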
5247                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5248                 return;
5249         }
5250
5251         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5252                vtisochctrl);
5253 }