intel-iommu: move iommu_prepare_gfx_mapping() out of dma_remapping.h
drivers/pci/intel-iommu.c (firefly-linux-kernel-4.4.55.git)
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
40 #include "pci.h"
41
42 #define ROOT_SIZE               VTD_PAGE_SIZE
43 #define CONTEXT_SIZE            VTD_PAGE_SIZE
44
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
47
48 #define IOAPIC_RANGE_START      (0xfee00000)
49 #define IOAPIC_RANGE_END        (0xfeefffff)
50 #define IOVA_START_ADDR         (0x1000)
51
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
55
56 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
57 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
58 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
59
60 /*
61  * 0: Present
62  * 1-11: Reserved
63  * 12-63: Context Ptr (12 - (haw-1))
64  * 64-127: Reserved
65  */
66 struct root_entry {
67         u64     val;
68         u64     rsvd1;
69 };
70 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
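/*
 * Worked example (illustrative): with a 4KB VTD_PAGE_SIZE and a 16-byte
 * struct root_entry, ROOT_ENTRY_NR is 4096 / 16 = 256 -- one root entry
 * per PCI bus number.  Each present root entry points to a 4KB context
 * table of 256 entries indexed by devfn, which is how
 * device_to_context_entry() below turns a (bus, devfn) pair into a
 * context entry.
 */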
71 static inline bool root_present(struct root_entry *root)
72 {
73         return (root->val & 1);
74 }
75 static inline void set_root_present(struct root_entry *root)
76 {
77         root->val |= 1;
78 }
79 static inline void set_root_value(struct root_entry *root, unsigned long value)
80 {
81         root->val |= value & VTD_PAGE_MASK;
82 }
83
84 static inline struct context_entry *
85 get_context_addr_from_root(struct root_entry *root)
86 {
87         return (struct context_entry *)
88                 (root_present(root)?phys_to_virt(
89                 root->val & VTD_PAGE_MASK) :
90                 NULL);
91 }
92
93 /*
94  * low 64 bits:
95  * 0: present
96  * 1: fault processing disable
97  * 2-3: translation type
98  * 12-63: address space root
99  * high 64 bits:
100  * 0-2: address width
101  * 3-6: avail
102  * 8-23: domain id
103  */
104 struct context_entry {
105         u64 lo;
106         u64 hi;
107 };
108 #define context_present(c) ((c).lo & 1)
109 #define context_fault_disable(c) (((c).lo >> 1) & 1)
110 #define context_translation_type(c) (((c).lo >> 2) & 3)
111 #define context_address_root(c) ((c).lo & VTD_PAGE_MASK)
112 #define context_address_width(c) ((c).hi &  7)
113 #define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1))
114
115 #define context_set_present(c) do {(c).lo |= 1;} while (0)
116 #define context_set_fault_enable(c) \
117         do {(c).lo &= (((u64)-1) << 2) | 1;} while (0)
118 #define context_set_translation_type(c, val) \
119         do { \
120                 (c).lo &= (((u64)-1) << 4) | 3; \
121                 (c).lo |= ((val) & 3) << 2; \
122         } while (0)
123 #define CONTEXT_TT_MULTI_LEVEL 0
124 #define context_set_address_root(c, val) \
125         do {(c).lo |= (val) & VTD_PAGE_MASK; } while (0)
126 #define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0)
127 #define context_set_domain_id(c, val) \
128         do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0)
129 #define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while (0)
130
131 /*
132  * 0: readable
133  * 1: writable
134  * 2-6: reserved
135  * 7: super page
136  * 8-11: available
137  * 12-63: Host physical address
138  */
139 struct dma_pte {
140         u64 val;
141 };
142 #define dma_clear_pte(p)        do {(p).val = 0;} while (0)
143
144 #define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while (0)
145 #define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0)
146 #define dma_set_pte_prot(p, prot) \
147                 do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
148 #define dma_pte_addr(p) ((p).val & VTD_PAGE_MASK)
149 #define dma_set_pte_addr(p, addr) do {\
150                 (p).val |= ((addr) & VTD_PAGE_MASK); } while (0)
151 #define dma_pte_present(p) (((p).val & 3) != 0)
152
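/*
 * Illustrative sketch: example_set_rw_pte() is a hypothetical helper,
 * not part of the driver; it mirrors what domain_page_mapping() does
 * further down, composing a last-level PTE that maps one host page
 * read/write using the macros above.
 */
static inline void example_set_rw_pte(struct dma_pte *pte, u64 host_addr)
{
        dma_clear_pte(*pte);
        /* bits 12-63: host page frame; host_addr must be page aligned */
        dma_set_pte_addr(*pte, host_addr);
        /* bits 0-1: access permissions */
        dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
}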
153 struct dmar_domain {
154         int     id;                     /* domain id */
155         struct intel_iommu *iommu;      /* back pointer to owning iommu */
156
157         struct list_head devices;       /* all devices' list */
158         struct iova_domain iovad;       /* iova's that belong to this domain */
159
160         struct dma_pte  *pgd;           /* virtual address */
161         spinlock_t      mapping_lock;   /* page table lock */
162         int             gaw;            /* max guest address width */
163
164         /* adjusted guest address width, 0 is level 2 30-bit */
165         int             agaw;
166
167 #define DOMAIN_FLAG_MULTIPLE_DEVICES 1
168         int             flags;
169 };
170
171 /* PCI domain-device relationship */
172 struct device_domain_info {
173         struct list_head link;  /* link to domain siblings */
174         struct list_head global; /* link to global list */
175         u8 bus;                 /* PCI bus number */
176         u8 devfn;               /* PCI devfn number */
177         struct pci_dev *dev; /* NULL for a PCIe-to-PCI bridge */
178         struct dmar_domain *domain; /* pointer to domain */
179 };
180
181 static void flush_unmaps_timeout(unsigned long data);
182
183 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
184
185 #define HIGH_WATER_MARK 250
186 struct deferred_flush_tables {
187         int next;
188         struct iova *iova[HIGH_WATER_MARK];
189         struct dmar_domain *domain[HIGH_WATER_MARK];
190 };
191
192 static struct deferred_flush_tables *deferred_flush;
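/*
 * Rough summary, as the declarations here suggest: these tables implement
 * batched ("lazy") IOTLB invalidation.  Unmapped IOVAs are queued and
 * their IOTLB entries flushed together once a table fills up to
 * HIGH_WATER_MARK entries or unmap_timer fires, instead of flushing on
 * every unmap.  Booting with intel_iommu=strict (parsed in
 * intel_iommu_setup() below) disables the batching.
 */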
193
194 /* number of IOMMUs in the system */
195 static int g_num_of_iommus;
196
197 static DEFINE_SPINLOCK(async_umap_flush_lock);
198 static LIST_HEAD(unmaps_to_do);
199
200 static int timer_on;
201 static long list_size;
202
203 static void domain_remove_dev_info(struct dmar_domain *domain);
204
205 int dmar_disabled;
206 static int __initdata dmar_map_gfx = 1;
207 static int dmar_forcedac;
208 static int intel_iommu_strict;
209
210 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
211 static DEFINE_SPINLOCK(device_domain_lock);
212 static LIST_HEAD(device_domain_list);
213
214 static int __init intel_iommu_setup(char *str)
215 {
216         if (!str)
217                 return -EINVAL;
218         while (*str) {
219                 if (!strncmp(str, "off", 3)) {
220                         dmar_disabled = 1;
221                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
222                 } else if (!strncmp(str, "igfx_off", 8)) {
223                         dmar_map_gfx = 0;
224                         printk(KERN_INFO
225                                 "Intel-IOMMU: disable GFX device mapping\n");
226                 } else if (!strncmp(str, "forcedac", 8)) {
227                         printk(KERN_INFO
228                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
229                         dmar_forcedac = 1;
230                 } else if (!strncmp(str, "strict", 6)) {
231                         printk(KERN_INFO
232                                 "Intel-IOMMU: disable batched IOTLB flush\n");
233                         intel_iommu_strict = 1;
234                 }
235
236                 str += strcspn(str, ",");
237                 while (*str == ',')
238                         str++;
239         }
240         return 0;
241 }
242 __setup("intel_iommu=", intel_iommu_setup);
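/*
 * Usage example (illustrative): the options above are parsed from the
 * kernel command line as a comma-separated list, e.g.
 *
 *      intel_iommu=igfx_off,strict
 *
 * which keeps translation enabled but skips mapping the graphics device
 * and disables batched IOTLB flushing.
 */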
243
244 static struct kmem_cache *iommu_domain_cache;
245 static struct kmem_cache *iommu_devinfo_cache;
246 static struct kmem_cache *iommu_iova_cache;
247
248 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
249 {
250         unsigned int flags;
251         void *vaddr;
252
253         /* trying to avoid low memory issues */
254         flags = current->flags & PF_MEMALLOC;
255         current->flags |= PF_MEMALLOC;
256         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
257         current->flags &= (~PF_MEMALLOC | flags);
258         return vaddr;
259 }
260
261
262 static inline void *alloc_pgtable_page(void)
263 {
264         unsigned int flags;
265         void *vaddr;
266
267         /* trying to avoid low memory issues */
268         flags = current->flags & PF_MEMALLOC;
269         current->flags |= PF_MEMALLOC;
270         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
271         current->flags &= (~PF_MEMALLOC | flags);
272         return vaddr;
273 }
274
275 static inline void free_pgtable_page(void *vaddr)
276 {
277         free_page((unsigned long)vaddr);
278 }
279
280 static inline void *alloc_domain_mem(void)
281 {
282         return iommu_kmem_cache_alloc(iommu_domain_cache);
283 }
284
285 static void free_domain_mem(void *vaddr)
286 {
287         kmem_cache_free(iommu_domain_cache, vaddr);
288 }
289
290 static inline void * alloc_devinfo_mem(void)
291 {
292         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
293 }
294
295 static inline void free_devinfo_mem(void *vaddr)
296 {
297         kmem_cache_free(iommu_devinfo_cache, vaddr);
298 }
299
300 struct iova *alloc_iova_mem(void)
301 {
302         return iommu_kmem_cache_alloc(iommu_iova_cache);
303 }
304
305 void free_iova_mem(struct iova *iova)
306 {
307         kmem_cache_free(iommu_iova_cache, iova);
308 }
309
310 /* Gets context entry for a given bus and devfn */
311 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
312                 u8 bus, u8 devfn)
313 {
314         struct root_entry *root;
315         struct context_entry *context;
316         unsigned long phy_addr;
317         unsigned long flags;
318
319         spin_lock_irqsave(&iommu->lock, flags);
320         root = &iommu->root_entry[bus];
321         context = get_context_addr_from_root(root);
322         if (!context) {
323                 context = (struct context_entry *)alloc_pgtable_page();
324                 if (!context) {
325                         spin_unlock_irqrestore(&iommu->lock, flags);
326                         return NULL;
327                 }
328                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
329                 phy_addr = virt_to_phys((void *)context);
330                 set_root_value(root, phy_addr);
331                 set_root_present(root);
332                 __iommu_flush_cache(iommu, root, sizeof(*root));
333         }
334         spin_unlock_irqrestore(&iommu->lock, flags);
335         return &context[devfn];
336 }
337
338 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
339 {
340         struct root_entry *root;
341         struct context_entry *context;
342         int ret;
343         unsigned long flags;
344
345         spin_lock_irqsave(&iommu->lock, flags);
346         root = &iommu->root_entry[bus];
347         context = get_context_addr_from_root(root);
348         if (!context) {
349                 ret = 0;
350                 goto out;
351         }
352         ret = context_present(context[devfn]);
353 out:
354         spin_unlock_irqrestore(&iommu->lock, flags);
355         return ret;
356 }
357
358 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
359 {
360         struct root_entry *root;
361         struct context_entry *context;
362         unsigned long flags;
363
364         spin_lock_irqsave(&iommu->lock, flags);
365         root = &iommu->root_entry[bus];
366         context = get_context_addr_from_root(root);
367         if (context) {
368                 context_clear_entry(context[devfn]);
369                 __iommu_flush_cache(iommu, &context[devfn], \
370                         sizeof(*context));
371         }
372         spin_unlock_irqrestore(&iommu->lock, flags);
373 }
374
375 static void free_context_table(struct intel_iommu *iommu)
376 {
377         struct root_entry *root;
378         int i;
379         unsigned long flags;
380         struct context_entry *context;
381
382         spin_lock_irqsave(&iommu->lock, flags);
383         if (!iommu->root_entry) {
384                 goto out;
385         }
386         for (i = 0; i < ROOT_ENTRY_NR; i++) {
387                 root = &iommu->root_entry[i];
388                 context = get_context_addr_from_root(root);
389                 if (context)
390                         free_pgtable_page(context);
391         }
392         free_pgtable_page(iommu->root_entry);
393         iommu->root_entry = NULL;
394 out:
395         spin_unlock_irqrestore(&iommu->lock, flags);
396 }
397
398 /* page table handling */
399 #define LEVEL_STRIDE            (9)
400 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
401
402 static inline int agaw_to_level(int agaw)
403 {
404         return agaw + 2;
405 }
406
407 static inline int agaw_to_width(int agaw)
408 {
409         return 30 + agaw * LEVEL_STRIDE;
410
411 }
412
413 static inline int width_to_agaw(int width)
414 {
415         return (width - 30) / LEVEL_STRIDE;
416 }
417
418 static inline unsigned int level_to_offset_bits(int level)
419 {
420         return (12 + (level - 1) * LEVEL_STRIDE);
421 }
422
423 static inline int address_level_offset(u64 addr, int level)
424 {
425         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
426 }
427
428 static inline u64 level_mask(int level)
429 {
430         return ((u64)-1 << level_to_offset_bits(level));
431 }
432
433 static inline u64 level_size(int level)
434 {
435         return ((u64)1 << level_to_offset_bits(level));
436 }
437
438 static inline u64 align_to_level(u64 addr, int level)
439 {
440         return ((addr + level_size(level) - 1) & level_mask(level));
441 }
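/*
 * Worked example (illustrative): with a 48-bit adjusted guest address
 * width, width_to_agaw(48) == 2 and agaw_to_level(2) == 4, i.e. a
 * four-level page table.  level_to_offset_bits(1) == 12, so level 1
 * covers 4KB pages, while each level-4 entry spans
 * level_size(4) == 1ULL << 39 bytes (512GB).
 */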
442
443 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
444 {
445         int addr_width = agaw_to_width(domain->agaw);
446         struct dma_pte *parent, *pte = NULL;
447         int level = agaw_to_level(domain->agaw);
448         int offset;
449         unsigned long flags;
450
451         BUG_ON(!domain->pgd);
452
453         addr &= (((u64)1) << addr_width) - 1;
454         parent = domain->pgd;
455
456         spin_lock_irqsave(&domain->mapping_lock, flags);
457         while (level > 0) {
458                 void *tmp_page;
459
460                 offset = address_level_offset(addr, level);
461                 pte = &parent[offset];
462                 if (level == 1)
463                         break;
464
465                 if (!dma_pte_present(*pte)) {
466                         tmp_page = alloc_pgtable_page();
467
468                         if (!tmp_page) {
469                                 spin_unlock_irqrestore(&domain->mapping_lock,
470                                         flags);
471                                 return NULL;
472                         }
473                         __iommu_flush_cache(domain->iommu, tmp_page,
474                                         PAGE_SIZE);
475                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
476                         /*
477                          * higher level tables always set r/w; the last
478                          * level page table controls read/write
479                          */
480                         dma_set_pte_readable(*pte);
481                         dma_set_pte_writable(*pte);
482                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
483                 }
484                 parent = phys_to_virt(dma_pte_addr(*pte));
485                 level--;
486         }
487
488         spin_unlock_irqrestore(&domain->mapping_lock, flags);
489         return pte;
490 }
491
492 /* return address's pte at specific level */
493 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
494                 int level)
495 {
496         struct dma_pte *parent, *pte = NULL;
497         int total = agaw_to_level(domain->agaw);
498         int offset;
499
500         parent = domain->pgd;
501         while (level <= total) {
502                 offset = address_level_offset(addr, total);
503                 pte = &parent[offset];
504                 if (level == total)
505                         return pte;
506
507                 if (!dma_pte_present(*pte))
508                         break;
509                 parent = phys_to_virt(dma_pte_addr(*pte));
510                 total--;
511         }
512         return NULL;
513 }
514
515 /* clear one page's page table */
516 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
517 {
518         struct dma_pte *pte = NULL;
519
520         /* get last level pte */
521         pte = dma_addr_level_pte(domain, addr, 1);
522
523         if (pte) {
524                 dma_clear_pte(*pte);
525                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
526         }
527 }
528
529 /* clear the last level pte; a TLB flush should follow */
530 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
531 {
532         int addr_width = agaw_to_width(domain->agaw);
533
534         start &= (((u64)1) << addr_width) - 1;
535         end &= (((u64)1) << addr_width) - 1;
536         /* in case it's a partial page */
537         start = PAGE_ALIGN(start);
538         end &= PAGE_MASK;
539
540         /* we don't need lock here, nobody else touches the iova range */
541         while (start < end) {
542                 dma_pte_clear_one(domain, start);
543                 start += VTD_PAGE_SIZE;
544         }
545 }
546
547 /* free page table pages. last level pte should already be cleared */
548 static void dma_pte_free_pagetable(struct dmar_domain *domain,
549         u64 start, u64 end)
550 {
551         int addr_width = agaw_to_width(domain->agaw);
552         struct dma_pte *pte;
553         int total = agaw_to_level(domain->agaw);
554         int level;
555         u64 tmp;
556
557         start &= (((u64)1) << addr_width) - 1;
558         end &= (((u64)1) << addr_width) - 1;
559
560         /* we don't need lock here, nobody else touches the iova range */
561         level = 2;
562         while (level <= total) {
563                 tmp = align_to_level(start, level);
564                 if (tmp >= end || (tmp + level_size(level) > end))
565                         return;
566
567                 while (tmp < end) {
568                         pte = dma_addr_level_pte(domain, tmp, level);
569                         if (pte) {
570                                 free_pgtable_page(
571                                         phys_to_virt(dma_pte_addr(*pte)));
572                                 dma_clear_pte(*pte);
573                                 __iommu_flush_cache(domain->iommu,
574                                                 pte, sizeof(*pte));
575                         }
576                         tmp += level_size(level);
577                 }
578                 level++;
579         }
580         /* free pgd */
581         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
582                 free_pgtable_page(domain->pgd);
583                 domain->pgd = NULL;
584         }
585 }
586
587 /* iommu handling */
588 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
589 {
590         struct root_entry *root;
591         unsigned long flags;
592
593         root = (struct root_entry *)alloc_pgtable_page();
594         if (!root)
595                 return -ENOMEM;
596
597         __iommu_flush_cache(iommu, root, ROOT_SIZE);
598
599         spin_lock_irqsave(&iommu->lock, flags);
600         iommu->root_entry = root;
601         spin_unlock_irqrestore(&iommu->lock, flags);
602
603         return 0;
604 }
605
606 static void iommu_set_root_entry(struct intel_iommu *iommu)
607 {
608         void *addr;
609         u32 cmd, sts;
610         unsigned long flag;
611
612         addr = iommu->root_entry;
613
614         spin_lock_irqsave(&iommu->register_lock, flag);
615         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
616
617         cmd = iommu->gcmd | DMA_GCMD_SRTP;
618         writel(cmd, iommu->reg + DMAR_GCMD_REG);
619
620         /* Make sure the hardware completes it */
621         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
622                 readl, (sts & DMA_GSTS_RTPS), sts);
623
624         spin_unlock_irqrestore(&iommu->register_lock, flag);
625 }
626
627 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
628 {
629         u32 val;
630         unsigned long flag;
631
632         if (!cap_rwbf(iommu->cap))
633                 return;
634         val = iommu->gcmd | DMA_GCMD_WBF;
635
636         spin_lock_irqsave(&iommu->register_lock, flag);
637         writel(val, iommu->reg + DMAR_GCMD_REG);
638
639         /* Make sure the hardware completes it */
640         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
641                         readl, (!(val & DMA_GSTS_WBFS)), val);
642
643         spin_unlock_irqrestore(&iommu->register_lock, flag);
644 }
645
646 /* the return value determines whether we need a write buffer flush */
647 static int __iommu_flush_context(struct intel_iommu *iommu,
648         u16 did, u16 source_id, u8 function_mask, u64 type,
649         int non_present_entry_flush)
650 {
651         u64 val = 0;
652         unsigned long flag;
653
654         /*
655          * In the non-present entry flush case: if the hardware does not
656          * cache non-present entries we do nothing; if it does, we flush
657          * the entries of domain 0 (the domain id used to tag any cached
658          * non-present entries)
659          */
660         if (non_present_entry_flush) {
661                 if (!cap_caching_mode(iommu->cap))
662                         return 1;
663                 else
664                         did = 0;
665         }
666
667         switch (type) {
668         case DMA_CCMD_GLOBAL_INVL:
669                 val = DMA_CCMD_GLOBAL_INVL;
670                 break;
671         case DMA_CCMD_DOMAIN_INVL:
672                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
673                 break;
674         case DMA_CCMD_DEVICE_INVL:
675                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
676                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
677                 break;
678         default:
679                 BUG();
680         }
681         val |= DMA_CCMD_ICC;
682
683         spin_lock_irqsave(&iommu->register_lock, flag);
684         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
685
686         /* Make sure the hardware completes it */
687         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
688                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
689
690         spin_unlock_irqrestore(&iommu->register_lock, flag);
691
692         /* flushing a context entry implicitly flushes the write buffer */
693         return 0;
694 }
695
696 /* the return value determines whether we need a write buffer flush */
697 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
698         u64 addr, unsigned int size_order, u64 type,
699         int non_present_entry_flush)
700 {
701         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
702         u64 val = 0, val_iva = 0;
703         unsigned long flag;
704
705         /*
706          * In the non-present entry flush case: if the hardware does not
707          * cache non-present entries we do nothing; if it does, we flush
708          * the entries of domain 0 (the domain id used to tag any cached
709          * non-present entries)
710          */
711         if (non_present_entry_flush) {
712                 if (!cap_caching_mode(iommu->cap))
713                         return 1;
714                 else
715                         did = 0;
716         }
717
718         switch (type) {
719         case DMA_TLB_GLOBAL_FLUSH:
720                 /* global flush doesn't need set IVA_REG */
721                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
722                 break;
723         case DMA_TLB_DSI_FLUSH:
724                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
725                 break;
726         case DMA_TLB_PSI_FLUSH:
727                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
728                 /* Note: always flush non-leaf currently */
729                 val_iva = size_order | addr;
730                 break;
731         default:
732                 BUG();
733         }
734         /* Note: set drain read/write */
735 #if 0
736         /*
737          * This is probably just extra caution; it looks like we can
738          * ignore it without any impact.
739          */
740         if (cap_read_drain(iommu->cap))
741                 val |= DMA_TLB_READ_DRAIN;
742 #endif
743         if (cap_write_drain(iommu->cap))
744                 val |= DMA_TLB_WRITE_DRAIN;
745
746         spin_lock_irqsave(&iommu->register_lock, flag);
747         /* Note: Only uses first TLB reg currently */
748         if (val_iva)
749                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
750         dmar_writeq(iommu->reg + tlb_offset + 8, val);
751
752         /* Make sure the hardware completes it */
753         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
754                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
755
756         spin_unlock_irqrestore(&iommu->register_lock, flag);
757
758         /* check IOTLB invalidation granularity */
759         if (DMA_TLB_IAIG(val) == 0)
760                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
761         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
762                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
763                         (unsigned long long)DMA_TLB_IIRG(type),
764                         (unsigned long long)DMA_TLB_IAIG(val));
765         /* flushing an iotlb entry implicitly flushes the write buffer */
766         return 0;
767 }
768
769 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
770         u64 addr, unsigned int pages, int non_present_entry_flush)
771 {
772         unsigned int mask;
773
774         BUG_ON(addr & (~VTD_PAGE_MASK));
775         BUG_ON(pages == 0);
776
777         /* Fallback to domain selective flush if no PSI support */
778         if (!cap_pgsel_inv(iommu->cap))
779                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
780                                                 DMA_TLB_DSI_FLUSH,
781                                                 non_present_entry_flush);
782
783         /*
784          * PSI requires the number of pages to be a power of two, and the
785          * base address to be naturally aligned to that size
786          */
787         mask = ilog2(__roundup_pow_of_two(pages));
788         /* Fallback to domain selective flush if size is too big */
789         if (mask > cap_max_amask_val(iommu->cap))
790                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
791                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
792
793         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
794                                         DMA_TLB_PSI_FLUSH,
795                                         non_present_entry_flush);
796 }
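/*
 * Worked example (illustrative): flushing 5 pages gives
 * mask = ilog2(__roundup_pow_of_two(5)) = ilog2(8) = 3, so the PSI
 * invalidation above covers 2^3 = 8 pages at the naturally aligned base
 * address; had the mask exceeded cap_max_amask_val(), the code would
 * have fallen back to a domain-selective flush.
 */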
797
798 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
799 {
800         u32 pmen;
801         unsigned long flags;
802
803         spin_lock_irqsave(&iommu->register_lock, flags);
804         pmen = readl(iommu->reg + DMAR_PMEN_REG);
805         pmen &= ~DMA_PMEN_EPM;
806         writel(pmen, iommu->reg + DMAR_PMEN_REG);
807
808         /* wait for the protected region status bit to clear */
809         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
810                 readl, !(pmen & DMA_PMEN_PRS), pmen);
811
812         spin_unlock_irqrestore(&iommu->register_lock, flags);
813 }
814
815 static int iommu_enable_translation(struct intel_iommu *iommu)
816 {
817         u32 sts;
818         unsigned long flags;
819
820         spin_lock_irqsave(&iommu->register_lock, flags);
821         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
822
823         /* Make sure the hardware completes it */
824         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
825                 readl, (sts & DMA_GSTS_TES), sts);
826
827         iommu->gcmd |= DMA_GCMD_TE;
828         spin_unlock_irqrestore(&iommu->register_lock, flags);
829         return 0;
830 }
831
832 static int iommu_disable_translation(struct intel_iommu *iommu)
833 {
834         u32 sts;
835         unsigned long flag;
836
837         spin_lock_irqsave(&iommu->register_lock, flag);
838         iommu->gcmd &= ~DMA_GCMD_TE;
839         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
840
841         /* Make sure the hardware completes it */
842         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
843                 readl, (!(sts & DMA_GSTS_TES)), sts);
844
845         spin_unlock_irqrestore(&iommu->register_lock, flag);
846         return 0;
847 }
848
849 /* iommu interrupt handling. Most of it is MSI-like. */
850
851 static const char *fault_reason_strings[] =
852 {
853         "Software",
854         "Present bit in root entry is clear",
855         "Present bit in context entry is clear",
856         "Invalid context entry",
857         "Access beyond MGAW",
858         "PTE Write access is not set",
859         "PTE Read access is not set",
860         "Next page table ptr is invalid",
861         "Root table address invalid",
862         "Context table ptr is invalid",
863         "non-zero reserved fields in RTP",
864         "non-zero reserved fields in CTP",
865         "non-zero reserved fields in PTE",
866 };
867 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
868
869 const char *dmar_get_fault_reason(u8 fault_reason)
870 {
871         if (fault_reason > MAX_FAULT_REASON_IDX)
872                 return "Unknown";
873         else
874                 return fault_reason_strings[fault_reason];
875 }
876
877 void dmar_msi_unmask(unsigned int irq)
878 {
879         struct intel_iommu *iommu = get_irq_data(irq);
880         unsigned long flag;
881
882         /* unmask it */
883         spin_lock_irqsave(&iommu->register_lock, flag);
884         writel(0, iommu->reg + DMAR_FECTL_REG);
885         /* Read a reg to force flush the post write */
886         readl(iommu->reg + DMAR_FECTL_REG);
887         spin_unlock_irqrestore(&iommu->register_lock, flag);
888 }
889
890 void dmar_msi_mask(unsigned int irq)
891 {
892         unsigned long flag;
893         struct intel_iommu *iommu = get_irq_data(irq);
894
895         /* mask it */
896         spin_lock_irqsave(&iommu->register_lock, flag);
897         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
898         /* Read a reg to force flush the post write */
899         readl(iommu->reg + DMAR_FECTL_REG);
900         spin_unlock_irqrestore(&iommu->register_lock, flag);
901 }
902
903 void dmar_msi_write(int irq, struct msi_msg *msg)
904 {
905         struct intel_iommu *iommu = get_irq_data(irq);
906         unsigned long flag;
907
908         spin_lock_irqsave(&iommu->register_lock, flag);
909         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
910         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
911         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
912         spin_unlock_irqrestore(&iommu->register_lock, flag);
913 }
914
915 void dmar_msi_read(int irq, struct msi_msg *msg)
916 {
917         struct intel_iommu *iommu = get_irq_data(irq);
918         unsigned long flag;
919
920         spin_lock_irqsave(&iommu->register_lock, flag);
921         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
922         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
923         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
924         spin_unlock_irqrestore(&iommu->register_lock, flag);
925 }
926
927 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
928                 u8 fault_reason, u16 source_id, unsigned long long addr)
929 {
930         const char *reason;
931
932         reason = dmar_get_fault_reason(fault_reason);
933
934         printk(KERN_ERR
935                 "DMAR:[%s] Request device [%02x:%02x.%d] "
936                 "fault addr %llx \n"
937                 "DMAR:[fault reason %02d] %s\n",
938                 (type ? "DMA Read" : "DMA Write"),
939                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
940                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
941         return 0;
942 }
943
944 #define PRIMARY_FAULT_REG_LEN (16)
945 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
946 {
947         struct intel_iommu *iommu = dev_id;
948         int reg, fault_index;
949         u32 fault_status;
950         unsigned long flag;
951
952         spin_lock_irqsave(&iommu->register_lock, flag);
953         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
954
955         /* TBD: ignore advanced fault log currently */
956         if (!(fault_status & DMA_FSTS_PPF))
957                 goto clear_overflow;
958
959         fault_index = dma_fsts_fault_record_index(fault_status);
960         reg = cap_fault_reg_offset(iommu->cap);
961         while (1) {
962                 u8 fault_reason;
963                 u16 source_id;
964                 u64 guest_addr;
965                 int type;
966                 u32 data;
967
968                 /* highest 32 bits */
969                 data = readl(iommu->reg + reg +
970                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
971                 if (!(data & DMA_FRCD_F))
972                         break;
973
974                 fault_reason = dma_frcd_fault_reason(data);
975                 type = dma_frcd_type(data);
976
977                 data = readl(iommu->reg + reg +
978                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
979                 source_id = dma_frcd_source_id(data);
980
981                 guest_addr = dmar_readq(iommu->reg + reg +
982                                 fault_index * PRIMARY_FAULT_REG_LEN);
983                 guest_addr = dma_frcd_page_addr(guest_addr);
984                 /* clear the fault */
985                 writel(DMA_FRCD_F, iommu->reg + reg +
986                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
987
988                 spin_unlock_irqrestore(&iommu->register_lock, flag);
989
990                 iommu_page_fault_do_one(iommu, type, fault_reason,
991                                 source_id, guest_addr);
992
993                 fault_index++;
994                 if (fault_index >= cap_num_fault_regs(iommu->cap))
995                         fault_index = 0;
996                 spin_lock_irqsave(&iommu->register_lock, flag);
997         }
998 clear_overflow:
999         /* clear primary fault overflow */
1000         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1001         if (fault_status & DMA_FSTS_PFO)
1002                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1003
1004         spin_unlock_irqrestore(&iommu->register_lock, flag);
1005         return IRQ_HANDLED;
1006 }
1007
1008 int dmar_set_interrupt(struct intel_iommu *iommu)
1009 {
1010         int irq, ret;
1011
1012         irq = create_irq();
1013         if (!irq) {
1014                 printk(KERN_ERR "IOMMU: no free vectors\n");
1015                 return -EINVAL;
1016         }
1017
1018         set_irq_data(irq, iommu);
1019         iommu->irq = irq;
1020
1021         ret = arch_setup_dmar_msi(irq);
1022         if (ret) {
1023                 set_irq_data(irq, NULL);
1024                 iommu->irq = 0;
1025                 destroy_irq(irq);
1026                 return ret;
1027         }
1028
1029         /* Make sure any pending faults are cleared */
1030         iommu_page_fault(irq, iommu);
1031
1032         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1033         if (ret)
1034                 printk(KERN_ERR "IOMMU: can't request irq\n");
1035         return ret;
1036 }
1037
1038 static int iommu_init_domains(struct intel_iommu *iommu)
1039 {
1040         unsigned long ndomains;
1041         unsigned long nlongs;
1042
1043         ndomains = cap_ndoms(iommu->cap);
1044         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1045         nlongs = BITS_TO_LONGS(ndomains);
1046
1047         /* TBD: there might be 64K domains,
1048          * consider a different allocation scheme for future chips
1049          */
1050         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1051         if (!iommu->domain_ids) {
1052                 printk(KERN_ERR "Allocating domain id array failed\n");
1053                 return -ENOMEM;
1054         }
1055         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1056                         GFP_KERNEL);
1057         if (!iommu->domains) {
1058                 printk(KERN_ERR "Allocating domain array failed\n");
1059                 kfree(iommu->domain_ids);
1060                 return -ENOMEM;
1061         }
1062
1063         spin_lock_init(&iommu->lock);
1064
1065         /*
1066          * If caching mode is set, then invalid translations are tagged
1067          * with domain id 0, hence we need to pre-allocate it.
1068          */
1069         if (cap_caching_mode(iommu->cap))
1070                 set_bit(0, iommu->domain_ids);
1071         return 0;
1072 }
1073
1074
1075 static void domain_exit(struct dmar_domain *domain);
1076
1077 void free_dmar_iommu(struct intel_iommu *iommu)
1078 {
1079         struct dmar_domain *domain;
1080         int i;
1081
1082         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1083         for (; i < cap_ndoms(iommu->cap); ) {
1084                 domain = iommu->domains[i];
1085                 clear_bit(i, iommu->domain_ids);
1086                 domain_exit(domain);
1087                 i = find_next_bit(iommu->domain_ids,
1088                         cap_ndoms(iommu->cap), i+1);
1089         }
1090
1091         if (iommu->gcmd & DMA_GCMD_TE)
1092                 iommu_disable_translation(iommu);
1093
1094         if (iommu->irq) {
1095                 set_irq_data(iommu->irq, NULL);
1096                 /* This will mask the irq */
1097                 free_irq(iommu->irq, iommu);
1098                 destroy_irq(iommu->irq);
1099         }
1100
1101         kfree(iommu->domains);
1102         kfree(iommu->domain_ids);
1103
1104         /* free context mapping */
1105         free_context_table(iommu);
1106 }
1107
1108 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1109 {
1110         unsigned long num;
1111         unsigned long ndomains;
1112         struct dmar_domain *domain;
1113         unsigned long flags;
1114
1115         domain = alloc_domain_mem();
1116         if (!domain)
1117                 return NULL;
1118
1119         ndomains = cap_ndoms(iommu->cap);
1120
1121         spin_lock_irqsave(&iommu->lock, flags);
1122         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1123         if (num >= ndomains) {
1124                 spin_unlock_irqrestore(&iommu->lock, flags);
1125                 free_domain_mem(domain);
1126                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1127                 return NULL;
1128         }
1129
1130         set_bit(num, iommu->domain_ids);
1131         domain->id = num;
1132         domain->iommu = iommu;
1133         iommu->domains[num] = domain;
1134         spin_unlock_irqrestore(&iommu->lock, flags);
1135
1136         return domain;
1137 }
1138
1139 static void iommu_free_domain(struct dmar_domain *domain)
1140 {
1141         unsigned long flags;
1142
1143         spin_lock_irqsave(&domain->iommu->lock, flags);
1144         clear_bit(domain->id, domain->iommu->domain_ids);
1145         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1146 }
1147
1148 static struct iova_domain reserved_iova_list;
1149 static struct lock_class_key reserved_alloc_key;
1150 static struct lock_class_key reserved_rbtree_key;
1151
1152 static void dmar_init_reserved_ranges(void)
1153 {
1154         struct pci_dev *pdev = NULL;
1155         struct iova *iova;
1156         int i;
1157         u64 addr, size;
1158
1159         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1160
1161         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1162                 &reserved_alloc_key);
1163         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1164                 &reserved_rbtree_key);
1165
1166         /* IOAPIC ranges shouldn't be accessed by DMA */
1167         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1168                 IOVA_PFN(IOAPIC_RANGE_END));
1169         if (!iova)
1170                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1171
1172         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1173         for_each_pci_dev(pdev) {
1174                 struct resource *r;
1175
1176                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1177                         r = &pdev->resource[i];
1178                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1179                                 continue;
1180                         addr = r->start;
1181                         addr &= PAGE_MASK;
1182                         size = r->end - addr;
1183                         size = PAGE_ALIGN(size);
1184                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1185                                 IOVA_PFN(size + addr) - 1);
1186                         if (!iova)
1187                                 printk(KERN_ERR "Reserve iova failed\n");
1188                 }
1189         }
1190
1191 }
1192
1193 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1194 {
1195         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1196 }
1197
1198 static inline int guestwidth_to_adjustwidth(int gaw)
1199 {
1200         int agaw;
1201         int r = (gaw - 12) % 9;
1202
1203         if (r == 0)
1204                 agaw = gaw;
1205         else
1206                 agaw = gaw + 9 - r;
1207         if (agaw > 64)
1208                 agaw = 64;
1209         return agaw;
1210 }
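/*
 * Worked example (illustrative): a 36-bit guest address width gives
 * r = (36 - 12) % 9 = 6, so it is rounded up to 36 + 9 - 6 = 39 bits,
 * the next width that fits a whole number of 9-bit page-table levels
 * above the 12-bit page offset; 48 already fits and stays 48.
 */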
1211
1212 static int domain_init(struct dmar_domain *domain, int guest_width)
1213 {
1214         struct intel_iommu *iommu;
1215         int adjust_width, agaw;
1216         unsigned long sagaw;
1217
1218         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1219         spin_lock_init(&domain->mapping_lock);
1220
1221         domain_reserve_special_ranges(domain);
1222
1223         /* calculate AGAW */
1224         iommu = domain->iommu;
1225         if (guest_width > cap_mgaw(iommu->cap))
1226                 guest_width = cap_mgaw(iommu->cap);
1227         domain->gaw = guest_width;
1228         adjust_width = guestwidth_to_adjustwidth(guest_width);
1229         agaw = width_to_agaw(adjust_width);
1230         sagaw = cap_sagaw(iommu->cap);
1231         if (!test_bit(agaw, &sagaw)) {
1232                 /* hardware doesn't support it, choose a bigger one */
1233                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1234                 agaw = find_next_bit(&sagaw, 5, agaw);
1235                 if (agaw >= 5)
1236                         return -ENODEV;
1237         }
1238         domain->agaw = agaw;
1239         INIT_LIST_HEAD(&domain->devices);
1240
1241         /* always allocate the top pgd */
1242         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1243         if (!domain->pgd)
1244                 return -ENOMEM;
1245         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1246         return 0;
1247 }
1248
1249 static void domain_exit(struct dmar_domain *domain)
1250 {
1251         u64 end;
1252
1253         /* Domain 0 is reserved, so don't process it */
1254         if (!domain)
1255                 return;
1256
1257         domain_remove_dev_info(domain);
1258         /* destroy iovas */
1259         put_iova_domain(&domain->iovad);
1260         end = DOMAIN_MAX_ADDR(domain->gaw);
1261         end = end & (~PAGE_MASK);
1262
1263         /* clear ptes */
1264         dma_pte_clear_range(domain, 0, end);
1265
1266         /* free page tables */
1267         dma_pte_free_pagetable(domain, 0, end);
1268
1269         iommu_free_domain(domain);
1270         free_domain_mem(domain);
1271 }
1272
1273 static int domain_context_mapping_one(struct dmar_domain *domain,
1274                 u8 bus, u8 devfn)
1275 {
1276         struct context_entry *context;
1277         struct intel_iommu *iommu = domain->iommu;
1278         unsigned long flags;
1279
1280         pr_debug("Set context mapping for %02x:%02x.%d\n",
1281                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1282         BUG_ON(!domain->pgd);
1283         context = device_to_context_entry(iommu, bus, devfn);
1284         if (!context)
1285                 return -ENOMEM;
1286         spin_lock_irqsave(&iommu->lock, flags);
1287         if (context_present(*context)) {
1288                 spin_unlock_irqrestore(&iommu->lock, flags);
1289                 return 0;
1290         }
1291
1292         context_set_domain_id(*context, domain->id);
1293         context_set_address_width(*context, domain->agaw);
1294         context_set_address_root(*context, virt_to_phys(domain->pgd));
1295         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1296         context_set_fault_enable(*context);
1297         context_set_present(*context);
1298         __iommu_flush_cache(iommu, context, sizeof(*context));
1299
1300         /* it's a non-present to present mapping */
1301         if (iommu->flush.flush_context(iommu, domain->id,
1302                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1303                 DMA_CCMD_DEVICE_INVL, 1))
1304                 iommu_flush_write_buffer(iommu);
1305         else
1306                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1307
1308         spin_unlock_irqrestore(&iommu->lock, flags);
1309         return 0;
1310 }
1311
1312 static int
1313 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1314 {
1315         int ret;
1316         struct pci_dev *tmp, *parent;
1317
1318         ret = domain_context_mapping_one(domain, pdev->bus->number,
1319                 pdev->devfn);
1320         if (ret)
1321                 return ret;
1322
1323         /* dependent device mapping */
1324         tmp = pci_find_upstream_pcie_bridge(pdev);
1325         if (!tmp)
1326                 return 0;
1327         /* Secondary interface's bus number and devfn 0 */
1328         parent = pdev->bus->self;
1329         while (parent != tmp) {
1330                 ret = domain_context_mapping_one(domain, parent->bus->number,
1331                         parent->devfn);
1332                 if (ret)
1333                         return ret;
1334                 parent = parent->bus->self;
1335         }
1336         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1337                 return domain_context_mapping_one(domain,
1338                         tmp->subordinate->number, 0);
1339         else /* this is a legacy PCI bridge */
1340                 return domain_context_mapping_one(domain,
1341                         tmp->bus->number, tmp->devfn);
1342 }
1343
1344 static int domain_context_mapped(struct dmar_domain *domain,
1345         struct pci_dev *pdev)
1346 {
1347         int ret;
1348         struct pci_dev *tmp, *parent;
1349
1350         ret = device_context_mapped(domain->iommu,
1351                 pdev->bus->number, pdev->devfn);
1352         if (!ret)
1353                 return ret;
1354         /* dependent device mapping */
1355         tmp = pci_find_upstream_pcie_bridge(pdev);
1356         if (!tmp)
1357                 return ret;
1358         /* Secondary interface's bus number and devfn 0 */
1359         parent = pdev->bus->self;
1360         while (parent != tmp) {
1361                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1362                         parent->devfn);
1363                 if (!ret)
1364                         return ret;
1365                 parent = parent->bus->self;
1366         }
1367         if (tmp->is_pcie)
1368                 return device_context_mapped(domain->iommu,
1369                         tmp->subordinate->number, 0);
1370         else
1371                 return device_context_mapped(domain->iommu,
1372                         tmp->bus->number, tmp->devfn);
1373 }
1374
1375 static int
1376 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1377                         u64 hpa, size_t size, int prot)
1378 {
1379         u64 start_pfn, end_pfn;
1380         struct dma_pte *pte;
1381         int index;
1382         int addr_width = agaw_to_width(domain->agaw);
1383
1384         hpa &= (((u64)1) << addr_width) - 1;
1385
1386         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1387                 return -EINVAL;
1388         iova &= PAGE_MASK;
1389         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1390         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1391         index = 0;
1392         while (start_pfn < end_pfn) {
1393                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1394                 if (!pte)
1395                         return -ENOMEM;
1396                 /* We don't need lock here, nobody else
1397                  * touches the iova range
1398                  */
1399                 BUG_ON(dma_pte_addr(*pte));
1400                 dma_set_pte_addr(*pte, start_pfn << VTD_PAGE_SHIFT);
1401                 dma_set_pte_prot(*pte, prot);
1402                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1403                 start_pfn++;
1404                 index++;
1405         }
1406         return 0;
1407 }
1408
1409 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1410 {
1411         clear_context_table(domain->iommu, bus, devfn);
1412         domain->iommu->flush.flush_context(domain->iommu, 0, 0, 0,
1413                                            DMA_CCMD_GLOBAL_INVL, 0);
1414         domain->iommu->flush.flush_iotlb(domain->iommu, 0, 0, 0,
1415                                          DMA_TLB_GLOBAL_FLUSH, 0);
1416 }
1417
1418 static void domain_remove_dev_info(struct dmar_domain *domain)
1419 {
1420         struct device_domain_info *info;
1421         unsigned long flags;
1422
1423         spin_lock_irqsave(&device_domain_lock, flags);
1424         while (!list_empty(&domain->devices)) {
1425                 info = list_entry(domain->devices.next,
1426                         struct device_domain_info, link);
1427                 list_del(&info->link);
1428                 list_del(&info->global);
1429                 if (info->dev)
1430                         info->dev->dev.archdata.iommu = NULL;
1431                 spin_unlock_irqrestore(&device_domain_lock, flags);
1432
1433                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1434                 free_devinfo_mem(info);
1435
1436                 spin_lock_irqsave(&device_domain_lock, flags);
1437         }
1438         spin_unlock_irqrestore(&device_domain_lock, flags);
1439 }
1440
1441 /*
1442  * find_domain
1443  * Note: struct pci_dev->dev.archdata.iommu stores the info
1444  */
1445 static struct dmar_domain *
1446 find_domain(struct pci_dev *pdev)
1447 {
1448         struct device_domain_info *info;
1449
1450         /* No lock here, assumes no domain exit in normal case */
1451         info = pdev->dev.archdata.iommu;
1452         if (info)
1453                 return info->domain;
1454         return NULL;
1455 }
1456
1457 /* domain is initialized */
1458 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1459 {
1460         struct dmar_domain *domain, *found = NULL;
1461         struct intel_iommu *iommu;
1462         struct dmar_drhd_unit *drhd;
1463         struct device_domain_info *info, *tmp;
1464         struct pci_dev *dev_tmp;
1465         unsigned long flags;
1466         int bus = 0, devfn = 0;
1467
1468         domain = find_domain(pdev);
1469         if (domain)
1470                 return domain;
1471
1472         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1473         if (dev_tmp) {
1474                 if (dev_tmp->is_pcie) {
1475                         bus = dev_tmp->subordinate->number;
1476                         devfn = 0;
1477                 } else {
1478                         bus = dev_tmp->bus->number;
1479                         devfn = dev_tmp->devfn;
1480                 }
1481                 spin_lock_irqsave(&device_domain_lock, flags);
1482                 list_for_each_entry(info, &device_domain_list, global) {
1483                         if (info->bus == bus && info->devfn == devfn) {
1484                                 found = info->domain;
1485                                 break;
1486                         }
1487                 }
1488                 spin_unlock_irqrestore(&device_domain_lock, flags);
1489                 /* the pcie-pci bridge already has a domain, use it */
1490                 if (found) {
1491                         domain = found;
1492                         goto found_domain;
1493                 }
1494         }
1495
1496         /* Allocate new domain for the device */
1497         drhd = dmar_find_matched_drhd_unit(pdev);
1498         if (!drhd) {
1499                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1500                         pci_name(pdev));
1501                 return NULL;
1502         }
1503         iommu = drhd->iommu;
1504
1505         domain = iommu_alloc_domain(iommu);
1506         if (!domain)
1507                 goto error;
1508
1509         if (domain_init(domain, gaw)) {
1510                 domain_exit(domain);
1511                 goto error;
1512         }
1513
1514         /* register pcie-to-pci device */
1515         if (dev_tmp) {
1516                 info = alloc_devinfo_mem();
1517                 if (!info) {
1518                         domain_exit(domain);
1519                         goto error;
1520                 }
1521                 info->bus = bus;
1522                 info->devfn = devfn;
1523                 info->dev = NULL;
1524                 info->domain = domain;
1525                 /* This domain is shared by devices under p2p bridge */
1526                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1527
1528                 /* the pcie-to-pci bridge already has a domain, use it */
1529                 found = NULL;
1530                 spin_lock_irqsave(&device_domain_lock, flags);
1531                 list_for_each_entry(tmp, &device_domain_list, global) {
1532                         if (tmp->bus == bus && tmp->devfn == devfn) {
1533                                 found = tmp->domain;
1534                                 break;
1535                         }
1536                 }
1537                 if (found) {
1538                         free_devinfo_mem(info);
1539                         domain_exit(domain);
1540                         domain = found;
1541                 } else {
1542                         list_add(&info->link, &domain->devices);
1543                         list_add(&info->global, &device_domain_list);
1544                 }
1545                 spin_unlock_irqrestore(&device_domain_lock, flags);
1546         }
1547
1548 found_domain:
1549         info = alloc_devinfo_mem();
1550         if (!info)
1551                 goto error;
1552         info->bus = pdev->bus->number;
1553         info->devfn = pdev->devfn;
1554         info->dev = pdev;
1555         info->domain = domain;
1556         spin_lock_irqsave(&device_domain_lock, flags);
1557         /* somebody else raced us and already attached a domain */
1558         found = find_domain(pdev);
1559         if (found != NULL) {
1560                 spin_unlock_irqrestore(&device_domain_lock, flags);
1561                 if (found != domain) {
1562                         domain_exit(domain);
1563                         domain = found;
1564                 }
1565                 free_devinfo_mem(info);
1566                 return domain;
1567         }
1568         list_add(&info->link, &domain->devices);
1569         list_add(&info->global, &device_domain_list);
1570         pdev->dev.archdata.iommu = info;
1571         spin_unlock_irqrestore(&device_domain_lock, flags);
1572         return domain;
1573 error:
1574         /* recheck it here, maybe others set it */
1575         return find_domain(pdev);
1576 }
1577
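/*
 * Identity-map the physical range [start, end) for pdev: find or create
 * the device's domain, reserve the matching IOVA range, install
 * read/write PTEs and set up the context entry.
 */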
1578 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1579                                       unsigned long long start,
1580                                       unsigned long long end)
1581 {
1582         struct dmar_domain *domain;
1583         unsigned long size;
1584         unsigned long long base;
1585         int ret;
1586
1587         printk(KERN_INFO
1588                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1589                 pci_name(pdev), start, end);
1590         /* page table init */
1591         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1592         if (!domain)
1593                 return -ENOMEM;
1594
1595         /* The address might not be aligned */
1596         base = start & PAGE_MASK;
1597         size = end - base;
1598         size = PAGE_ALIGN(size);
1599         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1600                         IOVA_PFN(base + size) - 1)) {
1601                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1602                 ret = -ENOMEM;
1603                 goto error;
1604         }
1605
1606         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1607                 size, base, pci_name(pdev));
1608         /*
1609          * RMRR range might have overlap with physical memory range,
1610          * clear it first
1611          */
1612         dma_pte_clear_range(domain, base, base + size);
1613
1614         ret = domain_page_mapping(domain, base, base, size,
1615                 DMA_PTE_READ|DMA_PTE_WRITE);
1616         if (ret)
1617                 goto error;
1618
1619         /* context entry init */
1620         ret = domain_context_mapping(domain, pdev);
1621         if (!ret)
1622                 return 0;
1623 error:
1624         domain_exit(domain);
1625         return ret;
1626
1627 }
1628
1629 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1630         struct pci_dev *pdev)
1631 {
1632         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1633                 return 0;
1634         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1635                 rmrr->end_address + 1);
1636 }
1637
1638 #ifdef CONFIG_DMAR_GFX_WA
1639 struct iommu_prepare_data {
1640         struct pci_dev *pdev;
1641         int ret;
1642 };
1643
1644 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1645                                          unsigned long end_pfn, void *datax)
1646 {
1647         struct iommu_prepare_data *data;
1648
1649         data = (struct iommu_prepare_data *)datax;
1650
1651         data->ret = iommu_prepare_identity_map(data->pdev,
1652                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1653         return data->ret;
1654
1655 }
1656
1657 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1658 {
1659         int nid;
1660         struct iommu_prepare_data data;
1661
1662         data.pdev = pdev;
1663         data.ret = 0;
1664
1665         for_each_online_node(nid) {
1666                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1667                 if (data.ret)
1668                         return data.ret;
1669         }
1670         return data.ret;
1671 }
1672
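/*
 * Graphics workaround: give each graphics device that is not already
 * bypassed a 1:1 mapping of every active memory region, so graphics DMA
 * to system memory keeps working with translation enabled.
 */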
1673 static void __init iommu_prepare_gfx_mapping(void)
1674 {
1675         struct pci_dev *pdev = NULL;
1676         int ret;
1677
1678         for_each_pci_dev(pdev) {
1679                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1680                                 !IS_GFX_DEVICE(pdev))
1681                         continue;
1682                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1683                         pci_name(pdev));
1684                 ret = iommu_prepare_with_active_regions(pdev);
1685                 if (ret)
1686                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1687         }
1688 }
1689 #else /* !CONFIG_DMAR_GFX_WA */
1690 static inline void iommu_prepare_gfx_mapping(void)
1691 {
1692         return;
1693 }
1694 #endif
1695
1696 #ifdef CONFIG_DMAR_FLOPPY_WA
1697 static inline void iommu_prepare_isa(void)
1698 {
1699         struct pci_dev *pdev;
1700         int ret;
1701
1702         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1703         if (!pdev)
1704                 return;
1705
1706         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1707         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1708
1709         if (ret)
1710                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1711                         "floppy might not work\n");
1712
1713 }
1714 #else
1715 static inline void iommu_prepare_isa(void)
1716 {
1717         return;
1718 }
1719 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1720
1721 static int __init init_dmars(void)
1722 {
1723         struct dmar_drhd_unit *drhd;
1724         struct dmar_rmrr_unit *rmrr;
1725         struct pci_dev *pdev;
1726         struct intel_iommu *iommu;
1727         int i, ret, unit = 0;
1728
1729         /*
1730          * for each drhd
1731          *    allocate root
1732          *    initialize and program root entry to not present
1733          * endfor
1734          */
1735         for_each_drhd_unit(drhd) {
1736                 g_num_of_iommus++;
1737                 /*
1738                  * No lock needed: this is only incremented in the
1739                  * single-threaded kernel __init code path; all other
1740                  * accesses are read-only.
1741                  */
1742         }
1743
1744         deferred_flush = kzalloc(g_num_of_iommus *
1745                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1746         if (!deferred_flush) {
1747                 ret = -ENOMEM;
1748                 goto error;
1749         }
1750
1751         for_each_drhd_unit(drhd) {
1752                 if (drhd->ignored)
1753                         continue;
1754
1755                 iommu = drhd->iommu;
1756
1757                 ret = iommu_init_domains(iommu);
1758                 if (ret)
1759                         goto error;
1760
1761                 /*
1762                  * TBD:
1763                  * we could share the same root & context tables
1764                  * among all IOMMUs; needs to be split out later.
1765                  */
1766                 ret = iommu_alloc_root_entry(iommu);
1767                 if (ret) {
1768                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1769                         goto error;
1770                 }
1771         }
1772
1773         for_each_drhd_unit(drhd) {
1774                 if (drhd->ignored)
1775                         continue;
1776
1777                 iommu = drhd->iommu;
1778                 if (dmar_enable_qi(iommu)) {
1779                         /*
1780                          * Queued Invalidate not enabled, use Register Based
1781                          * Invalidate
1782                          */
1783                         iommu->flush.flush_context = __iommu_flush_context;
1784                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1785                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1786                                "invalidation\n",
1787                                (unsigned long long)drhd->reg_base_addr);
1788                 } else {
1789                         iommu->flush.flush_context = qi_flush_context;
1790                         iommu->flush.flush_iotlb = qi_flush_iotlb;
1791                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1792                                "invalidation\n",
1793                                (unsigned long long)drhd->reg_base_addr);
1794                 }
1795         }
1796
1797         /*
1798          * For each rmrr
1799          *   for each dev attached to rmrr
1800          *   do
1801          *     locate drhd for dev, alloc domain for dev
1802          *     allocate free domain
1803          *     allocate page table entries for rmrr
1804          *     if context not allocated for bus
1805          *           allocate and init context
1806          *           set present in root table for this bus
1807          *     init context with domain, translation etc
1808          *    endfor
1809          * endfor
1810          */
1811         for_each_rmrr_units(rmrr) {
1812                 for (i = 0; i < rmrr->devices_cnt; i++) {
1813                         pdev = rmrr->devices[i];
1814                         /* some BIOSes list non-existent devices in the DMAR table */
1815                         if (!pdev)
1816                                 continue;
1817                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1818                         if (ret)
1819                                 printk(KERN_ERR
1820                                  "IOMMU: mapping reserved region failed\n");
1821                 }
1822         }
1823
1824         iommu_prepare_gfx_mapping();
1825
1826         iommu_prepare_isa();
1827
1828         /*
1829          * for each drhd
1830          *   enable fault log
1831          *   global invalidate context cache
1832          *   global invalidate iotlb
1833          *   enable translation
1834          */
1835         for_each_drhd_unit(drhd) {
1836                 if (drhd->ignored)
1837                         continue;
1838                 iommu = drhd->iommu;
1839                 sprintf(iommu->name, "dmar%d", unit++);
1840
1841                 iommu_flush_write_buffer(iommu);
1842
1843                 ret = dmar_set_interrupt(iommu);
1844                 if (ret)
1845                         goto error;
1846
1847                 iommu_set_root_entry(iommu);
1848
1849                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
1850                                            0);
1851                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
1852                                          0);
1853                 iommu_disable_protect_mem_regions(iommu);
1854
1855                 ret = iommu_enable_translation(iommu);
1856                 if (ret)
1857                         goto error;
1858         }
1859
1860         return 0;
1861 error:
1862         for_each_drhd_unit(drhd) {
1863                 if (drhd->ignored)
1864                         continue;
1865                 iommu = drhd->iommu;
1866                 free_iommu(iommu);
1867         }
1868         return ret;
1869 }
1870
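/*
 * Size of the mapping rounded up to whole pages, including the offset of
 * host_addr within its first page.
 */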
1871 static inline u64 aligned_size(u64 host_addr, size_t size)
1872 {
1873         u64 addr;
1874         addr = (host_addr & (~PAGE_MASK)) + size;
1875         return PAGE_ALIGN(addr);
1876 }
1877
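/*
 * Allocate an IOVA range of 'size' bytes from the domain, constrained to
 * lie below both 'end' and the domain's maximum guest address width.
 */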
1878 struct iova *
1879 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1880 {
1881         struct iova *piova;
1882
1883         /* Make sure it's in range */
1884         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1885         if (!size || (IOVA_START_ADDR + size > end))
1886                 return NULL;
1887
1888         piova = alloc_iova(&domain->iovad,
1889                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
1890         return piova;
1891 }
1892
1893 static struct iova *
1894 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1895                    size_t size, u64 dma_mask)
1896 {
1897         struct pci_dev *pdev = to_pci_dev(dev);
1898         struct iova *iova = NULL;
1899
1900         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
1901                 iova = iommu_alloc_iova(domain, size, dma_mask);
1902         else {
1903                 /*
1904                  * First try to allocate an io virtual address in
1905                  * DMA_32BIT_MASK and if that fails then try allocating
1906                  * from higher range
1907                  */
1908                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1909                 if (!iova)
1910                         iova = iommu_alloc_iova(domain, size, dma_mask);
1911         }
1912
1913         if (!iova) {
1914                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
1915                 return NULL;
1916         }
1917
1918         return iova;
1919 }
1920
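/*
 * Return the domain for a device, allocating it and establishing the
 * context mapping on first use.
 */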
1921 static struct dmar_domain *
1922 get_valid_domain_for_dev(struct pci_dev *pdev)
1923 {
1924         struct dmar_domain *domain;
1925         int ret;
1926
1927         domain = get_domain_for_dev(pdev,
1928                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1929         if (!domain) {
1930                 printk(KERN_ERR
1931                         "Allocating domain for %s failed\n", pci_name(pdev));
1932                 return NULL;
1933         }
1934
1935         /* make sure context mapping is ok */
1936         if (unlikely(!domain_context_mapped(domain, pdev))) {
1937                 ret = domain_context_mapping(domain, pdev);
1938                 if (ret) {
1939                         printk(KERN_ERR
1940                                 "Domain context map for %s failed\n",
1941                                 pci_name(pdev));
1942                         return NULL;
1943                 }
1944         }
1945
1946         return domain;
1947 }
1948
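/*
 * Core single-mapping path: look up (or create) the device's domain,
 * allocate an IOVA for the page-aligned size, map it to the physical
 * range with the appropriate read/write protection, and flush the IOTLB
 * (or the write buffer) for the new entry.
 */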
1949 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
1950                                      size_t size, int dir, u64 dma_mask)
1951 {
1952         struct pci_dev *pdev = to_pci_dev(hwdev);
1953         struct dmar_domain *domain;
1954         phys_addr_t start_paddr;
1955         struct iova *iova;
1956         int prot = 0;
1957         int ret;
1958
1959         BUG_ON(dir == DMA_NONE);
1960         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1961                 return paddr;
1962
1963         domain = get_valid_domain_for_dev(pdev);
1964         if (!domain)
1965                 return 0;
1966
1967         size = aligned_size((u64)paddr, size);
1968
1969         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
1970         if (!iova)
1971                 goto error;
1972
1973         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
1974
1975         /*
1976          * Check if DMAR supports zero-length reads on write only
1977          * mappings..
1978          */
1979         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
1980                         !cap_zlr(domain->iommu->cap))
1981                 prot |= DMA_PTE_READ;
1982         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1983                 prot |= DMA_PTE_WRITE;
1984         /*
1985          * paddr - (paddr + size) might be partial page, we should map the whole
1986          * page.  Note: if two part of one page are separately mapped, we
1987          * might have two guest_addr mapping to the same host paddr, but this
1988          * is not a big problem
1989          */
1990         ret = domain_page_mapping(domain, start_paddr,
1991                 ((u64)paddr) & PAGE_MASK, size, prot);
1992         if (ret)
1993                 goto error;
1994
1995         /* it's a non-present to present mapping */
1996         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1997                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
1998         if (ret)
1999                 iommu_flush_write_buffer(domain->iommu);
2000
2001         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2002
2003 error:
2004         if (iova)
2005                 __free_iova(&domain->iovad, iova);
2006         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2007                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2008         return 0;
2009 }
2010
2011 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2012                             size_t size, int dir)
2013 {
2014         return __intel_map_single(hwdev, paddr, size, dir,
2015                                   to_pci_dev(hwdev)->dma_mask);
2016 }
2017
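/*
 * Flush all deferred unmaps: one global IOTLB flush per IOMMU, then free
 * the IOVAs queued against it.  Caller must hold async_umap_flush_lock.
 */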
2018 static void flush_unmaps(void)
2019 {
2020         int i, j;
2021
2022         timer_on = 0;
2023
2024         /* just flush them all */
2025         for (i = 0; i < g_num_of_iommus; i++) {
2026                 if (deferred_flush[i].next) {
2027                         struct intel_iommu *iommu =
2028                                 deferred_flush[i].domain[0]->iommu;
2029
2030                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2031                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2032                         for (j = 0; j < deferred_flush[i].next; j++) {
2033                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2034                                                 deferred_flush[i].iova[j]);
2035                         }
2036                         deferred_flush[i].next = 0;
2037                 }
2038         }
2039
2040         list_size = 0;
2041 }
2042
2043 static void flush_unmaps_timeout(unsigned long data)
2044 {
2045         unsigned long flags;
2046
2047         spin_lock_irqsave(&async_umap_flush_lock, flags);
2048         flush_unmaps();
2049         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2050 }
2051
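/*
 * Queue an IOVA for deferred freeing on its IOMMU's list; the batch is
 * flushed when HIGH_WATER_MARK entries accumulate or the 10ms unmap
 * timer fires.
 */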
2052 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2053 {
2054         unsigned long flags;
2055         int next, iommu_id;
2056
2057         spin_lock_irqsave(&async_umap_flush_lock, flags);
2058         if (list_size == HIGH_WATER_MARK)
2059                 flush_unmaps();
2060
2061         iommu_id = dom->iommu->seq_id;
2062
2063         next = deferred_flush[iommu_id].next;
2064         deferred_flush[iommu_id].domain[next] = dom;
2065         deferred_flush[iommu_id].iova[next] = iova;
2066         deferred_flush[iommu_id].next++;
2067
2068         if (!timer_on) {
2069                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2070                 timer_on = 1;
2071         }
2072         list_size++;
2073         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2074 }
2075
2076 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2077                         int dir)
2078 {
2079         struct pci_dev *pdev = to_pci_dev(dev);
2080         struct dmar_domain *domain;
2081         unsigned long start_addr;
2082         struct iova *iova;
2083
2084         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2085                 return;
2086         domain = find_domain(pdev);
2087         BUG_ON(!domain);
2088
2089         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2090         if (!iova)
2091                 return;
2092
2093         start_addr = iova->pfn_lo << PAGE_SHIFT;
2094         size = aligned_size((u64)dev_addr, size);
2095
2096         pr_debug("Device %s unmapping: %lx@%llx\n",
2097                 pci_name(pdev), size, (unsigned long long)start_addr);
2098
2099         /*  clear the whole page */
2100         dma_pte_clear_range(domain, start_addr, start_addr + size);
2101         /* free page tables */
2102         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2103         if (intel_iommu_strict) {
2104                 if (iommu_flush_iotlb_psi(domain->iommu,
2105                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2106                         iommu_flush_write_buffer(domain->iommu);
2107                 /* free iova */
2108                 __free_iova(&domain->iovad, iova);
2109         } else {
2110                 add_unmap(domain, iova);
2111                 /*
2112                  * queue up the release of the unmap; batching saves the
2113                  * ~1/6th of a cpu otherwise spent on per-mapping iotlb flushes
2114                  */
2115         }
2116 }
2117
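/*
 * Allocate zeroed pages and map them through __intel_map_single() using
 * the device's coherent DMA mask.
 */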
2118 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2119                            dma_addr_t *dma_handle, gfp_t flags)
2120 {
2121         void *vaddr;
2122         int order;
2123
2124         size = PAGE_ALIGN(size);
2125         order = get_order(size);
2126         flags &= ~(GFP_DMA | GFP_DMA32);
2127
2128         vaddr = (void *)__get_free_pages(flags, order);
2129         if (!vaddr)
2130                 return NULL;
2131         memset(vaddr, 0, size);
2132
2133         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2134                                          DMA_BIDIRECTIONAL,
2135                                          hwdev->coherent_dma_mask);
2136         if (*dma_handle)
2137                 return vaddr;
2138         free_pages((unsigned long)vaddr, order);
2139         return NULL;
2140 }
2141
2142 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2143                          dma_addr_t dma_handle)
2144 {
2145         int order;
2146
2147         size = PAGE_ALIGN(size);
2148         order = get_order(size);
2149
2150         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2151         free_pages((unsigned long)vaddr, order);
2152 }
2153
2154 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2155
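/*
 * Undo intel_map_sg(): recompute the total mapped size from the list,
 * clear and free the page tables for the IOVA range, flush, and release
 * the IOVA.
 */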
2156 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2157                     int nelems, int dir)
2158 {
2159         int i;
2160         struct pci_dev *pdev = to_pci_dev(hwdev);
2161         struct dmar_domain *domain;
2162         unsigned long start_addr;
2163         struct iova *iova;
2164         size_t size = 0;
2165         void *addr;
2166         struct scatterlist *sg;
2167
2168         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2169                 return;
2170
2171         domain = find_domain(pdev);
2172
2173         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2174         if (!iova)
2175                 return;
2176         for_each_sg(sglist, sg, nelems, i) {
2177                 addr = SG_ENT_VIRT_ADDRESS(sg);
2178                 size += aligned_size((u64)addr, sg->length);
2179         }
2180
2181         start_addr = iova->pfn_lo << PAGE_SHIFT;
2182
2183         /*  clear the whole page */
2184         dma_pte_clear_range(domain, start_addr, start_addr + size);
2185         /* free page tables */
2186         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2187
2188         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2189                         size >> VTD_PAGE_SHIFT, 0))
2190                 iommu_flush_write_buffer(domain->iommu);
2191
2192         /* free iova */
2193         __free_iova(&domain->iovad, iova);
2194 }
2195
2196 static int intel_nontranslate_map_sg(struct device *hddev,
2197         struct scatterlist *sglist, int nelems, int dir)
2198 {
2199         int i;
2200         struct scatterlist *sg;
2201
2202         for_each_sg(sglist, sg, nelems, i) {
2203                 BUG_ON(!sg_page(sg));
2204                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2205                 sg->dma_length = sg->length;
2206         }
2207         return nelems;
2208 }
2209
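/*
 * Map a scatterlist into one contiguous IOVA range: allocate an IOVA
 * large enough for all (page-aligned) entries, map each entry in turn,
 * and flush the IOTLB for the new mappings.  On failure everything
 * mapped so far is torn down and 0 is returned.
 */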
2210 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2211                  int dir)
2212 {
2213         void *addr;
2214         int i;
2215         struct pci_dev *pdev = to_pci_dev(hwdev);
2216         struct dmar_domain *domain;
2217         size_t size = 0;
2218         int prot = 0;
2219         size_t offset = 0;
2220         struct iova *iova = NULL;
2221         int ret;
2222         struct scatterlist *sg;
2223         unsigned long start_addr;
2224
2225         BUG_ON(dir == DMA_NONE);
2226         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2227                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2228
2229         domain = get_valid_domain_for_dev(pdev);
2230         if (!domain)
2231                 return 0;
2232
2233         for_each_sg(sglist, sg, nelems, i) {
2234                 addr = SG_ENT_VIRT_ADDRESS(sg);
2235                 addr = (void *)virt_to_phys(addr);
2236                 size += aligned_size((u64)addr, sg->length);
2237         }
2238
2239         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2240         if (!iova) {
2241                 sglist->dma_length = 0;
2242                 return 0;
2243         }
2244
2245         /*
2246          * Check if DMAR supports zero-length reads on write only
2247          * mappings..
2248          */
2249         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2250                         !cap_zlr(domain->iommu->cap))
2251                 prot |= DMA_PTE_READ;
2252         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2253                 prot |= DMA_PTE_WRITE;
2254
2255         start_addr = iova->pfn_lo << PAGE_SHIFT;
2256         offset = 0;
2257         for_each_sg(sglist, sg, nelems, i) {
2258                 addr = SG_ENT_VIRT_ADDRESS(sg);
2259                 addr = (void *)virt_to_phys(addr);
2260                 size = aligned_size((u64)addr, sg->length);
2261                 ret = domain_page_mapping(domain, start_addr + offset,
2262                         ((u64)addr) & PAGE_MASK,
2263                         size, prot);
2264                 if (ret) {
2265                         /*  clear the page */
2266                         dma_pte_clear_range(domain, start_addr,
2267                                   start_addr + offset);
2268                         /* free page tables */
2269                         dma_pte_free_pagetable(domain, start_addr,
2270                                   start_addr + offset);
2271                         /* free iova */
2272                         __free_iova(&domain->iovad, iova);
2273                         return 0;
2274                 }
2275                 sg->dma_address = start_addr + offset +
2276                                 ((u64)addr & (~PAGE_MASK));
2277                 sg->dma_length = sg->length;
2278                 offset += size;
2279         }
2280
2281         /* it's a non-present to present mapping */
2282         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2283                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2284                 iommu_flush_write_buffer(domain->iommu);
2285         return nelems;
2286 }
2287
2288 static struct dma_mapping_ops intel_dma_ops = {
2289         .alloc_coherent = intel_alloc_coherent,
2290         .free_coherent = intel_free_coherent,
2291         .map_single = intel_map_single,
2292         .unmap_single = intel_unmap_single,
2293         .map_sg = intel_map_sg,
2294         .unmap_sg = intel_unmap_sg,
2295 };
2296
2297 static inline int iommu_domain_cache_init(void)
2298 {
2299         int ret = 0;
2300
2301         iommu_domain_cache = kmem_cache_create("iommu_domain",
2302                                          sizeof(struct dmar_domain),
2303                                          0,
2304                                          SLAB_HWCACHE_ALIGN,
2305                                          NULL);
2306
2307         if (!iommu_domain_cache) {
2308                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2309                 ret = -ENOMEM;
2310         }
2311
2312         return ret;
2313 }
2314
2315 static inline int iommu_devinfo_cache_init(void)
2316 {
2317         int ret = 0;
2318
2319         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2320                                          sizeof(struct device_domain_info),
2321                                          0,
2322                                          SLAB_HWCACHE_ALIGN,
2323                                          NULL);
2324         if (!iommu_devinfo_cache) {
2325                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2326                 ret = -ENOMEM;
2327         }
2328
2329         return ret;
2330 }
2331
2332 static inline int iommu_iova_cache_init(void)
2333 {
2334         int ret = 0;
2335
2336         iommu_iova_cache = kmem_cache_create("iommu_iova",
2337                                          sizeof(struct iova),
2338                                          0,
2339                                          SLAB_HWCACHE_ALIGN,
2340                                          NULL);
2341         if (!iommu_iova_cache) {
2342                 printk(KERN_ERR "Couldn't create iova cache\n");
2343                 ret = -ENOMEM;
2344         }
2345
2346         return ret;
2347 }
2348
2349 static int __init iommu_init_mempool(void)
2350 {
2351         int ret;
2352         ret = iommu_iova_cache_init();
2353         if (ret)
2354                 return ret;
2355
2356         ret = iommu_domain_cache_init();
2357         if (ret)
2358                 goto domain_error;
2359
2360         ret = iommu_devinfo_cache_init();
2361         if (!ret)
2362                 return ret;
2363
2364         kmem_cache_destroy(iommu_domain_cache);
2365 domain_error:
2366         kmem_cache_destroy(iommu_iova_cache);
2367
2368         return -ENOMEM;
2369 }
2370
2371 static void __init iommu_exit_mempool(void)
2372 {
2373         kmem_cache_destroy(iommu_devinfo_cache);
2374         kmem_cache_destroy(iommu_domain_cache);
2375         kmem_cache_destroy(iommu_iova_cache);
2376
2377 }
2378
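/*
 * Mark DRHD units as ignored when they cover no PCI devices at all, or
 * (when gfx mapping is disabled) only graphics devices; devices behind
 * an ignored unit are flagged to bypass the IOMMU.
 */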
2379 static void __init init_no_remapping_devices(void)
2380 {
2381         struct dmar_drhd_unit *drhd;
2382
2383         for_each_drhd_unit(drhd) {
2384                 if (!drhd->include_all) {
2385                         int i;
2386                         for (i = 0; i < drhd->devices_cnt; i++)
2387                                 if (drhd->devices[i] != NULL)
2388                                         break;
2389                         /* ignore DMAR unit if no pci devices exist */
2390                         if (i == drhd->devices_cnt)
2391                                 drhd->ignored = 1;
2392                 }
2393         }
2394
2395         if (dmar_map_gfx)
2396                 return;
2397
2398         for_each_drhd_unit(drhd) {
2399                 int i;
2400                 if (drhd->ignored || drhd->include_all)
2401                         continue;
2402
2403                 for (i = 0; i < drhd->devices_cnt; i++)
2404                         if (drhd->devices[i] &&
2405                                 !IS_GFX_DEVICE(drhd->devices[i]))
2406                                 break;
2407
2408                 if (i < drhd->devices_cnt)
2409                         continue;
2410
2411                 /* bypass IOMMU if it is just for gfx devices */
2412                 drhd->ignored = 1;
2413                 for (i = 0; i < drhd->devices_cnt; i++) {
2414                         if (!drhd->devices[i])
2415                                 continue;
2416                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2417                 }
2418         }
2419 }
2420
2421 int __init intel_iommu_init(void)
2422 {
2423         int ret = 0;
2424
2425         if (dmar_table_init())
2426                 return  -ENODEV;
2427
2428         if (dmar_dev_scope_init())
2429                 return  -ENODEV;
2430
2431         /*
2432          * Check the need for DMA-remapping initialization now.
2433          * Above initialization will also be used by Interrupt-remapping.
2434          */
2435         if (no_iommu || swiotlb || dmar_disabled)
2436                 return -ENODEV;
2437
2438         iommu_init_mempool();
2439         dmar_init_reserved_ranges();
2440
2441         init_no_remapping_devices();
2442
2443         ret = init_dmars();
2444         if (ret) {
2445                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2446                 put_iova_domain(&reserved_iova_list);
2447                 iommu_exit_mempool();
2448                 return ret;
2449         }
2450         printk(KERN_INFO
2451         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2452
2453         init_timer(&unmap_timer);
2454         force_iommu = 1;
2455         dma_ops = &intel_dma_ops;
2456         return 0;
2457 }
2458
2459 void intel_iommu_domain_exit(struct dmar_domain *domain)
2460 {
2461         u64 end;
2462
2463         /* Domain 0 is reserved, so don't process it */
2464         if (!domain)
2465                 return;
2466
2467         end = DOMAIN_MAX_ADDR(domain->gaw);
2468         end = end & (~VTD_PAGE_MASK);
2469
2470         /* clear ptes */
2471         dma_pte_clear_range(domain, 0, end);
2472
2473         /* free page tables */
2474         dma_pte_free_pagetable(domain, 0, end);
2475
2476         iommu_free_domain(domain);
2477         free_domain_mem(domain);
2478 }
2479 EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2480
2481 struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2482 {
2483         struct dmar_drhd_unit *drhd;
2484         struct dmar_domain *domain;
2485         struct intel_iommu *iommu;
2486
2487         drhd = dmar_find_matched_drhd_unit(pdev);
2488         if (!drhd) {
2489                 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2490                 return NULL;
2491         }
2492
2493         iommu = drhd->iommu;
2494         if (!iommu) {
2495                 printk(KERN_ERR
2496                         "intel_iommu_domain_alloc: iommu == NULL\n");
2497                 return NULL;
2498         }
2499         domain = iommu_alloc_domain(iommu);
2500         if (!domain) {
2501                 printk(KERN_ERR
2502                         "intel_iommu_domain_alloc: domain == NULL\n");
2503                 return NULL;
2504         }
2505         if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2506                 printk(KERN_ERR
2507                         "intel_iommu_domain_alloc: domain_init() failed\n");
2508                 intel_iommu_domain_exit(domain);
2509                 return NULL;
2510         }
2511         return domain;
2512 }
2513 EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2514
2515 int intel_iommu_context_mapping(
2516         struct dmar_domain *domain, struct pci_dev *pdev)
2517 {
2518         int rc;
2519         rc = domain_context_mapping(domain, pdev);
2520         return rc;
2521 }
2522 EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2523
2524 int intel_iommu_page_mapping(
2525         struct dmar_domain *domain, dma_addr_t iova,
2526         u64 hpa, size_t size, int prot)
2527 {
2528         int rc;
2529         rc = domain_page_mapping(domain, iova, hpa, size, prot);
2530         return rc;
2531 }
2532 EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
2533
2534 void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2535 {
2536         detach_domain_for_dev(domain, bus, devfn);
2537 }
2538 EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2539
2540 struct dmar_domain *
2541 intel_iommu_find_domain(struct pci_dev *pdev)
2542 {
2543         return find_domain(pdev);
2544 }
2545 EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2546
2547 int intel_iommu_found(void)
2548 {
2549         return g_num_of_iommus;
2550 }
2551 EXPORT_SYMBOL_GPL(intel_iommu_found);
2552
2553 u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2554 {
2555         struct dma_pte *pte;
2556         u64 pfn;
2557
2558         pfn = 0;
2559         pte = addr_to_dma_pte(domain, iova);
2560
2561         if (pte)
2562                 pfn = dma_pte_addr(*pte);
2563
2564         return pfn >> VTD_PAGE_SHIFT;
2565 }
2566 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);