intel-iommu: move struct dmar_domain def out dma_remapping.h
[firefly-linux-kernel-4.4.55.git] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
40 #include "pci.h"
41
42 #define ROOT_SIZE               VTD_PAGE_SIZE
43 #define CONTEXT_SIZE            VTD_PAGE_SIZE
44
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
47
48 #define IOAPIC_RANGE_START      (0xfee00000)
49 #define IOAPIC_RANGE_END        (0xfeefffff)
50 #define IOVA_START_ADDR         (0x1000)
51
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
55
56 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
57 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
58 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
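
/*
 * For example, with 4KiB pages (PAGE_SHIFT == 12), IOVA_PFN(0xfee00000)
 * is 0xfee00 and DMA_32BIT_PFN is 0xfffff, the last page frame that is
 * still reachable with a 32-bit DMA address.
 */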
59
60 /*
61  * 0: Present
62  * 1-11: Reserved
63  * 12-63: Context Ptr (12 - (haw-1))
64  * 64-127: Reserved
65  */
66 struct root_entry {
67         u64     val;
68         u64     rsvd1;
69 };
70 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
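
/*
 * Worked example, assuming the usual 4KiB VTD_PAGE_SIZE: each root_entry
 * above is 16 bytes, so ROOT_ENTRY_NR = 4096 / 16 = 256, one root entry
 * per possible PCI bus number; device_to_context_entry() below indexes
 * iommu->root_entry[bus] accordingly.
 */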
71 static inline bool root_present(struct root_entry *root)
72 {
73         return (root->val & 1);
74 }
75 static inline void set_root_present(struct root_entry *root)
76 {
77         root->val |= 1;
78 }
79 static inline void set_root_value(struct root_entry *root, unsigned long value)
80 {
81         root->val |= value & VTD_PAGE_MASK;
82 }
83
84 static inline struct context_entry *
85 get_context_addr_from_root(struct root_entry *root)
86 {
87         return (struct context_entry *)
88                 (root_present(root)?phys_to_virt(
89                 root->val & VTD_PAGE_MASK) :
90                 NULL);
91 }
92
93 /*
94  * low 64 bits:
95  * 0: present
96  * 1: fault processing disable
97  * 2-3: translation type
98  * 12-63: address space root
99  * high 64 bits:
100  * 0-2: address width
101  * 3-6: avail
102  * 8-23: domain id
103  */
104 struct context_entry {
105         u64 lo;
106         u64 hi;
107 };
108 #define context_present(c) ((c).lo & 1)
109 #define context_fault_disable(c) (((c).lo >> 1) & 1)
110 #define context_translation_type(c) (((c).lo >> 2) & 3)
111 #define context_address_root(c) ((c).lo & VTD_PAGE_MASK)
112 #define context_address_width(c) ((c).hi &  7)
113 #define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1))
114
115 #define context_set_present(c) do {(c).lo |= 1;} while (0)
116 #define context_set_fault_enable(c) \
117         do {(c).lo &= (((u64)-1) << 2) | 1;} while (0)
118 #define context_set_translation_type(c, val) \
119         do { \
120                 (c).lo &= (((u64)-1) << 4) | 3; \
121                 (c).lo |= ((val) & 3) << 2; \
122         } while (0)
123 #define CONTEXT_TT_MULTI_LEVEL 0
124 #define context_set_address_root(c, val) \
125         do {(c).lo |= (val) & VTD_PAGE_MASK; } while (0)
126 #define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0)
127 #define context_set_domain_id(c, val) \
128         do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0)
129 #define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while (0)
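
/*
 * The setters above are used by domain_context_mapping_one() further
 * down: it programs the domain id and address width into the high word,
 * points the low word at the page-table root, selects
 * CONTEXT_TT_MULTI_LEVEL and finally sets the present bit.
 */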
130
131 /*
132  * 0: readable
133  * 1: writable
134  * 2-6: reserved
135  * 7: super page
136  * 8-11: available
137  * 12-63: Host physical address
138  */
139 struct dma_pte {
140         u64 val;
141 };
142 #define dma_clear_pte(p)        do {(p).val = 0;} while (0)
143
144 #define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while (0)
145 #define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0)
146 #define dma_set_pte_prot(p, prot) \
147                 do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
148 #define dma_pte_addr(p) ((p).val & VTD_PAGE_MASK)
149 #define dma_set_pte_addr(p, addr) do {\
150                 (p).val |= ((addr) & VTD_PAGE_MASK); } while (0)
151 #define dma_pte_present(p) (((p).val & 3) != 0)
152
153 struct dmar_domain {
154         int     id;                     /* domain id */
155         struct intel_iommu *iommu;      /* back pointer to owning iommu */
156
157         struct list_head devices;       /* all devices' list */
158         struct iova_domain iovad;       /* iova's that belong to this domain */
159
160         struct dma_pte  *pgd;           /* virtual address */
161         spinlock_t      mapping_lock;   /* page table lock */
162         int             gaw;            /* max guest address width */
163
164         /* adjusted guest address width; 0 means a 2-level, 30-bit page table */
165         int             agaw;
166
167 #define DOMAIN_FLAG_MULTIPLE_DEVICES 1
168         int             flags;
169 };
170
171 static void flush_unmaps_timeout(unsigned long data);
172
173 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
174
175 #define HIGH_WATER_MARK 250
176 struct deferred_flush_tables {
177         int next;
178         struct iova *iova[HIGH_WATER_MARK];
179         struct dmar_domain *domain[HIGH_WATER_MARK];
180 };
181
182 static struct deferred_flush_tables *deferred_flush;
183
184 /* number of registered intel_iommus */
185 static int g_num_of_iommus;
186
187 static DEFINE_SPINLOCK(async_umap_flush_lock);
188 static LIST_HEAD(unmaps_to_do);
189
190 static int timer_on;
191 static long list_size;
192
193 static void domain_remove_dev_info(struct dmar_domain *domain);
194
195 int dmar_disabled;
196 static int __initdata dmar_map_gfx = 1;
197 static int dmar_forcedac;
198 static int intel_iommu_strict;
199
200 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
201 static DEFINE_SPINLOCK(device_domain_lock);
202 static LIST_HEAD(device_domain_list);
203
204 static int __init intel_iommu_setup(char *str)
205 {
206         if (!str)
207                 return -EINVAL;
208         while (*str) {
209                 if (!strncmp(str, "off", 3)) {
210                         dmar_disabled = 1;
211                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
212                 } else if (!strncmp(str, "igfx_off", 8)) {
213                         dmar_map_gfx = 0;
214                         printk(KERN_INFO
215                                 "Intel-IOMMU: disable GFX device mapping\n");
216                 } else if (!strncmp(str, "forcedac", 8)) {
217                         printk(KERN_INFO
218                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
219                         dmar_forcedac = 1;
220                 } else if (!strncmp(str, "strict", 6)) {
221                         printk(KERN_INFO
222                                 "Intel-IOMMU: disable batched IOTLB flush\n");
223                         intel_iommu_strict = 1;
224                 }
225
226                 str += strcspn(str, ",");
227                 while (*str == ',')
228                         str++;
229         }
230         return 0;
231 }
232 __setup("intel_iommu=", intel_iommu_setup);
233
234 static struct kmem_cache *iommu_domain_cache;
235 static struct kmem_cache *iommu_devinfo_cache;
236 static struct kmem_cache *iommu_iova_cache;
237
238 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
239 {
240         unsigned int flags;
241         void *vaddr;
242
243         /* trying to avoid low memory issues */
244         flags = current->flags & PF_MEMALLOC;
245         current->flags |= PF_MEMALLOC;
246         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
247         current->flags &= (~PF_MEMALLOC | flags);
248         return vaddr;
249 }
250
251
252 static inline void *alloc_pgtable_page(void)
253 {
254         unsigned int flags;
255         void *vaddr;
256
257         /* trying to avoid low memory issues */
258         flags = current->flags & PF_MEMALLOC;
259         current->flags |= PF_MEMALLOC;
260         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
261         current->flags &= (~PF_MEMALLOC | flags);
262         return vaddr;
263 }
264
265 static inline void free_pgtable_page(void *vaddr)
266 {
267         free_page((unsigned long)vaddr);
268 }
269
270 static inline void *alloc_domain_mem(void)
271 {
272         return iommu_kmem_cache_alloc(iommu_domain_cache);
273 }
274
275 static void free_domain_mem(void *vaddr)
276 {
277         kmem_cache_free(iommu_domain_cache, vaddr);
278 }
279
280 static inline void * alloc_devinfo_mem(void)
281 {
282         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
283 }
284
285 static inline void free_devinfo_mem(void *vaddr)
286 {
287         kmem_cache_free(iommu_devinfo_cache, vaddr);
288 }
289
290 struct iova *alloc_iova_mem(void)
291 {
292         return iommu_kmem_cache_alloc(iommu_iova_cache);
293 }
294
295 void free_iova_mem(struct iova *iova)
296 {
297         kmem_cache_free(iommu_iova_cache, iova);
298 }
299
300 /* Gets context entry for a given bus and devfn */
301 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
302                 u8 bus, u8 devfn)
303 {
304         struct root_entry *root;
305         struct context_entry *context;
306         unsigned long phy_addr;
307         unsigned long flags;
308
309         spin_lock_irqsave(&iommu->lock, flags);
310         root = &iommu->root_entry[bus];
311         context = get_context_addr_from_root(root);
312         if (!context) {
313                 context = (struct context_entry *)alloc_pgtable_page();
314                 if (!context) {
315                         spin_unlock_irqrestore(&iommu->lock, flags);
316                         return NULL;
317                 }
318                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
319                 phy_addr = virt_to_phys((void *)context);
320                 set_root_value(root, phy_addr);
321                 set_root_present(root);
322                 __iommu_flush_cache(iommu, root, sizeof(*root));
323         }
324         spin_unlock_irqrestore(&iommu->lock, flags);
325         return &context[devfn];
326 }
327
328 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
329 {
330         struct root_entry *root;
331         struct context_entry *context;
332         int ret;
333         unsigned long flags;
334
335         spin_lock_irqsave(&iommu->lock, flags);
336         root = &iommu->root_entry[bus];
337         context = get_context_addr_from_root(root);
338         if (!context) {
339                 ret = 0;
340                 goto out;
341         }
342         ret = context_present(context[devfn]);
343 out:
344         spin_unlock_irqrestore(&iommu->lock, flags);
345         return ret;
346 }
347
348 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
349 {
350         struct root_entry *root;
351         struct context_entry *context;
352         unsigned long flags;
353
354         spin_lock_irqsave(&iommu->lock, flags);
355         root = &iommu->root_entry[bus];
356         context = get_context_addr_from_root(root);
357         if (context) {
358                 context_clear_entry(context[devfn]);
359                 __iommu_flush_cache(iommu, &context[devfn],
360                         sizeof(*context));
361         }
362         spin_unlock_irqrestore(&iommu->lock, flags);
363 }
364
365 static void free_context_table(struct intel_iommu *iommu)
366 {
367         struct root_entry *root;
368         int i;
369         unsigned long flags;
370         struct context_entry *context;
371
372         spin_lock_irqsave(&iommu->lock, flags);
373         if (!iommu->root_entry) {
374                 goto out;
375         }
376         for (i = 0; i < ROOT_ENTRY_NR; i++) {
377                 root = &iommu->root_entry[i];
378                 context = get_context_addr_from_root(root);
379                 if (context)
380                         free_pgtable_page(context);
381         }
382         free_pgtable_page(iommu->root_entry);
383         iommu->root_entry = NULL;
384 out:
385         spin_unlock_irqrestore(&iommu->lock, flags);
386 }
387
388 /* page table handling */
389 #define LEVEL_STRIDE            (9)
390 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
391
392 static inline int agaw_to_level(int agaw)
393 {
394         return agaw + 2;
395 }
396
397 static inline int agaw_to_width(int agaw)
398 {
399         return 30 + agaw * LEVEL_STRIDE;
400
401 }
402
403 static inline int width_to_agaw(int width)
404 {
405         return (width - 30) / LEVEL_STRIDE;
406 }
407
408 static inline unsigned int level_to_offset_bits(int level)
409 {
410         return (12 + (level - 1) * LEVEL_STRIDE);
411 }
412
413 static inline int address_level_offset(u64 addr, int level)
414 {
415         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
416 }
417
418 static inline u64 level_mask(int level)
419 {
420         return ((u64)-1 << level_to_offset_bits(level));
421 }
422
423 static inline u64 level_size(int level)
424 {
425         return ((u64)1 << level_to_offset_bits(level));
426 }
427
428 static inline u64 align_to_level(u64 addr, int level)
429 {
430         return ((addr + level_size(level) - 1) & level_mask(level));
431 }
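
/*
 * Worked example for the helpers above, assuming the default 48-bit
 * address width: width_to_agaw(48) = (48 - 30) / 9 = 2 and
 * agaw_to_level(2) = 4, i.e. a four-level page table.  level_size(1) is
 * 4KiB and level_size(2) is 2MiB, each level adding LEVEL_STRIDE (9)
 * bits of index.
 */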
432
433 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
434 {
435         int addr_width = agaw_to_width(domain->agaw);
436         struct dma_pte *parent, *pte = NULL;
437         int level = agaw_to_level(domain->agaw);
438         int offset;
439         unsigned long flags;
440
441         BUG_ON(!domain->pgd);
442
443         addr &= (((u64)1) << addr_width) - 1;
444         parent = domain->pgd;
445
446         spin_lock_irqsave(&domain->mapping_lock, flags);
447         while (level > 0) {
448                 void *tmp_page;
449
450                 offset = address_level_offset(addr, level);
451                 pte = &parent[offset];
452                 if (level == 1)
453                         break;
454
455                 if (!dma_pte_present(*pte)) {
456                         tmp_page = alloc_pgtable_page();
457
458                         if (!tmp_page) {
459                                 spin_unlock_irqrestore(&domain->mapping_lock,
460                                         flags);
461                                 return NULL;
462                         }
463                         __iommu_flush_cache(domain->iommu, tmp_page,
464                                         PAGE_SIZE);
465                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
466                         /*
467                          * higher-level tables always set r/w; the last-level
468                          * page table controls read/write
469                          */
470                         dma_set_pte_readable(*pte);
471                         dma_set_pte_writable(*pte);
472                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
473                 }
474                 parent = phys_to_virt(dma_pte_addr(*pte));
475                 level--;
476         }
477
478         spin_unlock_irqrestore(&domain->mapping_lock, flags);
479         return pte;
480 }
481
482 /* return address's pte at specific level */
483 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
484                 int level)
485 {
486         struct dma_pte *parent, *pte = NULL;
487         int total = agaw_to_level(domain->agaw);
488         int offset;
489
490         parent = domain->pgd;
491         while (level <= total) {
492                 offset = address_level_offset(addr, total);
493                 pte = &parent[offset];
494                 if (level == total)
495                         return pte;
496
497                 if (!dma_pte_present(*pte))
498                         break;
499                 parent = phys_to_virt(dma_pte_addr(*pte));
500                 total--;
501         }
502         return NULL;
503 }
504
505 /* clear one page's page table */
506 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
507 {
508         struct dma_pte *pte = NULL;
509
510         /* get last level pte */
511         pte = dma_addr_level_pte(domain, addr, 1);
512
513         if (pte) {
514                 dma_clear_pte(*pte);
515                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
516         }
517 }
518
519 /* clear last level pte; a TLB flush should follow */
520 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
521 {
522         int addr_width = agaw_to_width(domain->agaw);
523
524         start &= (((u64)1) << addr_width) - 1;
525         end &= (((u64)1) << addr_width) - 1;
526         /* in case it's a partial page */
527         start = PAGE_ALIGN(start);
528         end &= PAGE_MASK;
529
530         /* we don't need lock here, nobody else touches the iova range */
531         while (start < end) {
532                 dma_pte_clear_one(domain, start);
533                 start += VTD_PAGE_SIZE;
534         }
535 }
536
537 /* free page table pages. last level pte should already be cleared */
538 static void dma_pte_free_pagetable(struct dmar_domain *domain,
539         u64 start, u64 end)
540 {
541         int addr_width = agaw_to_width(domain->agaw);
542         struct dma_pte *pte;
543         int total = agaw_to_level(domain->agaw);
544         int level;
545         u64 tmp;
546
547         start &= (((u64)1) << addr_width) - 1;
548         end &= (((u64)1) << addr_width) - 1;
549
550         /* we don't need lock here, nobody else touches the iova range */
551         level = 2;
552         while (level <= total) {
553                 tmp = align_to_level(start, level);
554                 if (tmp >= end || (tmp + level_size(level) > end))
555                         return;
556
557                 while (tmp < end) {
558                         pte = dma_addr_level_pte(domain, tmp, level);
559                         if (pte) {
560                                 free_pgtable_page(
561                                         phys_to_virt(dma_pte_addr(*pte)));
562                                 dma_clear_pte(*pte);
563                                 __iommu_flush_cache(domain->iommu,
564                                                 pte, sizeof(*pte));
565                         }
566                         tmp += level_size(level);
567                 }
568                 level++;
569         }
570         /* free pgd */
571         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
572                 free_pgtable_page(domain->pgd);
573                 domain->pgd = NULL;
574         }
575 }
576
577 /* iommu handling */
578 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
579 {
580         struct root_entry *root;
581         unsigned long flags;
582
583         root = (struct root_entry *)alloc_pgtable_page();
584         if (!root)
585                 return -ENOMEM;
586
587         __iommu_flush_cache(iommu, root, ROOT_SIZE);
588
589         spin_lock_irqsave(&iommu->lock, flags);
590         iommu->root_entry = root;
591         spin_unlock_irqrestore(&iommu->lock, flags);
592
593         return 0;
594 }
595
596 static void iommu_set_root_entry(struct intel_iommu *iommu)
597 {
598         void *addr;
599         u32 cmd, sts;
600         unsigned long flag;
601
602         addr = iommu->root_entry;
603
604         spin_lock_irqsave(&iommu->register_lock, flag);
605         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
606
607         cmd = iommu->gcmd | DMA_GCMD_SRTP;
608         writel(cmd, iommu->reg + DMAR_GCMD_REG);
609
610         /* Make sure hardware completes it */
611         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
612                 readl, (sts & DMA_GSTS_RTPS), sts);
613
614         spin_unlock_irqrestore(&iommu->register_lock, flag);
615 }
616
617 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
618 {
619         u32 val;
620         unsigned long flag;
621
622         if (!cap_rwbf(iommu->cap))
623                 return;
624         val = iommu->gcmd | DMA_GCMD_WBF;
625
626         spin_lock_irqsave(&iommu->register_lock, flag);
627         writel(val, iommu->reg + DMAR_GCMD_REG);
628
629         /* Make sure hardware completes it */
630         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
631                         readl, (!(val & DMA_GSTS_WBFS)), val);
632
633         spin_unlock_irqrestore(&iommu->register_lock, flag);
634 }
635
636 /* the return value determines whether we need a write buffer flush */
637 static int __iommu_flush_context(struct intel_iommu *iommu,
638         u16 did, u16 source_id, u8 function_mask, u64 type,
639         int non_present_entry_flush)
640 {
641         u64 val = 0;
642         unsigned long flag;
643
644         /*
645          * In the non-present entry flush case: if the hardware doesn't cache
646          * non-present entries we do nothing, and if it does, we flush the
647          * entries of domain 0 (the domain id used to tag any cached
648          * non-present entries)
649          */
650         if (non_present_entry_flush) {
651                 if (!cap_caching_mode(iommu->cap))
652                         return 1;
653                 else
654                         did = 0;
655         }
656
657         switch (type) {
658         case DMA_CCMD_GLOBAL_INVL:
659                 val = DMA_CCMD_GLOBAL_INVL;
660                 break;
661         case DMA_CCMD_DOMAIN_INVL:
662                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
663                 break;
664         case DMA_CCMD_DEVICE_INVL:
665                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
666                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
667                 break;
668         default:
669                 BUG();
670         }
671         val |= DMA_CCMD_ICC;
672
673         spin_lock_irqsave(&iommu->register_lock, flag);
674         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
675
676         /* Make sure hardware completes it */
677         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
678                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
679
680         spin_unlock_irqrestore(&iommu->register_lock, flag);
681
682         /* flushing a context entry implicitly flushes the write buffer */
683         return 0;
684 }
685
686 /* the return value determines whether we need a write buffer flush */
687 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
688         u64 addr, unsigned int size_order, u64 type,
689         int non_present_entry_flush)
690 {
691         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
692         u64 val = 0, val_iva = 0;
693         unsigned long flag;
694
695         /*
696          * In the non-present entry flush case: if the hardware doesn't cache
697          * non-present entries we do nothing, and if it does, we flush the
698          * entries of domain 0 (the domain id used to tag any cached
699          * non-present entries)
700          */
701         if (non_present_entry_flush) {
702                 if (!cap_caching_mode(iommu->cap))
703                         return 1;
704                 else
705                         did = 0;
706         }
707
708         switch (type) {
709         case DMA_TLB_GLOBAL_FLUSH:
710                 /* a global flush doesn't need to set IVA_REG */
711                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
712                 break;
713         case DMA_TLB_DSI_FLUSH:
714                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
715                 break;
716         case DMA_TLB_PSI_FLUSH:
717                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
718                 /* Note: always flush non-leaf currently */
719                 val_iva = size_order | addr;
720                 break;
721         default:
722                 BUG();
723         }
724         /* Note: set drain read/write */
725 #if 0
726         /*
727          * This is probably just being extra cautious; it looks like we
728          * can omit it without any impact.
729          */
730         if (cap_read_drain(iommu->cap))
731                 val |= DMA_TLB_READ_DRAIN;
732 #endif
733         if (cap_write_drain(iommu->cap))
734                 val |= DMA_TLB_WRITE_DRAIN;
735
736         spin_lock_irqsave(&iommu->register_lock, flag);
737         /* Note: Only uses first TLB reg currently */
738         if (val_iva)
739                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
740         dmar_writeq(iommu->reg + tlb_offset + 8, val);
741
742         /* Make sure hardware completes it */
743         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
744                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
745
746         spin_unlock_irqrestore(&iommu->register_lock, flag);
747
748         /* check IOTLB invalidation granularity */
749         if (DMA_TLB_IAIG(val) == 0)
750                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
751         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
752                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
753                         (unsigned long long)DMA_TLB_IIRG(type),
754                         (unsigned long long)DMA_TLB_IAIG(val));
755         /* flushing an iotlb entry implicitly flushes the write buffer */
756         return 0;
757 }
758
759 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
760         u64 addr, unsigned int pages, int non_present_entry_flush)
761 {
762         unsigned int mask;
763
764         BUG_ON(addr & (~VTD_PAGE_MASK));
765         BUG_ON(pages == 0);
766
767         /* Fallback to domain selective flush if no PSI support */
768         if (!cap_pgsel_inv(iommu->cap))
769                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
770                                                 DMA_TLB_DSI_FLUSH,
771                                                 non_present_entry_flush);
772
773         /*
774          * PSI requires the number of pages to be a power of 2, and the base
775          * address to be naturally aligned to that size
776          */
777         mask = ilog2(__roundup_pow_of_two(pages));
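        /*
         * e.g. pages == 3 rounds up to 4, so mask == 2 and the hardware is
         * asked to invalidate 2^2 = 4 pages starting at addr.
         */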
778         /* Fallback to domain selective flush if size is too big */
779         if (mask > cap_max_amask_val(iommu->cap))
780                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
781                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
782
783         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
784                                         DMA_TLB_PSI_FLUSH,
785                                         non_present_entry_flush);
786 }
787
788 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
789 {
790         u32 pmen;
791         unsigned long flags;
792
793         spin_lock_irqsave(&iommu->register_lock, flags);
794         pmen = readl(iommu->reg + DMAR_PMEN_REG);
795         pmen &= ~DMA_PMEN_EPM;
796         writel(pmen, iommu->reg + DMAR_PMEN_REG);
797
798         /* wait for the protected region status bit to clear */
799         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
800                 readl, !(pmen & DMA_PMEN_PRS), pmen);
801
802         spin_unlock_irqrestore(&iommu->register_lock, flags);
803 }
804
805 static int iommu_enable_translation(struct intel_iommu *iommu)
806 {
807         u32 sts;
808         unsigned long flags;
809
810         spin_lock_irqsave(&iommu->register_lock, flags);
811         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
812
813         /* Make sure hardware completes it */
814         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
815                 readl, (sts & DMA_GSTS_TES), sts);
816
817         iommu->gcmd |= DMA_GCMD_TE;
818         spin_unlock_irqrestore(&iommu->register_lock, flags);
819         return 0;
820 }
821
822 static int iommu_disable_translation(struct intel_iommu *iommu)
823 {
824         u32 sts;
825         unsigned long flag;
826
827         spin_lock_irqsave(&iommu->register_lock, flag);
828         iommu->gcmd &= ~DMA_GCMD_TE;
829         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
830
831         /* Make sure hardware completes it */
832         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
833                 readl, (!(sts & DMA_GSTS_TES)), sts);
834
835         spin_unlock_irqrestore(&iommu->register_lock, flag);
836         return 0;
837 }
838
839 /* iommu interrupt handling. Most of this is MSI-like. */
840
841 static const char *fault_reason_strings[] =
842 {
843         "Software",
844         "Present bit in root entry is clear",
845         "Present bit in context entry is clear",
846         "Invalid context entry",
847         "Access beyond MGAW",
848         "PTE Write access is not set",
849         "PTE Read access is not set",
850         "Next page table ptr is invalid",
851         "Root table address invalid",
852         "Context table ptr is invalid",
853         "non-zero reserved fields in RTP",
854         "non-zero reserved fields in CTP",
855         "non-zero reserved fields in PTE",
856 };
857 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
858
859 const char *dmar_get_fault_reason(u8 fault_reason)
860 {
861         if (fault_reason > MAX_FAULT_REASON_IDX)
862                 return "Unknown";
863         else
864                 return fault_reason_strings[fault_reason];
865 }
866
867 void dmar_msi_unmask(unsigned int irq)
868 {
869         struct intel_iommu *iommu = get_irq_data(irq);
870         unsigned long flag;
871
872         /* unmask it */
873         spin_lock_irqsave(&iommu->register_lock, flag);
874         writel(0, iommu->reg + DMAR_FECTL_REG);
875         /* Read a reg to force flush the post write */
876         readl(iommu->reg + DMAR_FECTL_REG);
877         spin_unlock_irqrestore(&iommu->register_lock, flag);
878 }
879
880 void dmar_msi_mask(unsigned int irq)
881 {
882         unsigned long flag;
883         struct intel_iommu *iommu = get_irq_data(irq);
884
885         /* mask it */
886         spin_lock_irqsave(&iommu->register_lock, flag);
887         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
888         /* Read a reg to force flush the post write */
889         readl(iommu->reg + DMAR_FECTL_REG);
890         spin_unlock_irqrestore(&iommu->register_lock, flag);
891 }
892
893 void dmar_msi_write(int irq, struct msi_msg *msg)
894 {
895         struct intel_iommu *iommu = get_irq_data(irq);
896         unsigned long flag;
897
898         spin_lock_irqsave(&iommu->register_lock, flag);
899         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
900         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
901         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
902         spin_unlock_irqrestore(&iommu->register_lock, flag);
903 }
904
905 void dmar_msi_read(int irq, struct msi_msg *msg)
906 {
907         struct intel_iommu *iommu = get_irq_data(irq);
908         unsigned long flag;
909
910         spin_lock_irqsave(&iommu->register_lock, flag);
911         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
912         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
913         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
914         spin_unlock_irqrestore(&iommu->register_lock, flag);
915 }
916
917 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
918                 u8 fault_reason, u16 source_id, unsigned long long addr)
919 {
920         const char *reason;
921
922         reason = dmar_get_fault_reason(fault_reason);
923
924         printk(KERN_ERR
925                 "DMAR:[%s] Request device [%02x:%02x.%d] "
926                 "fault addr %llx \n"
927                 "DMAR:[fault reason %02d] %s\n",
928                 (type ? "DMA Read" : "DMA Write"),
929                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
930                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
931         return 0;
932 }
933
934 #define PRIMARY_FAULT_REG_LEN (16)
935 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
936 {
937         struct intel_iommu *iommu = dev_id;
938         int reg, fault_index;
939         u32 fault_status;
940         unsigned long flag;
941
942         spin_lock_irqsave(&iommu->register_lock, flag);
943         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
944
945         /* TBD: ignore advanced fault log currently */
946         if (!(fault_status & DMA_FSTS_PPF))
947                 goto clear_overflow;
948
949         fault_index = dma_fsts_fault_record_index(fault_status);
950         reg = cap_fault_reg_offset(iommu->cap);
951         while (1) {
952                 u8 fault_reason;
953                 u16 source_id;
954                 u64 guest_addr;
955                 int type;
956                 u32 data;
957
958                 /* highest 32 bits */
959                 data = readl(iommu->reg + reg +
960                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
961                 if (!(data & DMA_FRCD_F))
962                         break;
963
964                 fault_reason = dma_frcd_fault_reason(data);
965                 type = dma_frcd_type(data);
966
967                 data = readl(iommu->reg + reg +
968                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
969                 source_id = dma_frcd_source_id(data);
970
971                 guest_addr = dmar_readq(iommu->reg + reg +
972                                 fault_index * PRIMARY_FAULT_REG_LEN);
973                 guest_addr = dma_frcd_page_addr(guest_addr);
974                 /* clear the fault */
975                 writel(DMA_FRCD_F, iommu->reg + reg +
976                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
977
978                 spin_unlock_irqrestore(&iommu->register_lock, flag);
979
980                 iommu_page_fault_do_one(iommu, type, fault_reason,
981                                 source_id, guest_addr);
982
983                 fault_index++;
984                 if (fault_index >= cap_num_fault_regs(iommu->cap))
985                         fault_index = 0;
986                 spin_lock_irqsave(&iommu->register_lock, flag);
987         }
988 clear_overflow:
989         /* clear primary fault overflow */
990         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
991         if (fault_status & DMA_FSTS_PFO)
992                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
993
994         spin_unlock_irqrestore(&iommu->register_lock, flag);
995         return IRQ_HANDLED;
996 }
997
998 int dmar_set_interrupt(struct intel_iommu *iommu)
999 {
1000         int irq, ret;
1001
1002         irq = create_irq();
1003         if (!irq) {
1004                 printk(KERN_ERR "IOMMU: no free vectors\n");
1005                 return -EINVAL;
1006         }
1007
1008         set_irq_data(irq, iommu);
1009         iommu->irq = irq;
1010
1011         ret = arch_setup_dmar_msi(irq);
1012         if (ret) {
1013                 set_irq_data(irq, NULL);
1014                 iommu->irq = 0;
1015                 destroy_irq(irq);
1016                 return ret;
1017         }
1018
1019         /* Make sure any pending faults are cleared first */
1020         iommu_page_fault(irq, iommu);
1021
1022         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1023         if (ret)
1024                 printk(KERN_ERR "IOMMU: can't request irq\n");
1025         return ret;
1026 }
1027
1028 static int iommu_init_domains(struct intel_iommu *iommu)
1029 {
1030         unsigned long ndomains;
1031         unsigned long nlongs;
1032
1033         ndomains = cap_ndoms(iommu->cap);
1034         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1035         nlongs = BITS_TO_LONGS(ndomains);
1036
1037         /* TBD: there might be 64K domains,
1038          * consider other allocation for future chip
1039          */
1040         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1041         if (!iommu->domain_ids) {
1042                 printk(KERN_ERR "Allocating domain id array failed\n");
1043                 return -ENOMEM;
1044         }
1045         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1046                         GFP_KERNEL);
1047         if (!iommu->domains) {
1048                 printk(KERN_ERR "Allocating domain array failed\n");
1049                 kfree(iommu->domain_ids);
1050                 return -ENOMEM;
1051         }
1052
1053         spin_lock_init(&iommu->lock);
1054
1055         /*
1056          * if Caching mode is set, then invalid translations are tagged
1057          * with domainid 0. Hence we need to pre-allocate it.
1058          */
1059         if (cap_caching_mode(iommu->cap))
1060                 set_bit(0, iommu->domain_ids);
1061         return 0;
1062 }
1063
1064
1065 static void domain_exit(struct dmar_domain *domain);
1066
1067 void free_dmar_iommu(struct intel_iommu *iommu)
1068 {
1069         struct dmar_domain *domain;
1070         int i;
1071
1072         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1073         for (; i < cap_ndoms(iommu->cap); ) {
1074                 domain = iommu->domains[i];
1075                 clear_bit(i, iommu->domain_ids);
1076                 domain_exit(domain);
1077                 i = find_next_bit(iommu->domain_ids,
1078                         cap_ndoms(iommu->cap), i+1);
1079         }
1080
1081         if (iommu->gcmd & DMA_GCMD_TE)
1082                 iommu_disable_translation(iommu);
1083
1084         if (iommu->irq) {
1085                 set_irq_data(iommu->irq, NULL);
1086                 /* This will mask the irq */
1087                 free_irq(iommu->irq, iommu);
1088                 destroy_irq(iommu->irq);
1089         }
1090
1091         kfree(iommu->domains);
1092         kfree(iommu->domain_ids);
1093
1094         /* free context mapping */
1095         free_context_table(iommu);
1096 }
1097
1098 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1099 {
1100         unsigned long num;
1101         unsigned long ndomains;
1102         struct dmar_domain *domain;
1103         unsigned long flags;
1104
1105         domain = alloc_domain_mem();
1106         if (!domain)
1107                 return NULL;
1108
1109         ndomains = cap_ndoms(iommu->cap);
1110
1111         spin_lock_irqsave(&iommu->lock, flags);
1112         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1113         if (num >= ndomains) {
1114                 spin_unlock_irqrestore(&iommu->lock, flags);
1115                 free_domain_mem(domain);
1116                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1117                 return NULL;
1118         }
1119
1120         set_bit(num, iommu->domain_ids);
1121         domain->id = num;
1122         domain->iommu = iommu;
1123         iommu->domains[num] = domain;
1124         spin_unlock_irqrestore(&iommu->lock, flags);
1125
1126         return domain;
1127 }
1128
1129 static void iommu_free_domain(struct dmar_domain *domain)
1130 {
1131         unsigned long flags;
1132
1133         spin_lock_irqsave(&domain->iommu->lock, flags);
1134         clear_bit(domain->id, domain->iommu->domain_ids);
1135         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1136 }
1137
1138 static struct iova_domain reserved_iova_list;
1139 static struct lock_class_key reserved_alloc_key;
1140 static struct lock_class_key reserved_rbtree_key;
1141
1142 static void dmar_init_reserved_ranges(void)
1143 {
1144         struct pci_dev *pdev = NULL;
1145         struct iova *iova;
1146         int i;
1147         u64 addr, size;
1148
1149         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1150
1151         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1152                 &reserved_alloc_key);
1153         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1154                 &reserved_rbtree_key);
1155
1156         /* IOAPIC ranges shouldn't be accessed by DMA */
1157         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1158                 IOVA_PFN(IOAPIC_RANGE_END));
1159         if (!iova)
1160                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1161
1162         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1163         for_each_pci_dev(pdev) {
1164                 struct resource *r;
1165
1166                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1167                         r = &pdev->resource[i];
1168                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1169                                 continue;
1170                         addr = r->start;
1171                         addr &= PAGE_MASK;
1172                         size = r->end - addr;
1173                         size = PAGE_ALIGN(size);
1174                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1175                                 IOVA_PFN(size + addr) - 1);
1176                         if (!iova)
1177                                 printk(KERN_ERR "Reserve iova failed\n");
1178                 }
1179         }
1180
1181 }
1182
1183 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1184 {
1185         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1186 }
1187
1188 static inline int guestwidth_to_adjustwidth(int gaw)
1189 {
1190         int agaw;
1191         int r = (gaw - 12) % 9;
1192
1193         if (r == 0)
1194                 agaw = gaw;
1195         else
1196                 agaw = gaw + 9 - r;
1197         if (agaw > 64)
1198                 agaw = 64;
1199         return agaw;
1200 }
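
/*
 * Examples for guestwidth_to_adjustwidth() above: gaw == 48 gives r == 0
 * and an adjusted width of 48 (agaw 2, four levels); gaw == 35 gives
 * r == 5 and an adjusted width of 39 (agaw 1, three levels).
 */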
1201
1202 static int domain_init(struct dmar_domain *domain, int guest_width)
1203 {
1204         struct intel_iommu *iommu;
1205         int adjust_width, agaw;
1206         unsigned long sagaw;
1207
1208         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1209         spin_lock_init(&domain->mapping_lock);
1210
1211         domain_reserve_special_ranges(domain);
1212
1213         /* calculate AGAW */
1214         iommu = domain->iommu;
1215         if (guest_width > cap_mgaw(iommu->cap))
1216                 guest_width = cap_mgaw(iommu->cap);
1217         domain->gaw = guest_width;
1218         adjust_width = guestwidth_to_adjustwidth(guest_width);
1219         agaw = width_to_agaw(adjust_width);
1220         sagaw = cap_sagaw(iommu->cap);
1221         if (!test_bit(agaw, &sagaw)) {
1222                 /* hardware doesn't support it, choose a bigger one */
1223                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1224                 agaw = find_next_bit(&sagaw, 5, agaw);
1225                 if (agaw >= 5)
1226                         return -ENODEV;
1227         }
1228         domain->agaw = agaw;
1229         INIT_LIST_HEAD(&domain->devices);
1230
1231         /* always allocate the top pgd */
1232         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1233         if (!domain->pgd)
1234                 return -ENOMEM;
1235         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1236         return 0;
1237 }
1238
1239 static void domain_exit(struct dmar_domain *domain)
1240 {
1241         u64 end;
1242
1243         /* Domain 0 is reserved, so don't process it */
1244         if (!domain)
1245                 return;
1246
1247         domain_remove_dev_info(domain);
1248         /* destroy iovas */
1249         put_iova_domain(&domain->iovad);
1250         end = DOMAIN_MAX_ADDR(domain->gaw);
1251         end = end & (~PAGE_MASK);
1252
1253         /* clear ptes */
1254         dma_pte_clear_range(domain, 0, end);
1255
1256         /* free page tables */
1257         dma_pte_free_pagetable(domain, 0, end);
1258
1259         iommu_free_domain(domain);
1260         free_domain_mem(domain);
1261 }
1262
1263 static int domain_context_mapping_one(struct dmar_domain *domain,
1264                 u8 bus, u8 devfn)
1265 {
1266         struct context_entry *context;
1267         struct intel_iommu *iommu = domain->iommu;
1268         unsigned long flags;
1269
1270         pr_debug("Set context mapping for %02x:%02x.%d\n",
1271                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1272         BUG_ON(!domain->pgd);
1273         context = device_to_context_entry(iommu, bus, devfn);
1274         if (!context)
1275                 return -ENOMEM;
1276         spin_lock_irqsave(&iommu->lock, flags);
1277         if (context_present(*context)) {
1278                 spin_unlock_irqrestore(&iommu->lock, flags);
1279                 return 0;
1280         }
1281
1282         context_set_domain_id(*context, domain->id);
1283         context_set_address_width(*context, domain->agaw);
1284         context_set_address_root(*context, virt_to_phys(domain->pgd));
1285         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1286         context_set_fault_enable(*context);
1287         context_set_present(*context);
1288         __iommu_flush_cache(iommu, context, sizeof(*context));
1289
1290         /* it's a non-present to present mapping */
1291         if (iommu->flush.flush_context(iommu, domain->id,
1292                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1293                 DMA_CCMD_DEVICE_INVL, 1))
1294                 iommu_flush_write_buffer(iommu);
1295         else
1296                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1297
1298         spin_unlock_irqrestore(&iommu->lock, flags);
1299         return 0;
1300 }
1301
1302 static int
1303 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1304 {
1305         int ret;
1306         struct pci_dev *tmp, *parent;
1307
1308         ret = domain_context_mapping_one(domain, pdev->bus->number,
1309                 pdev->devfn);
1310         if (ret)
1311                 return ret;
1312
1313         /* dependent device mapping */
1314         tmp = pci_find_upstream_pcie_bridge(pdev);
1315         if (!tmp)
1316                 return 0;
1317         /* Secondary interface's bus number and devfn 0 */
1318         parent = pdev->bus->self;
1319         while (parent != tmp) {
1320                 ret = domain_context_mapping_one(domain, parent->bus->number,
1321                         parent->devfn);
1322                 if (ret)
1323                         return ret;
1324                 parent = parent->bus->self;
1325         }
1326         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1327                 return domain_context_mapping_one(domain,
1328                         tmp->subordinate->number, 0);
1329         else /* this is a legacy PCI bridge */
1330                 return domain_context_mapping_one(domain,
1331                         tmp->bus->number, tmp->devfn);
1332 }
1333
1334 static int domain_context_mapped(struct dmar_domain *domain,
1335         struct pci_dev *pdev)
1336 {
1337         int ret;
1338         struct pci_dev *tmp, *parent;
1339
1340         ret = device_context_mapped(domain->iommu,
1341                 pdev->bus->number, pdev->devfn);
1342         if (!ret)
1343                 return ret;
1344         /* dependent device mapping */
1345         tmp = pci_find_upstream_pcie_bridge(pdev);
1346         if (!tmp)
1347                 return ret;
1348         /* Secondary interface's bus number and devfn 0 */
1349         parent = pdev->bus->self;
1350         while (parent != tmp) {
1351                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1352                         parent->devfn);
1353                 if (!ret)
1354                         return ret;
1355                 parent = parent->bus->self;
1356         }
1357         if (tmp->is_pcie)
1358                 return device_context_mapped(domain->iommu,
1359                         tmp->subordinate->number, 0);
1360         else
1361                 return device_context_mapped(domain->iommu,
1362                         tmp->bus->number, tmp->devfn);
1363 }
1364
1365 static int
1366 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1367                         u64 hpa, size_t size, int prot)
1368 {
1369         u64 start_pfn, end_pfn;
1370         struct dma_pte *pte;
1371         int index;
1372         int addr_width = agaw_to_width(domain->agaw);
1373
1374         hpa &= (((u64)1) << addr_width) - 1;
1375
1376         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1377                 return -EINVAL;
1378         iova &= PAGE_MASK;
1379         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1380         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
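        /*
         * e.g. hpa == 0x12345000 and size == 8192 give start_pfn == 0x12345
         * and end_pfn == 0x12347, i.e. two last-level PTEs to fill below.
         */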
1381         index = 0;
1382         while (start_pfn < end_pfn) {
1383                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1384                 if (!pte)
1385                         return -ENOMEM;
1386                 /* We don't need lock here, nobody else
1387                  * touches the iova range
1388                  */
1389                 BUG_ON(dma_pte_addr(*pte));
1390                 dma_set_pte_addr(*pte, start_pfn << VTD_PAGE_SHIFT);
1391                 dma_set_pte_prot(*pte, prot);
1392                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1393                 start_pfn++;
1394                 index++;
1395         }
1396         return 0;
1397 }
1398
1399 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1400 {
1401         clear_context_table(domain->iommu, bus, devfn);
1402         domain->iommu->flush.flush_context(domain->iommu, 0, 0, 0,
1403                                            DMA_CCMD_GLOBAL_INVL, 0);
1404         domain->iommu->flush.flush_iotlb(domain->iommu, 0, 0, 0,
1405                                          DMA_TLB_GLOBAL_FLUSH, 0);
1406 }
1407
1408 static void domain_remove_dev_info(struct dmar_domain *domain)
1409 {
1410         struct device_domain_info *info;
1411         unsigned long flags;
1412
1413         spin_lock_irqsave(&device_domain_lock, flags);
1414         while (!list_empty(&domain->devices)) {
1415                 info = list_entry(domain->devices.next,
1416                         struct device_domain_info, link);
1417                 list_del(&info->link);
1418                 list_del(&info->global);
1419                 if (info->dev)
1420                         info->dev->dev.archdata.iommu = NULL;
1421                 spin_unlock_irqrestore(&device_domain_lock, flags);
1422
1423                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1424                 free_devinfo_mem(info);
1425
1426                 spin_lock_irqsave(&device_domain_lock, flags);
1427         }
1428         spin_unlock_irqrestore(&device_domain_lock, flags);
1429 }
1430
1431 /*
1432  * find_domain
1433  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1434  */
1435 static struct dmar_domain *
1436 find_domain(struct pci_dev *pdev)
1437 {
1438         struct device_domain_info *info;
1439
1440         /* No lock here, assumes no domain exit in normal case */
1441         info = pdev->dev.archdata.iommu;
1442         if (info)
1443                 return info->domain;
1444         return NULL;
1445 }
1446
1447 /* domain is initialized */
1448 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1449 {
1450         struct dmar_domain *domain, *found = NULL;
1451         struct intel_iommu *iommu;
1452         struct dmar_drhd_unit *drhd;
1453         struct device_domain_info *info, *tmp;
1454         struct pci_dev *dev_tmp;
1455         unsigned long flags;
1456         int bus = 0, devfn = 0;
1457
1458         domain = find_domain(pdev);
1459         if (domain)
1460                 return domain;
1461
1462         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1463         if (dev_tmp) {
1464                 if (dev_tmp->is_pcie) {
1465                         bus = dev_tmp->subordinate->number;
1466                         devfn = 0;
1467                 } else {
1468                         bus = dev_tmp->bus->number;
1469                         devfn = dev_tmp->devfn;
1470                 }
1471                 spin_lock_irqsave(&device_domain_lock, flags);
1472                 list_for_each_entry(info, &device_domain_list, global) {
1473                         if (info->bus == bus && info->devfn == devfn) {
1474                                 found = info->domain;
1475                                 break;
1476                         }
1477                 }
1478                 spin_unlock_irqrestore(&device_domain_lock, flags);
1479                 /* the pcie-to-pci bridge already has a domain; use it */
1480                 if (found) {
1481                         domain = found;
1482                         goto found_domain;
1483                 }
1484         }
1485
1486         /* Allocate new domain for the device */
1487         drhd = dmar_find_matched_drhd_unit(pdev);
1488         if (!drhd) {
1489                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1490                         pci_name(pdev));
1491                 return NULL;
1492         }
1493         iommu = drhd->iommu;
1494
1495         domain = iommu_alloc_domain(iommu);
1496         if (!domain)
1497                 goto error;
1498
1499         if (domain_init(domain, gaw)) {
1500                 domain_exit(domain);
1501                 goto error;
1502         }
1503
1504         /* register pcie-to-pci device */
1505         if (dev_tmp) {
1506                 info = alloc_devinfo_mem();
1507                 if (!info) {
1508                         domain_exit(domain);
1509                         goto error;
1510                 }
1511                 info->bus = bus;
1512                 info->devfn = devfn;
1513                 info->dev = NULL;
1514                 info->domain = domain;
1515                 /* This domain is shared by devices under p2p bridge */
1516                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1517
1518                 /* the pcie-to-pci bridge already has a domain; use it */
1519                 found = NULL;
1520                 spin_lock_irqsave(&device_domain_lock, flags);
1521                 list_for_each_entry(tmp, &device_domain_list, global) {
1522                         if (tmp->bus == bus && tmp->devfn == devfn) {
1523                                 found = tmp->domain;
1524                                 break;
1525                         }
1526                 }
1527                 if (found) {
1528                         free_devinfo_mem(info);
1529                         domain_exit(domain);
1530                         domain = found;
1531                 } else {
1532                         list_add(&info->link, &domain->devices);
1533                         list_add(&info->global, &device_domain_list);
1534                 }
1535                 spin_unlock_irqrestore(&device_domain_lock, flags);
1536         }
1537
1538 found_domain:
1539         info = alloc_devinfo_mem();
1540         if (!info)
1541                 goto error;
1542         info->bus = pdev->bus->number;
1543         info->devfn = pdev->devfn;
1544         info->dev = pdev;
1545         info->domain = domain;
1546         spin_lock_irqsave(&device_domain_lock, flags);
1547         /* somebody else beat us to it */
1548         found = find_domain(pdev);
1549         if (found != NULL) {
1550                 spin_unlock_irqrestore(&device_domain_lock, flags);
1551                 if (found != domain) {
1552                         domain_exit(domain);
1553                         domain = found;
1554                 }
1555                 free_devinfo_mem(info);
1556                 return domain;
1557         }
1558         list_add(&info->link, &domain->devices);
1559         list_add(&info->global, &device_domain_list);
1560         pdev->dev.archdata.iommu = info;
1561         spin_unlock_irqrestore(&device_domain_lock, flags);
1562         return domain;
1563 error:
1564         /* recheck it here, somebody else may have set it in the meantime */
1565         return find_domain(pdev);
1566 }
1567
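     /*
      * Set up a 1:1 ("identity") mapping of [start, end) for @pdev: find or
      * create the device's domain, reserve the matching IOVA range, map it
      * read/write in the page tables and install the context entry.  A
      * freshly created domain is torn down again on failure.
      */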
1568 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1569                                       unsigned long long start,
1570                                       unsigned long long end)
1571 {
1572         struct dmar_domain *domain;
1573         unsigned long size;
1574         unsigned long long base;
1575         int ret;
1576
1577         printk(KERN_INFO
1578                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1579                 pci_name(pdev), start, end);
1580         /* page table init */
1581         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1582         if (!domain)
1583                 return -ENOMEM;
1584
1585         /* The address might not be aligned */
1586         base = start & PAGE_MASK;
1587         size = end - base;
1588         size = PAGE_ALIGN(size);
1589         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1590                         IOVA_PFN(base + size) - 1)) {
1591                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1592                 ret = -ENOMEM;
1593                 goto error;
1594         }
1595
1596         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1597                 size, base, pci_name(pdev));
1598         /*
1599          * The RMRR range might overlap an already-mapped physical memory
1600          * range, so clear it first
1601          */
1602         dma_pte_clear_range(domain, base, base + size);
1603
1604         ret = domain_page_mapping(domain, base, base, size,
1605                 DMA_PTE_READ|DMA_PTE_WRITE);
1606         if (ret)
1607                 goto error;
1608
1609         /* context entry init */
1610         ret = domain_context_mapping(domain, pdev);
1611         if (!ret)
1612                 return 0;
1613 error:
1614         domain_exit(domain);
1615         return ret;
1617 }
1618
1619 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1620         struct pci_dev *pdev)
1621 {
1622         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1623                 return 0;
1624         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1625                 rmrr->end_address + 1);
1626 }
1627
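     /*
      * Graphics workaround (CONFIG_DMAR_GFX_WA): walk the active memory
      * regions of every online node and identity-map them for each graphics
      * device that is not already ignored, so those devices keep seeing
      * physical addresses once translation is enabled.
      */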
1628 #ifdef CONFIG_DMAR_GFX_WA
1629 struct iommu_prepare_data {
1630         struct pci_dev *pdev;
1631         int ret;
1632 };
1633
1634 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1635                                          unsigned long end_pfn, void *datax)
1636 {
1637         struct iommu_prepare_data *data;
1638
1639         data = (struct iommu_prepare_data *)datax;
1640
1641         data->ret = iommu_prepare_identity_map(data->pdev,
1642                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1643         return data->ret;
1645 }
1646
1647 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1648 {
1649         int nid;
1650         struct iommu_prepare_data data;
1651
1652         data.pdev = pdev;
1653         data.ret = 0;
1654
1655         for_each_online_node(nid) {
1656                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1657                 if (data.ret)
1658                         return data.ret;
1659         }
1660         return data.ret;
1661 }
1662
1663 static void __init iommu_prepare_gfx_mapping(void)
1664 {
1665         struct pci_dev *pdev = NULL;
1666         int ret;
1667
1668         for_each_pci_dev(pdev) {
1669                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1670                                 !IS_GFX_DEVICE(pdev))
1671                         continue;
1672                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1673                         pci_name(pdev));
1674                 ret = iommu_prepare_with_active_regions(pdev);
1675                 if (ret)
1676                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1677         }
1678 }
1679 #endif
1680
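     /*
      * Floppy workaround (CONFIG_DMAR_FLOPPY_WA): identity-map the first
      * 16MB for the ISA/LPC bridge so that legacy floppy DMA keeps working
      * once translation is enabled.
      */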
1681 #ifdef CONFIG_DMAR_FLOPPY_WA
1682 static inline void iommu_prepare_isa(void)
1683 {
1684         struct pci_dev *pdev;
1685         int ret;
1686
1687         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1688         if (!pdev)
1689                 return;
1690
1691         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1692         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1693
1694         if (ret)
1695                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1696                         "floppy might not work\n");
1698 }
1699 #else
1700 static inline void iommu_prepare_isa(void)
1701 {
1702         return;
1703 }
1704 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1705
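     /*
      * One-time DMAR bring-up: count the IOMMUs and allocate the deferred
      * flush tables, initialize each unit's domains and root entry, pick
      * queued vs. register-based invalidation, map the RMRR regions plus the
      * gfx/ISA workarounds, and finally enable fault reporting, program the
      * root entry, flush the caches and turn translation on.
      */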
1706 static int __init init_dmars(void)
1707 {
1708         struct dmar_drhd_unit *drhd;
1709         struct dmar_rmrr_unit *rmrr;
1710         struct pci_dev *pdev;
1711         struct intel_iommu *iommu;
1712         int i, ret, unit = 0;
1713
1714         /*
1715          * for each drhd
1716          *    allocate root
1717          *    initialize and program root entry to not present
1718          * endfor
1719          */
1720         for_each_drhd_unit(drhd) {
1721                 g_num_of_iommus++;
1722                 /*
1723                  * lock not needed as this is only incremented in the
1724                  * single-threaded kernel __init code path; all other
1725                  * accesses are read-only
1726                  */
1727         }
1728
1729         deferred_flush = kzalloc(g_num_of_iommus *
1730                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1731         if (!deferred_flush) {
1732                 ret = -ENOMEM;
1733                 goto error;
1734         }
1735
1736         for_each_drhd_unit(drhd) {
1737                 if (drhd->ignored)
1738                         continue;
1739
1740                 iommu = drhd->iommu;
1741
1742                 ret = iommu_init_domains(iommu);
1743                 if (ret)
1744                         goto error;
1745
1746                 /*
1747                  * TBD:
1748                  * we could share the same root & context tables
1749                  * among all IOMMUs.  Need to split it later.
1750                  */
1751                 ret = iommu_alloc_root_entry(iommu);
1752                 if (ret) {
1753                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1754                         goto error;
1755                 }
1756         }
1757
1758         for_each_drhd_unit(drhd) {
1759                 if (drhd->ignored)
1760                         continue;
1761
1762                 iommu = drhd->iommu;
1763                 if (dmar_enable_qi(iommu)) {
1764                         /*
1765                          * Queued Invalidation could not be enabled, fall
1766                          * back to Register-Based Invalidation
1767                          */
1768                         iommu->flush.flush_context = __iommu_flush_context;
1769                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1770                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1771                                "invalidation\n",
1772                                (unsigned long long)drhd->reg_base_addr);
1773                 } else {
1774                         iommu->flush.flush_context = qi_flush_context;
1775                         iommu->flush.flush_iotlb = qi_flush_iotlb;
1776                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1777                                "invalidation\n",
1778                                (unsigned long long)drhd->reg_base_addr);
1779                 }
1780         }
1781
1782         /*
1783          * For each rmrr
1784          *   for each dev attached to rmrr
1785          *   do
1786          *     locate drhd for dev, alloc domain for dev
1787          *     allocate free domain
1788          *     allocate page table entries for rmrr
1789          *     if context not allocated for bus
1790          *           allocate and init context
1791          *           set present in root table for this bus
1792          *     init context with domain, translation etc
1793          *    endfor
1794          * endfor
1795          */
1796         for_each_rmrr_units(rmrr) {
1797                 for (i = 0; i < rmrr->devices_cnt; i++) {
1798                         pdev = rmrr->devices[i];
1799                         /* some BIOSes list non-existent devices in the DMAR table */
1800                         if (!pdev)
1801                                 continue;
1802                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1803                         if (ret)
1804                                 printk(KERN_ERR
1805                                  "IOMMU: mapping reserved region failed\n");
1806                 }
1807         }
1808
1809         iommu_prepare_gfx_mapping();
1810
1811         iommu_prepare_isa();
1812
1813         /*
1814          * for each drhd
1815          *   enable fault log
1816          *   global invalidate context cache
1817          *   global invalidate iotlb
1818          *   enable translation
1819          */
1820         for_each_drhd_unit(drhd) {
1821                 if (drhd->ignored)
1822                         continue;
1823                 iommu = drhd->iommu;
1824                 sprintf(iommu->name, "dmar%d", unit++);
1825
1826                 iommu_flush_write_buffer(iommu);
1827
1828                 ret = dmar_set_interrupt(iommu);
1829                 if (ret)
1830                         goto error;
1831
1832                 iommu_set_root_entry(iommu);
1833
1834                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
1835                                            0);
1836                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
1837                                          0);
1838                 iommu_disable_protect_mem_regions(iommu);
1839
1840                 ret = iommu_enable_translation(iommu);
1841                 if (ret)
1842                         goto error;
1843         }
1844
1845         return 0;
1846 error:
1847         for_each_drhd_unit(drhd) {
1848                 if (drhd->ignored)
1849                         continue;
1850                 iommu = drhd->iommu;
1851                 free_iommu(iommu);
1852         }
1853         return ret;
1854 }
1855
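     /*
      * aligned_size() - bytes needed, rounded up to whole pages, to map @size
      * bytes starting at @host_addr.  E.g. with 4KiB pages,
      * aligned_size(0x1ff8, 0x10) covers 8 bytes in one page and 8 in the
      * next, so it returns 0x2000.
      */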
1856 static inline u64 aligned_size(u64 host_addr, size_t size)
1857 {
1858         u64 addr;
1859         addr = (host_addr & (~PAGE_MASK)) + size;
1860         return PAGE_ALIGN(addr);
1861 }
1862
1863 struct iova *
1864 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1865 {
1866         struct iova *piova;
1867
1868         /* Make sure it's in range */
1869         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1870         if (!size || (IOVA_START_ADDR + size > end))
1871                 return NULL;
1872
1873         piova = alloc_iova(&domain->iovad,
1874                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
1875         return piova;
1876 }
1877
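     /*
      * Allocate an IOVA range of @size bytes for @dev.  Unless the device is
      * 32-bit-only or dmar_forcedac is set, a sub-4GB address is tried first
      * so that DAC addresses are only handed out when the 32-bit space is
      * exhausted.
      */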
1878 static struct iova *
1879 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1880                    size_t size, u64 dma_mask)
1881 {
1882         struct pci_dev *pdev = to_pci_dev(dev);
1883         struct iova *iova = NULL;
1884
1885         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
1886                 iova = iommu_alloc_iova(domain, size, dma_mask);
1887         else {
1888                 /*
1889                  * First try to allocate an io virtual address in
1890                  * DMA_32BIT_MASK and if that fails then try allocating
1891                  * from higher range
1892                  */
1893                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1894                 if (!iova)
1895                         iova = iommu_alloc_iova(domain, size, dma_mask);
1896         }
1897
1898         if (!iova) {
1899                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
1900                 return NULL;
1901         }
1902
1903         return iova;
1904 }
1905
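     /*
      * Like get_domain_for_dev(), but additionally makes sure the device's
      * context entry is programmed, so the returned domain is immediately
      * usable for DMA mappings.
      */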
1906 static struct dmar_domain *
1907 get_valid_domain_for_dev(struct pci_dev *pdev)
1908 {
1909         struct dmar_domain *domain;
1910         int ret;
1911
1912         domain = get_domain_for_dev(pdev,
1913                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1914         if (!domain) {
1915                 printk(KERN_ERR
1916                         "Allocating domain for %s failed\n", pci_name(pdev));
1917                 return NULL;
1918         }
1919
1920         /* make sure context mapping is ok */
1921         if (unlikely(!domain_context_mapped(domain, pdev))) {
1922                 ret = domain_context_mapping(domain, pdev);
1923                 if (ret) {
1924                         printk(KERN_ERR
1925                                 "Domain context map for %s failed\n",
1926                                 pci_name(pdev));
1927                         return NULL;
1928                 }
1929         }
1930
1931         return domain;
1932 }
1933
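     /*
      * Core map_single path: look up (or create) the device's domain,
      * allocate an IOVA range big enough for the possibly unaligned buffer,
      * install page-table entries with protection bits derived from @dir and
      * make the new mapping visible with an IOTLB flush (or a write-buffer
      * flush).  Returns the bus address for the device, or 0 on failure.
      */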
1934 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
1935                                      size_t size, int dir, u64 dma_mask)
1936 {
1937         struct pci_dev *pdev = to_pci_dev(hwdev);
1938         struct dmar_domain *domain;
1939         phys_addr_t start_paddr;
1940         struct iova *iova;
1941         int prot = 0;
1942         int ret;
1943
1944         BUG_ON(dir == DMA_NONE);
1945         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1946                 return paddr;
1947
1948         domain = get_valid_domain_for_dev(pdev);
1949         if (!domain)
1950                 return 0;
1951
1952         size = aligned_size((u64)paddr, size);
1953
1954         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
1955         if (!iova)
1956                 goto error;
1957
1958         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
1959
1960         /*
1961          * Check if DMAR supports zero-length reads on write only
1962          * mappings..
1963          */
1964         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
1965                         !cap_zlr(domain->iommu->cap))
1966                 prot |= DMA_PTE_READ;
1967         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1968                 prot |= DMA_PTE_WRITE;
1969         /*
1970          * paddr through paddr + size might span a partial page, so map the
1971          * whole page.  Note: if two parts of one page are mapped separately,
1972          * we might end up with two guest addresses mapping to the same host
1973          * paddr, but this is not a big problem
1974          */
1975         ret = domain_page_mapping(domain, start_paddr,
1976                 ((u64)paddr) & PAGE_MASK, size, prot);
1977         if (ret)
1978                 goto error;
1979
1980         /* it's a non-present to present mapping */
1981         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1982                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
1983         if (ret)
1984                 iommu_flush_write_buffer(domain->iommu);
1985
1986         return start_paddr + ((u64)paddr & (~PAGE_MASK));
1987
1988 error:
1989         if (iova)
1990                 __free_iova(&domain->iovad, iova);
1991         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
1992                 pci_name(pdev), size, (unsigned long long)paddr, dir);
1993         return 0;
1994 }
1995
1996 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
1997                             size_t size, int dir)
1998 {
1999         return __intel_map_single(hwdev, paddr, size, dir,
2000                                   to_pci_dev(hwdev)->dma_mask);
2001 }
2002
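     /*
      * Drain the deferred-unmap queues of every IOMMU: one global IOTLB
      * flush per unit, then free the queued IOVAs.  Both callers hold
      * async_umap_flush_lock around this.
      */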
2003 static void flush_unmaps(void)
2004 {
2005         int i, j;
2006
2007         timer_on = 0;
2008
2009         /* just flush them all */
2010         for (i = 0; i < g_num_of_iommus; i++) {
2011                 if (deferred_flush[i].next) {
2012                         struct intel_iommu *iommu =
2013                                 deferred_flush[i].domain[0]->iommu;
2014
2015                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2016                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2017                         for (j = 0; j < deferred_flush[i].next; j++) {
2018                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2019                                                 deferred_flush[i].iova[j]);
2020                         }
2021                         deferred_flush[i].next = 0;
2022                 }
2023         }
2024
2025         list_size = 0;
2026 }
2027
2028 static void flush_unmaps_timeout(unsigned long data)
2029 {
2030         unsigned long flags;
2031
2032         spin_lock_irqsave(&async_umap_flush_lock, flags);
2033         flush_unmaps();
2034         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2035 }
2036
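     /*
      * Queue an IOVA for lazy freeing instead of flushing the IOTLB
      * immediately.  The queue is drained once it reaches HIGH_WATER_MARK
      * entries or when unmap_timer fires, roughly 10ms after the first
      * deferred entry.
      */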
2037 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2038 {
2039         unsigned long flags;
2040         int next, iommu_id;
2041
2042         spin_lock_irqsave(&async_umap_flush_lock, flags);
2043         if (list_size == HIGH_WATER_MARK)
2044                 flush_unmaps();
2045
2046         iommu_id = dom->iommu->seq_id;
2047
2048         next = deferred_flush[iommu_id].next;
2049         deferred_flush[iommu_id].domain[next] = dom;
2050         deferred_flush[iommu_id].iova[next] = iova;
2051         deferred_flush[iommu_id].next++;
2052
2053         if (!timer_on) {
2054                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2055                 timer_on = 1;
2056         }
2057         list_size++;
2058         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2059 }
2060
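     /*
      * Unmap a single mapping: clear and free the page-table range that
      * backs the IOVA.  With intel_iommu_strict the IOTLB is flushed and the
      * IOVA freed right away; otherwise the IOVA is handed to add_unmap()
      * for batched flushing.
      */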
2061 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2062                         int dir)
2063 {
2064         struct pci_dev *pdev = to_pci_dev(dev);
2065         struct dmar_domain *domain;
2066         unsigned long start_addr;
2067         struct iova *iova;
2068
2069         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2070                 return;
2071         domain = find_domain(pdev);
2072         BUG_ON(!domain);
2073
2074         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2075         if (!iova)
2076                 return;
2077
2078         start_addr = iova->pfn_lo << PAGE_SHIFT;
2079         size = aligned_size((u64)dev_addr, size);
2080
2081         pr_debug("Device %s unmapping: %lx@%llx\n",
2082                 pci_name(pdev), size, (unsigned long long)start_addr);
2083
2084         /*  clear the whole page */
2085         dma_pte_clear_range(domain, start_addr, start_addr + size);
2086         /* free page tables */
2087         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2088         if (intel_iommu_strict) {
2089                 if (iommu_flush_iotlb_psi(domain->iommu,
2090                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2091                         iommu_flush_write_buffer(domain->iommu);
2092                 /* free iova */
2093                 __free_iova(&domain->iovad, iova);
2094         } else {
2095                 add_unmap(domain, iova);
2096                 /*
2097                  * queue up the release of the unmap to save roughly 1/6th of
2098                  * the CPU time otherwise spent on the iotlb flush operation...
2099                  */
2100         }
2101 }
2102
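     /*
      * dma_ops hook for coherent allocations.  Illustrative caller (not from
      * this file, variable names made up):
      *
      *	void *ring = dma_alloc_coherent(&pdev->dev, len, &ring_dma, GFP_KERNEL);
      *
      * which typically lands here once intel_dma_ops has been installed as
      * the system dma_ops.
      */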
2103 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2104                            dma_addr_t *dma_handle, gfp_t flags)
2105 {
2106         void *vaddr;
2107         int order;
2108
2109         size = PAGE_ALIGN(size);
2110         order = get_order(size);
2111         flags &= ~(GFP_DMA | GFP_DMA32);
2112
2113         vaddr = (void *)__get_free_pages(flags, order);
2114         if (!vaddr)
2115                 return NULL;
2116         memset(vaddr, 0, size);
2117
2118         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2119                                          DMA_BIDIRECTIONAL,
2120                                          hwdev->coherent_dma_mask);
2121         if (*dma_handle)
2122                 return vaddr;
2123         free_pages((unsigned long)vaddr, order);
2124         return NULL;
2125 }
2126
2127 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2128                          dma_addr_t dma_handle)
2129 {
2130         int order;
2131
2132         size = PAGE_ALIGN(size);
2133         order = get_order(size);
2134
2135         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2136         free_pages((unsigned long)vaddr, order);
2137 }
2138
2139 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2140
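     /*
      * Scatterlist counterpart of intel_unmap_single(): recompute the total
      * aligned size of all entries to find the extent of the original
      * mapping, then clear the PTEs, free the page tables, flush the IOTLB
      * and release the IOVA.
      */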
2141 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2142                     int nelems, int dir)
2143 {
2144         int i;
2145         struct pci_dev *pdev = to_pci_dev(hwdev);
2146         struct dmar_domain *domain;
2147         unsigned long start_addr;
2148         struct iova *iova;
2149         size_t size = 0;
2150         void *addr;
2151         struct scatterlist *sg;
2152
2153         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2154                 return;
2155
2156         domain = find_domain(pdev);
2157
2158         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2159         if (!iova)
2160                 return;
2161         for_each_sg(sglist, sg, nelems, i) {
2162                 addr = SG_ENT_VIRT_ADDRESS(sg);
2163                 size += aligned_size((u64)addr, sg->length);
2164         }
2165
2166         start_addr = iova->pfn_lo << PAGE_SHIFT;
2167
2168         /*  clear the whole page */
2169         dma_pte_clear_range(domain, start_addr, start_addr + size);
2170         /* free page tables */
2171         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2172
2173         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2174                         size >> VTD_PAGE_SHIFT, 0))
2175                 iommu_flush_write_buffer(domain->iommu);
2176
2177         /* free iova */
2178         __free_iova(&domain->iovad, iova);
2179 }
2180
2181 static int intel_nontranslate_map_sg(struct device *hwdev,
2182         struct scatterlist *sglist, int nelems, int dir)
2183 {
2184         int i;
2185         struct scatterlist *sg;
2186
2187         for_each_sg(sglist, sg, nelems, i) {
2188                 BUG_ON(!sg_page(sg));
2189                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2190                 sg->dma_length = sg->length;
2191         }
2192         return nelems;
2193 }
2194
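     /*
      * Map a scatterlist: one IOVA range covering the aligned size of all
      * entries is allocated and each segment is mapped contiguously into it.
      * On any mapping failure everything mapped so far is torn down and 0 is
      * returned; devices behind an ignored IOMMU fall back to
      * intel_nontranslate_map_sg().
      */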
2195 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2196                  int dir)
2197 {
2198         void *addr;
2199         int i;
2200         struct pci_dev *pdev = to_pci_dev(hwdev);
2201         struct dmar_domain *domain;
2202         size_t size = 0;
2203         int prot = 0;
2204         size_t offset = 0;
2205         struct iova *iova = NULL;
2206         int ret;
2207         struct scatterlist *sg;
2208         unsigned long start_addr;
2209
2210         BUG_ON(dir == DMA_NONE);
2211         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2212                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2213
2214         domain = get_valid_domain_for_dev(pdev);
2215         if (!domain)
2216                 return 0;
2217
2218         for_each_sg(sglist, sg, nelems, i) {
2219                 addr = SG_ENT_VIRT_ADDRESS(sg);
2220                 addr = (void *)virt_to_phys(addr);
2221                 size += aligned_size((u64)addr, sg->length);
2222         }
2223
2224         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2225         if (!iova) {
2226                 sglist->dma_length = 0;
2227                 return 0;
2228         }
2229
2230         /*
2231          * Check if DMAR supports zero-length reads on write only
2232          * mappings..
2233          */
2234         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2235                         !cap_zlr(domain->iommu->cap))
2236                 prot |= DMA_PTE_READ;
2237         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2238                 prot |= DMA_PTE_WRITE;
2239
2240         start_addr = iova->pfn_lo << PAGE_SHIFT;
2241         offset = 0;
2242         for_each_sg(sglist, sg, nelems, i) {
2243                 addr = SG_ENT_VIRT_ADDRESS(sg);
2244                 addr = (void *)virt_to_phys(addr);
2245                 size = aligned_size((u64)addr, sg->length);
2246                 ret = domain_page_mapping(domain, start_addr + offset,
2247                         ((u64)addr) & PAGE_MASK,
2248                         size, prot);
2249                 if (ret) {
2250                         /*  clear the page */
2251                         dma_pte_clear_range(domain, start_addr,
2252                                   start_addr + offset);
2253                         /* free page tables */
2254                         dma_pte_free_pagetable(domain, start_addr,
2255                                   start_addr + offset);
2256                         /* free iova */
2257                         __free_iova(&domain->iovad, iova);
2258                         return 0;
2259                 }
2260                 sg->dma_address = start_addr + offset +
2261                                 ((u64)addr & (~PAGE_MASK));
2262                 sg->dma_length = sg->length;
2263                 offset += size;
2264         }
2265
2266         /* it's a non-present to present mapping */
2267         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2268                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2269                 iommu_flush_write_buffer(domain->iommu);
2270         return nelems;
2271 }
2272
2273 static struct dma_mapping_ops intel_dma_ops = {
2274         .alloc_coherent = intel_alloc_coherent,
2275         .free_coherent = intel_free_coherent,
2276         .map_single = intel_map_single,
2277         .unmap_single = intel_unmap_single,
2278         .map_sg = intel_map_sg,
2279         .unmap_sg = intel_unmap_sg,
2280 };
2281
2282 static inline int iommu_domain_cache_init(void)
2283 {
2284         int ret = 0;
2285
2286         iommu_domain_cache = kmem_cache_create("iommu_domain",
2287                                          sizeof(struct dmar_domain),
2288                                          0,
2289                                          SLAB_HWCACHE_ALIGN,
2291                                          NULL);
2292         if (!iommu_domain_cache) {
2293                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2294                 ret = -ENOMEM;
2295         }
2296
2297         return ret;
2298 }
2299
2300 static inline int iommu_devinfo_cache_init(void)
2301 {
2302         int ret = 0;
2303
2304         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2305                                          sizeof(struct device_domain_info),
2306                                          0,
2307                                          SLAB_HWCACHE_ALIGN,
2308                                          NULL);
2309         if (!iommu_devinfo_cache) {
2310                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2311                 ret = -ENOMEM;
2312         }
2313
2314         return ret;
2315 }
2316
2317 static inline int iommu_iova_cache_init(void)
2318 {
2319         int ret = 0;
2320
2321         iommu_iova_cache = kmem_cache_create("iommu_iova",
2322                                          sizeof(struct iova),
2323                                          0,
2324                                          SLAB_HWCACHE_ALIGN,
2325                                          NULL);
2326         if (!iommu_iova_cache) {
2327                 printk(KERN_ERR "Couldn't create iova cache\n");
2328                 ret = -ENOMEM;
2329         }
2330
2331         return ret;
2332 }
2333
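     /*
      * Create the three kmem caches (iova, domain, devinfo) used by this
      * driver; on failure whatever was already created is destroyed again.
      */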
2334 static int __init iommu_init_mempool(void)
2335 {
2336         int ret;
2337         ret = iommu_iova_cache_init();
2338         if (ret)
2339                 return ret;
2340
2341         ret = iommu_domain_cache_init();
2342         if (ret)
2343                 goto domain_error;
2344
2345         ret = iommu_devinfo_cache_init();
2346         if (!ret)
2347                 return ret;
2348
2349         kmem_cache_destroy(iommu_domain_cache);
2350 domain_error:
2351         kmem_cache_destroy(iommu_iova_cache);
2352
2353         return -ENOMEM;
2354 }
2355
2356 static void __init iommu_exit_mempool(void)
2357 {
2358         kmem_cache_destroy(iommu_devinfo_cache);
2359         kmem_cache_destroy(iommu_domain_cache);
2360         kmem_cache_destroy(iommu_iova_cache);
2362 }
2363
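     /*
      * Mark DRHD units that can be ignored: units whose device scope
      * contains no present PCI devices and, unless dmar_map_gfx is set,
      * units that cover only graphics devices.  Devices under the latter are
      * tagged with DUMMY_DEVICE_DOMAIN_INFO so the DMA API bypasses
      * translation for them.
      */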
2364 static void __init init_no_remapping_devices(void)
2365 {
2366         struct dmar_drhd_unit *drhd;
2367
2368         for_each_drhd_unit(drhd) {
2369                 if (!drhd->include_all) {
2370                         int i;
2371                         for (i = 0; i < drhd->devices_cnt; i++)
2372                                 if (drhd->devices[i] != NULL)
2373                                         break;
2374                         /* ignore DMAR unit if no pci devices exist */
2375                         if (i == drhd->devices_cnt)
2376                                 drhd->ignored = 1;
2377                 }
2378         }
2379
2380         if (dmar_map_gfx)
2381                 return;
2382
2383         for_each_drhd_unit(drhd) {
2384                 int i;
2385                 if (drhd->ignored || drhd->include_all)
2386                         continue;
2387
2388                 for (i = 0; i < drhd->devices_cnt; i++)
2389                         if (drhd->devices[i] &&
2390                                 !IS_GFX_DEVICE(drhd->devices[i]))
2391                                 break;
2392
2393                 if (i < drhd->devices_cnt)
2394                         continue;
2395
2396                 /* bypass IOMMU if it is just for gfx devices */
2397                 drhd->ignored = 1;
2398                 for (i = 0; i < drhd->devices_cnt; i++) {
2399                         if (!drhd->devices[i])
2400                                 continue;
2401                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2402                 }
2403         }
2404 }
2405
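     /*
      * Driver entry point, typically reached from the arch's IOMMU
      * initialization once the DMAR table can be parsed.  Bails out when
      * no_iommu, swiotlb or dmar_disabled is set; otherwise sets up the
      * mempools, reserved IOVA ranges and all DMAR units, and installs
      * intel_dma_ops as the system dma_ops.
      */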
2406 int __init intel_iommu_init(void)
2407 {
2408         int ret = 0;
2409
2410         if (dmar_table_init())
2411                 return -ENODEV;
2412
2413         if (dmar_dev_scope_init())
2414                 return -ENODEV;
2415
2416         /*
2417          * Check the need for DMA-remapping initialization now.
2418          * Above initialization will also be used by Interrupt-remapping.
2419          */
2420         if (no_iommu || swiotlb || dmar_disabled)
2421                 return -ENODEV;
2422
2423         iommu_init_mempool();
2424         dmar_init_reserved_ranges();
2425
2426         init_no_remapping_devices();
2427
2428         ret = init_dmars();
2429         if (ret) {
2430                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2431                 put_iova_domain(&reserved_iova_list);
2432                 iommu_exit_mempool();
2433                 return ret;
2434         }
2435         printk(KERN_INFO
2436         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2437
2438         init_timer(&unmap_timer);
2439         force_iommu = 1;
2440         dma_ops = &intel_dma_ops;
2441         return 0;
2442 }
2443
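     /*
      * The intel_iommu_* exports below form a small external interface
      * (intended for device-assignment users such as KVM): allocate and tear
      * down a domain, wire up context and page mappings, detach a device and
      * translate an IOVA back to a host pfn.
      */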
2444 void intel_iommu_domain_exit(struct dmar_domain *domain)
2445 {
2446         u64 end;
2447
2448         /* Domain 0 is reserved, so don't process it */
2449         if (!domain)
2450                 return;
2451
2452         end = DOMAIN_MAX_ADDR(domain->gaw);
2453         end = end & VTD_PAGE_MASK;
2454
2455         /* clear ptes */
2456         dma_pte_clear_range(domain, 0, end);
2457
2458         /* free page tables */
2459         dma_pte_free_pagetable(domain, 0, end);
2460
2461         iommu_free_domain(domain);
2462         free_domain_mem(domain);
2463 }
2464 EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2465
2466 struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2467 {
2468         struct dmar_drhd_unit *drhd;
2469         struct dmar_domain *domain;
2470         struct intel_iommu *iommu;
2471
2472         drhd = dmar_find_matched_drhd_unit(pdev);
2473         if (!drhd) {
2474                 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2475                 return NULL;
2476         }
2477
2478         iommu = drhd->iommu;
2479         if (!iommu) {
2480                 printk(KERN_ERR
2481                         "intel_iommu_domain_alloc: iommu == NULL\n");
2482                 return NULL;
2483         }
2484         domain = iommu_alloc_domain(iommu);
2485         if (!domain) {
2486                 printk(KERN_ERR
2487                         "intel_iommu_domain_alloc: domain == NULL\n");
2488                 return NULL;
2489         }
2490         if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2491                 printk(KERN_ERR
2492                         "intel_iommu_domain_alloc: domain_init() failed\n");
2493                 intel_iommu_domain_exit(domain);
2494                 return NULL;
2495         }
2496         return domain;
2497 }
2498 EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2499
2500 int intel_iommu_context_mapping(
2501         struct dmar_domain *domain, struct pci_dev *pdev)
2502 {
2503         int rc;
2504         rc = domain_context_mapping(domain, pdev);
2505         return rc;
2506 }
2507 EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2508
2509 int intel_iommu_page_mapping(
2510         struct dmar_domain *domain, dma_addr_t iova,
2511         u64 hpa, size_t size, int prot)
2512 {
2513         int rc;
2514         rc = domain_page_mapping(domain, iova, hpa, size, prot);
2515         return rc;
2516 }
2517 EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
2518
2519 void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2520 {
2521         detach_domain_for_dev(domain, bus, devfn);
2522 }
2523 EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2524
2525 struct dmar_domain *
2526 intel_iommu_find_domain(struct pci_dev *pdev)
2527 {
2528         return find_domain(pdev);
2529 }
2530 EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2531
2532 int intel_iommu_found(void)
2533 {
2534         return g_num_of_iommus;
2535 }
2536 EXPORT_SYMBOL_GPL(intel_iommu_found);
2537
2538 u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2539 {
2540         struct dma_pte *pte;
2541         u64 pfn;
2542
2543         pfn = 0;
2544         pte = addr_to_dma_pte(domain, iova);
2545
2546         if (pte)
2547                 pfn = dma_pte_addr(*pte);
2548
2549         return pfn >> VTD_PAGE_SHIFT;
2550 }
2551 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);