intel-iommu: trivially inline DMA PTE macros
[firefly-linux-kernel-4.4.55.git] drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
40 #include "pci.h"
41
42 #define ROOT_SIZE               VTD_PAGE_SIZE
43 #define CONTEXT_SIZE            VTD_PAGE_SIZE
44
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
47
48 #define IOAPIC_RANGE_START      (0xfee00000)
49 #define IOAPIC_RANGE_END        (0xfeefffff)
50 #define IOVA_START_ADDR         (0x1000)
51
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
55
56 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
57 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
58 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
59
60 /*
61  * 0: Present
62  * 1-11: Reserved
63  * 12-63: Context Ptr (12 - (haw-1))
64  * 64-127: Reserved
65  */
66 struct root_entry {
67         u64     val;
68         u64     rsvd1;
69 };
70 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
71 static inline bool root_present(struct root_entry *root)
72 {
73         return (root->val & 1);
74 }
75 static inline void set_root_present(struct root_entry *root)
76 {
77         root->val |= 1;
78 }
79 static inline void set_root_value(struct root_entry *root, unsigned long value)
80 {
81         root->val |= value & VTD_PAGE_MASK;
82 }
83
84 static inline struct context_entry *
85 get_context_addr_from_root(struct root_entry *root)
86 {
87         return (struct context_entry *)
88                 (root_present(root)?phys_to_virt(
89                 root->val & VTD_PAGE_MASK) :
90                 NULL);
91 }
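
/*
 * For illustration (not part of the original driver): a root entry is just
 * the physical address of a 4KB context-entry table plus a present bit, so
 * populating one looks roughly like device_to_context_entry() below:
 *
 *	struct context_entry *ctx = alloc_pgtable_page();
 *	set_root_value(root, virt_to_phys(ctx));        (bits 12-63)
 *	set_root_present(root);                         (bit 0)
 *
 * after which get_context_addr_from_root(root) returns ctx again.
 */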
92
93 /*
94  * low 64 bits:
95  * 0: present
96  * 1: fault processing disable
97  * 2-3: translation type
98  * 12-63: address space root
99  * high 64 bits:
100  * 0-2: address width
101  * 3-6: avail
102  * 8-23: domain id
103  */
104 struct context_entry {
105         u64 lo;
106         u64 hi;
107 };
108
109 static inline bool context_present(struct context_entry *context)
110 {
111         return (context->lo & 1);
112 }
113 static inline void context_set_present(struct context_entry *context)
114 {
115         context->lo |= 1;
116 }
117
118 static inline void context_set_fault_enable(struct context_entry *context)
119 {
120         context->lo &= (((u64)-1) << 2) | 1;
121 }
122
123 #define CONTEXT_TT_MULTI_LEVEL 0
124
125 static inline void context_set_translation_type(struct context_entry *context,
126                                                 unsigned long value)
127 {
128         context->lo &= (((u64)-1) << 4) | 3;
129         context->lo |= (value & 3) << 2;
130 }
131
132 static inline void context_set_address_root(struct context_entry *context,
133                                             unsigned long value)
134 {
135         context->lo |= value & VTD_PAGE_MASK;
136 }
137
138 static inline void context_set_address_width(struct context_entry *context,
139                                              unsigned long value)
140 {
141         context->hi |= value & 7;
142 }
143
144 static inline void context_set_domain_id(struct context_entry *context,
145                                          unsigned long value)
146 {
147         context->hi |= (value & ((1 << 16) - 1)) << 8;
148 }
149
150 static inline void context_clear_entry(struct context_entry *context)
151 {
152         context->lo = 0;
153         context->hi = 0;
154 }
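
/*
 * For illustration (not part of the original driver): the setters above are
 * used together to program a complete context entry, exactly as
 * domain_context_mapping_one() does further down:
 *
 *	context_set_domain_id(context, domain->id);
 *	context_set_address_width(context, domain->agaw);
 *	context_set_address_root(context, virt_to_phys(domain->pgd));
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 */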
155
156 /*
157  * 0: readable
158  * 1: writable
159  * 2-6: reserved
160  * 7: super page
161  * 8-11: available
162  * 12-63: Host physical address
163  */
164 struct dma_pte {
165         u64 val;
166 };
167
168 static inline void dma_clear_pte(struct dma_pte *pte)
169 {
170         pte->val = 0;
171 }
172
173 static inline void dma_set_pte_readable(struct dma_pte *pte)
174 {
175         pte->val |= DMA_PTE_READ;
176 }
177
178 static inline void dma_set_pte_writable(struct dma_pte *pte)
179 {
180         pte->val |= DMA_PTE_WRITE;
181 }
182
183 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
184 {
185         pte->val = (pte->val & ~3) | (prot & 3);
186 }
187
188 static inline u64 dma_pte_addr(struct dma_pte *pte)
189 {
190         return (pte->val & VTD_PAGE_MASK);
191 }
192
193 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
194 {
195         pte->val |= (addr & VTD_PAGE_MASK);
196 }
197
198 static inline bool dma_pte_present(struct dma_pte *pte)
199 {
200         return (pte->val & 3) != 0;
201 }
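
/*
 * For illustration (not part of the original driver): a leaf PTE is built
 * from a host page-frame address plus the read/write bits, as
 * domain_page_mapping() does later on:
 *
 *	dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
 *	dma_set_pte_prot(pte, DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * dma_pte_present(pte) is then true, and dma_clear_pte(pte) undoes it.
 */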
202
203 struct dmar_domain {
204         int     id;                     /* domain id */
205         struct intel_iommu *iommu;      /* back pointer to owning iommu */
206
207         struct list_head devices;       /* all devices' list */
208         struct iova_domain iovad;       /* iova's that belong to this domain */
209
210         struct dma_pte  *pgd;           /* virtual address */
211         spinlock_t      mapping_lock;   /* page table lock */
212         int             gaw;            /* max guest address width */
213
214         /* adjusted guest address width; agaw 0 means a 2-level, 30-bit table */
215         int             agaw;
216
217 #define DOMAIN_FLAG_MULTIPLE_DEVICES 1
218         int             flags;
219 };
220
221 /* PCI domain-device relationship */
222 struct device_domain_info {
223         struct list_head link;  /* link to domain siblings */
224         struct list_head global; /* link to global list */
225         u8 bus;                 /* PCI bus number */
226         u8 devfn;               /* PCI devfn number */
227         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
228         struct dmar_domain *domain; /* pointer to domain */
229 };
230
231 static void flush_unmaps_timeout(unsigned long data);
232
233 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
234
235 #define HIGH_WATER_MARK 250
236 struct deferred_flush_tables {
237         int next;
238         struct iova *iova[HIGH_WATER_MARK];
239         struct dmar_domain *domain[HIGH_WATER_MARK];
240 };
241
242 static struct deferred_flush_tables *deferred_flush;
243
244 /* number of IOMMUs in the system */
245 static int g_num_of_iommus;
246
247 static DEFINE_SPINLOCK(async_umap_flush_lock);
248 static LIST_HEAD(unmaps_to_do);
249
250 static int timer_on;
251 static long list_size;
252
253 static void domain_remove_dev_info(struct dmar_domain *domain);
254
255 int dmar_disabled;
256 static int __initdata dmar_map_gfx = 1;
257 static int dmar_forcedac;
258 static int intel_iommu_strict;
259
260 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
261 static DEFINE_SPINLOCK(device_domain_lock);
262 static LIST_HEAD(device_domain_list);
263
264 static int __init intel_iommu_setup(char *str)
265 {
266         if (!str)
267                 return -EINVAL;
268         while (*str) {
269                 if (!strncmp(str, "off", 3)) {
270                         dmar_disabled = 1;
271                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
272                 } else if (!strncmp(str, "igfx_off", 8)) {
273                         dmar_map_gfx = 0;
274                         printk(KERN_INFO
275                                 "Intel-IOMMU: disable GFX device mapping\n");
276                 } else if (!strncmp(str, "forcedac", 8)) {
277                         printk(KERN_INFO
278                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
279                         dmar_forcedac = 1;
280                 } else if (!strncmp(str, "strict", 6)) {
281                         printk(KERN_INFO
282                                 "Intel-IOMMU: disable batched IOTLB flush\n");
283                         intel_iommu_strict = 1;
284                 }
285
286                 str += strcspn(str, ",");
287                 while (*str == ',')
288                         str++;
289         }
290         return 0;
291 }
292 __setup("intel_iommu=", intel_iommu_setup);
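
/*
 * For example, the options parsed above are given on the kernel command line
 * as a comma-separated list:
 *
 *	intel_iommu=off
 *	intel_iommu=igfx_off,strict
 *
 * "off" disables the IOMMU, "igfx_off" skips graphics devices, "forcedac"
 * forces DAC addressing for PCI devices, and "strict" disables the batched
 * IOTLB flush.
 */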
293
294 static struct kmem_cache *iommu_domain_cache;
295 static struct kmem_cache *iommu_devinfo_cache;
296 static struct kmem_cache *iommu_iova_cache;
297
298 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
299 {
300         unsigned int flags;
301         void *vaddr;
302
303         /* trying to avoid low memory issues */
304         flags = current->flags & PF_MEMALLOC;
305         current->flags |= PF_MEMALLOC;
306         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
307         current->flags &= (~PF_MEMALLOC | flags);
308         return vaddr;
309 }
310
311
312 static inline void *alloc_pgtable_page(void)
313 {
314         unsigned int flags;
315         void *vaddr;
316
317         /* trying to avoid low memory issues */
318         flags = current->flags & PF_MEMALLOC;
319         current->flags |= PF_MEMALLOC;
320         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
321         current->flags &= (~PF_MEMALLOC | flags);
322         return vaddr;
323 }
324
325 static inline void free_pgtable_page(void *vaddr)
326 {
327         free_page((unsigned long)vaddr);
328 }
329
330 static inline void *alloc_domain_mem(void)
331 {
332         return iommu_kmem_cache_alloc(iommu_domain_cache);
333 }
334
335 static void free_domain_mem(void *vaddr)
336 {
337         kmem_cache_free(iommu_domain_cache, vaddr);
338 }
339
340 static inline void * alloc_devinfo_mem(void)
341 {
342         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
343 }
344
345 static inline void free_devinfo_mem(void *vaddr)
346 {
347         kmem_cache_free(iommu_devinfo_cache, vaddr);
348 }
349
350 struct iova *alloc_iova_mem(void)
351 {
352         return iommu_kmem_cache_alloc(iommu_iova_cache);
353 }
354
355 void free_iova_mem(struct iova *iova)
356 {
357         kmem_cache_free(iommu_iova_cache, iova);
358 }
359
360 /* Gets context entry for a given bus and devfn */
361 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
362                 u8 bus, u8 devfn)
363 {
364         struct root_entry *root;
365         struct context_entry *context;
366         unsigned long phy_addr;
367         unsigned long flags;
368
369         spin_lock_irqsave(&iommu->lock, flags);
370         root = &iommu->root_entry[bus];
371         context = get_context_addr_from_root(root);
372         if (!context) {
373                 context = (struct context_entry *)alloc_pgtable_page();
374                 if (!context) {
375                         spin_unlock_irqrestore(&iommu->lock, flags);
376                         return NULL;
377                 }
378                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
379                 phy_addr = virt_to_phys((void *)context);
380                 set_root_value(root, phy_addr);
381                 set_root_present(root);
382                 __iommu_flush_cache(iommu, root, sizeof(*root));
383         }
384         spin_unlock_irqrestore(&iommu->lock, flags);
385         return &context[devfn];
386 }
387
388 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
389 {
390         struct root_entry *root;
391         struct context_entry *context;
392         int ret;
393         unsigned long flags;
394
395         spin_lock_irqsave(&iommu->lock, flags);
396         root = &iommu->root_entry[bus];
397         context = get_context_addr_from_root(root);
398         if (!context) {
399                 ret = 0;
400                 goto out;
401         }
402         ret = context_present(&context[devfn]);
403 out:
404         spin_unlock_irqrestore(&iommu->lock, flags);
405         return ret;
406 }
407
408 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
409 {
410         struct root_entry *root;
411         struct context_entry *context;
412         unsigned long flags;
413
414         spin_lock_irqsave(&iommu->lock, flags);
415         root = &iommu->root_entry[bus];
416         context = get_context_addr_from_root(root);
417         if (context) {
418                 context_clear_entry(&context[devfn]);
419                 __iommu_flush_cache(iommu, &context[devfn], \
420                         sizeof(*context));
421         }
422         spin_unlock_irqrestore(&iommu->lock, flags);
423 }
424
425 static void free_context_table(struct intel_iommu *iommu)
426 {
427         struct root_entry *root;
428         int i;
429         unsigned long flags;
430         struct context_entry *context;
431
432         spin_lock_irqsave(&iommu->lock, flags);
433         if (!iommu->root_entry) {
434                 goto out;
435         }
436         for (i = 0; i < ROOT_ENTRY_NR; i++) {
437                 root = &iommu->root_entry[i];
438                 context = get_context_addr_from_root(root);
439                 if (context)
440                         free_pgtable_page(context);
441         }
442         free_pgtable_page(iommu->root_entry);
443         iommu->root_entry = NULL;
444 out:
445         spin_unlock_irqrestore(&iommu->lock, flags);
446 }
447
448 /* page table handling */
449 #define LEVEL_STRIDE            (9)
450 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
451
452 static inline int agaw_to_level(int agaw)
453 {
454         return agaw + 2;
455 }
456
457 static inline int agaw_to_width(int agaw)
458 {
459         return 30 + agaw * LEVEL_STRIDE;
460
461 }
462
463 static inline int width_to_agaw(int width)
464 {
465         return (width - 30) / LEVEL_STRIDE;
466 }
467
468 static inline unsigned int level_to_offset_bits(int level)
469 {
470         return (12 + (level - 1) * LEVEL_STRIDE);
471 }
472
473 static inline int address_level_offset(u64 addr, int level)
474 {
475         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
476 }
477
478 static inline u64 level_mask(int level)
479 {
480         return ((u64)-1 << level_to_offset_bits(level));
481 }
482
483 static inline u64 level_size(int level)
484 {
485         return ((u64)1 << level_to_offset_bits(level));
486 }
487
488 static inline u64 align_to_level(u64 addr, int level)
489 {
490         return ((addr + level_size(level) - 1) & level_mask(level));
491 }
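
/*
 * For example, with the default 48-bit address width, width_to_agaw(48) =
 * (48 - 30) / 9 = 2 and agaw_to_level(2) = 4, i.e. a four-level table.
 * level_to_offset_bits() gives 12/21/30/39 for levels 1-4, so each level
 * indexes 9 bits and level_size() covers 4KB, 2MB, 1GB and 512GB
 * respectively.
 */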
492
493 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
494 {
495         int addr_width = agaw_to_width(domain->agaw);
496         struct dma_pte *parent, *pte = NULL;
497         int level = agaw_to_level(domain->agaw);
498         int offset;
499         unsigned long flags;
500
501         BUG_ON(!domain->pgd);
502
503         addr &= (((u64)1) << addr_width) - 1;
504         parent = domain->pgd;
505
506         spin_lock_irqsave(&domain->mapping_lock, flags);
507         while (level > 0) {
508                 void *tmp_page;
509
510                 offset = address_level_offset(addr, level);
511                 pte = &parent[offset];
512                 if (level == 1)
513                         break;
514
515                 if (!dma_pte_present(pte)) {
516                         tmp_page = alloc_pgtable_page();
517
518                         if (!tmp_page) {
519                                 spin_unlock_irqrestore(&domain->mapping_lock,
520                                         flags);
521                                 return NULL;
522                         }
523                         __iommu_flush_cache(domain->iommu, tmp_page,
524                                         PAGE_SIZE);
525                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
526                         /*
527                          * high level table always sets r/w, last level page
528                          * table control read/write
529                          */
530                         dma_set_pte_readable(pte);
531                         dma_set_pte_writable(pte);
532                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
533                 }
534                 parent = phys_to_virt(dma_pte_addr(pte));
535                 level--;
536         }
537
538         spin_unlock_irqrestore(&domain->mapping_lock, flags);
539         return pte;
540 }
541
542 /* return address's pte at specific level */
543 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
544                 int level)
545 {
546         struct dma_pte *parent, *pte = NULL;
547         int total = agaw_to_level(domain->agaw);
548         int offset;
549
550         parent = domain->pgd;
551         while (level <= total) {
552                 offset = address_level_offset(addr, total);
553                 pte = &parent[offset];
554                 if (level == total)
555                         return pte;
556
557                 if (!dma_pte_present(pte))
558                         break;
559                 parent = phys_to_virt(dma_pte_addr(pte));
560                 total--;
561         }
562         return NULL;
563 }
564
565 /* clear the last-level pte that maps one page */
566 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
567 {
568         struct dma_pte *pte = NULL;
569
570         /* get last level pte */
571         pte = dma_addr_level_pte(domain, addr, 1);
572
573         if (pte) {
574                 dma_clear_pte(pte);
575                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
576         }
577 }
578
579 /* clear last level ptes; a tlb flush should follow */
580 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
581 {
582         int addr_width = agaw_to_width(domain->agaw);
583
584         start &= (((u64)1) << addr_width) - 1;
585         end &= (((u64)1) << addr_width) - 1;
586         /* in case it's a partial page */
587         start = PAGE_ALIGN(start);
588         end &= PAGE_MASK;
589
590         /* we don't need a lock here; nobody else touches this iova range */
591         while (start < end) {
592                 dma_pte_clear_one(domain, start);
593                 start += VTD_PAGE_SIZE;
594         }
595 }
596
597 /* free page table pages; the last-level ptes should already be cleared */
598 static void dma_pte_free_pagetable(struct dmar_domain *domain,
599         u64 start, u64 end)
600 {
601         int addr_width = agaw_to_width(domain->agaw);
602         struct dma_pte *pte;
603         int total = agaw_to_level(domain->agaw);
604         int level;
605         u64 tmp;
606
607         start &= (((u64)1) << addr_width) - 1;
608         end &= (((u64)1) << addr_width) - 1;
609
610         /* we don't need a lock here; nobody else touches this iova range */
611         level = 2;
612         while (level <= total) {
613                 tmp = align_to_level(start, level);
614                 if (tmp >= end || (tmp + level_size(level) > end))
615                         return;
616
617                 while (tmp < end) {
618                         pte = dma_addr_level_pte(domain, tmp, level);
619                         if (pte) {
620                                 free_pgtable_page(
621                                         phys_to_virt(dma_pte_addr(pte)));
622                                 dma_clear_pte(pte);
623                                 __iommu_flush_cache(domain->iommu,
624                                                 pte, sizeof(*pte));
625                         }
626                         tmp += level_size(level);
627                 }
628                 level++;
629         }
630         /* free pgd */
631         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
632                 free_pgtable_page(domain->pgd);
633                 domain->pgd = NULL;
634         }
635 }
636
637 /* iommu handling */
638 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
639 {
640         struct root_entry *root;
641         unsigned long flags;
642
643         root = (struct root_entry *)alloc_pgtable_page();
644         if (!root)
645                 return -ENOMEM;
646
647         __iommu_flush_cache(iommu, root, ROOT_SIZE);
648
649         spin_lock_irqsave(&iommu->lock, flags);
650         iommu->root_entry = root;
651         spin_unlock_irqrestore(&iommu->lock, flags);
652
653         return 0;
654 }
655
656 static void iommu_set_root_entry(struct intel_iommu *iommu)
657 {
658         void *addr;
659         u32 cmd, sts;
660         unsigned long flag;
661
662         addr = iommu->root_entry;
663
664         spin_lock_irqsave(&iommu->register_lock, flag);
665         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
666
667         cmd = iommu->gcmd | DMA_GCMD_SRTP;
668         writel(cmd, iommu->reg + DMAR_GCMD_REG);
669
670                 /* Make sure hardware completes it */
671         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
672                 readl, (sts & DMA_GSTS_RTPS), sts);
673
674         spin_unlock_irqrestore(&iommu->register_lock, flag);
675 }
676
677 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
678 {
679         u32 val;
680         unsigned long flag;
681
682         if (!cap_rwbf(iommu->cap))
683                 return;
684         val = iommu->gcmd | DMA_GCMD_WBF;
685
686         spin_lock_irqsave(&iommu->register_lock, flag);
687         writel(val, iommu->reg + DMAR_GCMD_REG);
688
689                 /* Make sure hardware completes it */
690         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
691                         readl, (!(val & DMA_GSTS_WBFS)), val);
692
693         spin_unlock_irqrestore(&iommu->register_lock, flag);
694 }
695
696 /* return value determines if we need a write buffer flush */
697 static int __iommu_flush_context(struct intel_iommu *iommu,
698         u16 did, u16 source_id, u8 function_mask, u64 type,
699         int non_present_entry_flush)
700 {
701         u64 val = 0;
702         unsigned long flag;
703
704         /*
705          * In the non-present entry flush case: if the hardware doesn't cache
706          * non-present entries we do nothing; if it does cache them, we flush
707          * the entries of domain 0 (that domain id is used to cache any
708          * non-present entries).
709          */
710         if (non_present_entry_flush) {
711                 if (!cap_caching_mode(iommu->cap))
712                         return 1;
713                 else
714                         did = 0;
715         }
716
717         switch (type) {
718         case DMA_CCMD_GLOBAL_INVL:
719                 val = DMA_CCMD_GLOBAL_INVL;
720                 break;
721         case DMA_CCMD_DOMAIN_INVL:
722                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
723                 break;
724         case DMA_CCMD_DEVICE_INVL:
725                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
726                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
727                 break;
728         default:
729                 BUG();
730         }
731         val |= DMA_CCMD_ICC;
732
733         spin_lock_irqsave(&iommu->register_lock, flag);
734         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
735
736         /* Make sure hardware completes it */
737         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
738                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
739
740         spin_unlock_irqrestore(&iommu->register_lock, flag);
741
742         /* flushing a context entry implicitly flushes the write buffer */
743         return 0;
744 }
745
746 /* return value determines if we need a write buffer flush */
747 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
748         u64 addr, unsigned int size_order, u64 type,
749         int non_present_entry_flush)
750 {
751         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
752         u64 val = 0, val_iva = 0;
753         unsigned long flag;
754
755         /*
756          * In the non-present entry flush case: if the hardware doesn't cache
757          * non-present entries we do nothing; if it does cache them, we flush
758          * the entries of domain 0 (that domain id is used to cache any
759          * non-present entries).
760          */
761         if (non_present_entry_flush) {
762                 if (!cap_caching_mode(iommu->cap))
763                         return 1;
764                 else
765                         did = 0;
766         }
767
768         switch (type) {
769         case DMA_TLB_GLOBAL_FLUSH:
770                 /* a global flush doesn't need to set IVA_REG */
771                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
772                 break;
773         case DMA_TLB_DSI_FLUSH:
774                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
775                 break;
776         case DMA_TLB_PSI_FLUSH:
777                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
778                 /* Note: always flush non-leaf currently */
779                 val_iva = size_order | addr;
780                 break;
781         default:
782                 BUG();
783         }
784         /* Note: set drain read/write */
785 #if 0
786         /*
787          * This is probably just to be extra safe.  Looks like we can
788          * ignore it without any impact.
789          */
790         if (cap_read_drain(iommu->cap))
791                 val |= DMA_TLB_READ_DRAIN;
792 #endif
793         if (cap_write_drain(iommu->cap))
794                 val |= DMA_TLB_WRITE_DRAIN;
795
796         spin_lock_irqsave(&iommu->register_lock, flag);
797         /* Note: Only uses first TLB reg currently */
798         if (val_iva)
799                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
800         dmar_writeq(iommu->reg + tlb_offset + 8, val);
801
802         /* Make sure hardware completes it */
803         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
804                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
805
806         spin_unlock_irqrestore(&iommu->register_lock, flag);
807
808         /* check IOTLB invalidation granularity */
809         if (DMA_TLB_IAIG(val) == 0)
810                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
811         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
812                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
813                         (unsigned long long)DMA_TLB_IIRG(type),
814                         (unsigned long long)DMA_TLB_IAIG(val));
815         /* flushing an iotlb entry implicitly flushes the write buffer */
816         return 0;
817 }
818
819 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
820         u64 addr, unsigned int pages, int non_present_entry_flush)
821 {
822         unsigned int mask;
823
824         BUG_ON(addr & (~VTD_PAGE_MASK));
825         BUG_ON(pages == 0);
826
827         /* Fallback to domain selective flush if no PSI support */
828         if (!cap_pgsel_inv(iommu->cap))
829                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
830                                                 DMA_TLB_DSI_FLUSH,
831                                                 non_present_entry_flush);
832
833         /*
834          * PSI requires the region size to be 2 ^ x pages, with the base address
835          * naturally aligned to that size
836          */
837         mask = ilog2(__roundup_pow_of_two(pages));
838         /* Fallback to domain selective flush if size is too big */
839         if (mask > cap_max_amask_val(iommu->cap))
840                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
841                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
842
843         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
844                                         DMA_TLB_PSI_FLUSH,
845                                         non_present_entry_flush);
846 }
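
/*
 * For example, for pages = 3, __roundup_pow_of_two(3) = 4 and ilog2(4) = 2,
 * so the PSI request above invalidates 2^2 = 4 pages starting at addr; if
 * that order exceeded cap_max_amask_val(), the code falls back to a
 * domain-selective flush instead.
 */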
847
848 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
849 {
850         u32 pmen;
851         unsigned long flags;
852
853         spin_lock_irqsave(&iommu->register_lock, flags);
854         pmen = readl(iommu->reg + DMAR_PMEN_REG);
855         pmen &= ~DMA_PMEN_EPM;
856         writel(pmen, iommu->reg + DMAR_PMEN_REG);
857
858         /* wait for the protected region status bit to clear */
859         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
860                 readl, !(pmen & DMA_PMEN_PRS), pmen);
861
862         spin_unlock_irqrestore(&iommu->register_lock, flags);
863 }
864
865 static int iommu_enable_translation(struct intel_iommu *iommu)
866 {
867         u32 sts;
868         unsigned long flags;
869
870         spin_lock_irqsave(&iommu->register_lock, flags);
871         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
872
873         /* Make sure hardware completes it */
874         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
875                 readl, (sts & DMA_GSTS_TES), sts);
876
877         iommu->gcmd |= DMA_GCMD_TE;
878         spin_unlock_irqrestore(&iommu->register_lock, flags);
879         return 0;
880 }
881
882 static int iommu_disable_translation(struct intel_iommu *iommu)
883 {
884         u32 sts;
885         unsigned long flag;
886
887         spin_lock_irqsave(&iommu->register_lock, flag);
888         iommu->gcmd &= ~DMA_GCMD_TE;
889         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
890
891         /* Make sure hardware completes it */
892         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
893                 readl, (!(sts & DMA_GSTS_TES)), sts);
894
895         spin_unlock_irqrestore(&iommu->register_lock, flag);
896         return 0;
897 }
898
899 /* iommu interrupt handling. Most of it is MSI-like. */
900
901 static const char *fault_reason_strings[] =
902 {
903         "Software",
904         "Present bit in root entry is clear",
905         "Present bit in context entry is clear",
906         "Invalid context entry",
907         "Access beyond MGAW",
908         "PTE Write access is not set",
909         "PTE Read access is not set",
910         "Next page table ptr is invalid",
911         "Root table address invalid",
912         "Context table ptr is invalid",
913         "non-zero reserved fields in RTP",
914         "non-zero reserved fields in CTP",
915         "non-zero reserved fields in PTE",
916 };
917 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
918
919 const char *dmar_get_fault_reason(u8 fault_reason)
920 {
921         if (fault_reason > MAX_FAULT_REASON_IDX)
922                 return "Unknown";
923         else
924                 return fault_reason_strings[fault_reason];
925 }
926
927 void dmar_msi_unmask(unsigned int irq)
928 {
929         struct intel_iommu *iommu = get_irq_data(irq);
930         unsigned long flag;
931
932         /* unmask it */
933         spin_lock_irqsave(&iommu->register_lock, flag);
934         writel(0, iommu->reg + DMAR_FECTL_REG);
935         /* Read a reg to force-flush the posted write */
936         readl(iommu->reg + DMAR_FECTL_REG);
937         spin_unlock_irqrestore(&iommu->register_lock, flag);
938 }
939
940 void dmar_msi_mask(unsigned int irq)
941 {
942         unsigned long flag;
943         struct intel_iommu *iommu = get_irq_data(irq);
944
945         /* mask it */
946         spin_lock_irqsave(&iommu->register_lock, flag);
947         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
948         /* Read a reg to force-flush the posted write */
949         readl(iommu->reg + DMAR_FECTL_REG);
950         spin_unlock_irqrestore(&iommu->register_lock, flag);
951 }
952
953 void dmar_msi_write(int irq, struct msi_msg *msg)
954 {
955         struct intel_iommu *iommu = get_irq_data(irq);
956         unsigned long flag;
957
958         spin_lock_irqsave(&iommu->register_lock, flag);
959         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
960         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
961         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
962         spin_unlock_irqrestore(&iommu->register_lock, flag);
963 }
964
965 void dmar_msi_read(int irq, struct msi_msg *msg)
966 {
967         struct intel_iommu *iommu = get_irq_data(irq);
968         unsigned long flag;
969
970         spin_lock_irqsave(&iommu->register_lock, flag);
971         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
972         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
973         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
974         spin_unlock_irqrestore(&iommu->register_lock, flag);
975 }
976
977 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
978                 u8 fault_reason, u16 source_id, unsigned long long addr)
979 {
980         const char *reason;
981
982         reason = dmar_get_fault_reason(fault_reason);
983
984         printk(KERN_ERR
985                 "DMAR:[%s] Request device [%02x:%02x.%d] "
986                 "fault addr %llx \n"
987                 "DMAR:[fault reason %02d] %s\n",
988                 (type ? "DMA Read" : "DMA Write"),
989                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
990                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
991         return 0;
992 }
993
994 #define PRIMARY_FAULT_REG_LEN (16)
995 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
996 {
997         struct intel_iommu *iommu = dev_id;
998         int reg, fault_index;
999         u32 fault_status;
1000         unsigned long flag;
1001
1002         spin_lock_irqsave(&iommu->register_lock, flag);
1003         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1004
1005         /* TBD: ignore advanced fault log currently */
1006         if (!(fault_status & DMA_FSTS_PPF))
1007                 goto clear_overflow;
1008
1009         fault_index = dma_fsts_fault_record_index(fault_status);
1010         reg = cap_fault_reg_offset(iommu->cap);
1011         while (1) {
1012                 u8 fault_reason;
1013                 u16 source_id;
1014                 u64 guest_addr;
1015                 int type;
1016                 u32 data;
1017
1018                 /* highest 32 bits */
1019                 data = readl(iommu->reg + reg +
1020                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1021                 if (!(data & DMA_FRCD_F))
1022                         break;
1023
1024                 fault_reason = dma_frcd_fault_reason(data);
1025                 type = dma_frcd_type(data);
1026
1027                 data = readl(iommu->reg + reg +
1028                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1029                 source_id = dma_frcd_source_id(data);
1030
1031                 guest_addr = dmar_readq(iommu->reg + reg +
1032                                 fault_index * PRIMARY_FAULT_REG_LEN);
1033                 guest_addr = dma_frcd_page_addr(guest_addr);
1034                 /* clear the fault */
1035                 writel(DMA_FRCD_F, iommu->reg + reg +
1036                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1037
1038                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1039
1040                 iommu_page_fault_do_one(iommu, type, fault_reason,
1041                                 source_id, guest_addr);
1042
1043                 fault_index++;
1044                 if (fault_index > cap_num_fault_regs(iommu->cap))
1045                         fault_index = 0;
1046                 spin_lock_irqsave(&iommu->register_lock, flag);
1047         }
1048 clear_overflow:
1049         /* clear primary fault overflow */
1050         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1051         if (fault_status & DMA_FSTS_PFO)
1052                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1053
1054         spin_unlock_irqrestore(&iommu->register_lock, flag);
1055         return IRQ_HANDLED;
1056 }
1057
1058 int dmar_set_interrupt(struct intel_iommu *iommu)
1059 {
1060         int irq, ret;
1061
1062         irq = create_irq();
1063         if (!irq) {
1064                 printk(KERN_ERR "IOMMU: no free vectors\n");
1065                 return -EINVAL;
1066         }
1067
1068         set_irq_data(irq, iommu);
1069         iommu->irq = irq;
1070
1071         ret = arch_setup_dmar_msi(irq);
1072         if (ret) {
1073                 set_irq_data(irq, NULL);
1074                 iommu->irq = 0;
1075                 destroy_irq(irq);
1076                 return 0;
1077         }
1078
1079         /* Make sure the fault register is cleared */
1080         iommu_page_fault(irq, iommu);
1081
1082         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1083         if (ret)
1084                 printk(KERN_ERR "IOMMU: can't request irq\n");
1085         return ret;
1086 }
1087
1088 static int iommu_init_domains(struct intel_iommu *iommu)
1089 {
1090         unsigned long ndomains;
1091         unsigned long nlongs;
1092
1093         ndomains = cap_ndoms(iommu->cap);
1094         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1095         nlongs = BITS_TO_LONGS(ndomains);
1096
1097         /* TBD: there might be 64K domains,
1098          * consider a different allocation scheme for future chips
1099          */
1100         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1101         if (!iommu->domain_ids) {
1102                 printk(KERN_ERR "Allocating domain id array failed\n");
1103                 return -ENOMEM;
1104         }
1105         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1106                         GFP_KERNEL);
1107         if (!iommu->domains) {
1108                 printk(KERN_ERR "Allocating domain array failed\n");
1109                 kfree(iommu->domain_ids);
1110                 return -ENOMEM;
1111         }
1112
1113         spin_lock_init(&iommu->lock);
1114
1115         /*
1116          * if Caching mode is set, then invalid translations are tagged
1117          * with domainid 0. Hence we need to pre-allocate it.
1118          */
1119         if (cap_caching_mode(iommu->cap))
1120                 set_bit(0, iommu->domain_ids);
1121         return 0;
1122 }
1123
1124
1125 static void domain_exit(struct dmar_domain *domain);
1126
1127 void free_dmar_iommu(struct intel_iommu *iommu)
1128 {
1129         struct dmar_domain *domain;
1130         int i;
1131
1132         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1133         for (; i < cap_ndoms(iommu->cap); ) {
1134                 domain = iommu->domains[i];
1135                 clear_bit(i, iommu->domain_ids);
1136                 domain_exit(domain);
1137                 i = find_next_bit(iommu->domain_ids,
1138                         cap_ndoms(iommu->cap), i+1);
1139         }
1140
1141         if (iommu->gcmd & DMA_GCMD_TE)
1142                 iommu_disable_translation(iommu);
1143
1144         if (iommu->irq) {
1145                 set_irq_data(iommu->irq, NULL);
1146                 /* This will mask the irq */
1147                 free_irq(iommu->irq, iommu);
1148                 destroy_irq(iommu->irq);
1149         }
1150
1151         kfree(iommu->domains);
1152         kfree(iommu->domain_ids);
1153
1154         /* free context mapping */
1155         free_context_table(iommu);
1156 }
1157
1158 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1159 {
1160         unsigned long num;
1161         unsigned long ndomains;
1162         struct dmar_domain *domain;
1163         unsigned long flags;
1164
1165         domain = alloc_domain_mem();
1166         if (!domain)
1167                 return NULL;
1168
1169         ndomains = cap_ndoms(iommu->cap);
1170
1171         spin_lock_irqsave(&iommu->lock, flags);
1172         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1173         if (num >= ndomains) {
1174                 spin_unlock_irqrestore(&iommu->lock, flags);
1175                 free_domain_mem(domain);
1176                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1177                 return NULL;
1178         }
1179
1180         set_bit(num, iommu->domain_ids);
1181         domain->id = num;
1182         domain->iommu = iommu;
1183         iommu->domains[num] = domain;
1184         spin_unlock_irqrestore(&iommu->lock, flags);
1185
1186         return domain;
1187 }
1188
1189 static void iommu_free_domain(struct dmar_domain *domain)
1190 {
1191         unsigned long flags;
1192
1193         spin_lock_irqsave(&domain->iommu->lock, flags);
1194         clear_bit(domain->id, domain->iommu->domain_ids);
1195         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1196 }
1197
1198 static struct iova_domain reserved_iova_list;
1199 static struct lock_class_key reserved_alloc_key;
1200 static struct lock_class_key reserved_rbtree_key;
1201
1202 static void dmar_init_reserved_ranges(void)
1203 {
1204         struct pci_dev *pdev = NULL;
1205         struct iova *iova;
1206         int i;
1207         u64 addr, size;
1208
1209         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1210
1211         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1212                 &reserved_alloc_key);
1213         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1214                 &reserved_rbtree_key);
1215
1216         /* IOAPIC ranges shouldn't be accessed by DMA */
1217         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1218                 IOVA_PFN(IOAPIC_RANGE_END));
1219         if (!iova)
1220                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1221
1222         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1223         for_each_pci_dev(pdev) {
1224                 struct resource *r;
1225
1226                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1227                         r = &pdev->resource[i];
1228                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1229                                 continue;
1230                         addr = r->start;
1231                         addr &= PAGE_MASK;
1232                         size = r->end - addr;
1233                         size = PAGE_ALIGN(size);
1234                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1235                                 IOVA_PFN(size + addr) - 1);
1236                         if (!iova)
1237                                 printk(KERN_ERR "Reserve iova failed\n");
1238                 }
1239         }
1240
1241 }
1242
1243 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1244 {
1245         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1246 }
1247
1248 static inline int guestwidth_to_adjustwidth(int gaw)
1249 {
1250         int agaw;
1251         int r = (gaw - 12) % 9;
1252
1253         if (r == 0)
1254                 agaw = gaw;
1255         else
1256                 agaw = gaw + 9 - r;
1257         if (agaw > 64)
1258                 agaw = 64;
1259         return agaw;
1260 }
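
/*
 * For example, the adjusted width must leave (width - 12) as a whole number
 * of 9-bit levels.  For gaw = 48, (48 - 12) % 9 == 0 and the width stays 48;
 * for gaw = 35 the remainder is 5, so the width is rounded up to
 * 35 + 9 - 5 = 39.  width_to_agaw() then maps 39/48 to agaw 1/2.
 */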
1261
1262 static int domain_init(struct dmar_domain *domain, int guest_width)
1263 {
1264         struct intel_iommu *iommu;
1265         int adjust_width, agaw;
1266         unsigned long sagaw;
1267
1268         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1269         spin_lock_init(&domain->mapping_lock);
1270
1271         domain_reserve_special_ranges(domain);
1272
1273         /* calculate AGAW */
1274         iommu = domain->iommu;
1275         if (guest_width > cap_mgaw(iommu->cap))
1276                 guest_width = cap_mgaw(iommu->cap);
1277         domain->gaw = guest_width;
1278         adjust_width = guestwidth_to_adjustwidth(guest_width);
1279         agaw = width_to_agaw(adjust_width);
1280         sagaw = cap_sagaw(iommu->cap);
1281         if (!test_bit(agaw, &sagaw)) {
1282                 /* hardware doesn't support it, choose a bigger one */
1283                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1284                 agaw = find_next_bit(&sagaw, 5, agaw);
1285                 if (agaw >= 5)
1286                         return -ENODEV;
1287         }
1288         domain->agaw = agaw;
1289         INIT_LIST_HEAD(&domain->devices);
1290
1291         /* always allocate the top pgd */
1292         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1293         if (!domain->pgd)
1294                 return -ENOMEM;
1295         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1296         return 0;
1297 }
1298
1299 static void domain_exit(struct dmar_domain *domain)
1300 {
1301         u64 end;
1302
1303         /* Domain 0 is reserved, so don't process it */
1304         if (!domain)
1305                 return;
1306
1307         domain_remove_dev_info(domain);
1308         /* destroy iovas */
1309         put_iova_domain(&domain->iovad);
1310         end = DOMAIN_MAX_ADDR(domain->gaw);
1311         end = end & (~PAGE_MASK);
1312
1313         /* clear ptes */
1314         dma_pte_clear_range(domain, 0, end);
1315
1316         /* free page tables */
1317         dma_pte_free_pagetable(domain, 0, end);
1318
1319         iommu_free_domain(domain);
1320         free_domain_mem(domain);
1321 }
1322
1323 static int domain_context_mapping_one(struct dmar_domain *domain,
1324                 u8 bus, u8 devfn)
1325 {
1326         struct context_entry *context;
1327         struct intel_iommu *iommu = domain->iommu;
1328         unsigned long flags;
1329
1330         pr_debug("Set context mapping for %02x:%02x.%d\n",
1331                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1332         BUG_ON(!domain->pgd);
1333         context = device_to_context_entry(iommu, bus, devfn);
1334         if (!context)
1335                 return -ENOMEM;
1336         spin_lock_irqsave(&iommu->lock, flags);
1337         if (context_present(context)) {
1338                 spin_unlock_irqrestore(&iommu->lock, flags);
1339                 return 0;
1340         }
1341
1342         context_set_domain_id(context, domain->id);
1343         context_set_address_width(context, domain->agaw);
1344         context_set_address_root(context, virt_to_phys(domain->pgd));
1345         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1346         context_set_fault_enable(context);
1347         context_set_present(context);
1348         __iommu_flush_cache(iommu, context, sizeof(*context));
1349
1350         /* it's a non-present to present mapping */
1351         if (iommu->flush.flush_context(iommu, domain->id,
1352                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1353                 DMA_CCMD_DEVICE_INVL, 1))
1354                 iommu_flush_write_buffer(iommu);
1355         else
1356                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1357
1358         spin_unlock_irqrestore(&iommu->lock, flags);
1359         return 0;
1360 }
1361
1362 static int
1363 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1364 {
1365         int ret;
1366         struct pci_dev *tmp, *parent;
1367
1368         ret = domain_context_mapping_one(domain, pdev->bus->number,
1369                 pdev->devfn);
1370         if (ret)
1371                 return ret;
1372
1373         /* dependent device mapping */
1374         tmp = pci_find_upstream_pcie_bridge(pdev);
1375         if (!tmp)
1376                 return 0;
1377         /* Secondary interface's bus number and devfn 0 */
1378         parent = pdev->bus->self;
1379         while (parent != tmp) {
1380                 ret = domain_context_mapping_one(domain, parent->bus->number,
1381                         parent->devfn);
1382                 if (ret)
1383                         return ret;
1384                 parent = parent->bus->self;
1385         }
1386         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1387                 return domain_context_mapping_one(domain,
1388                         tmp->subordinate->number, 0);
1389         else /* this is a legacy PCI bridge */
1390                 return domain_context_mapping_one(domain,
1391                         tmp->bus->number, tmp->devfn);
1392 }
1393
1394 static int domain_context_mapped(struct dmar_domain *domain,
1395         struct pci_dev *pdev)
1396 {
1397         int ret;
1398         struct pci_dev *tmp, *parent;
1399
1400         ret = device_context_mapped(domain->iommu,
1401                 pdev->bus->number, pdev->devfn);
1402         if (!ret)
1403                 return ret;
1404         /* dependent device mapping */
1405         tmp = pci_find_upstream_pcie_bridge(pdev);
1406         if (!tmp)
1407                 return ret;
1408         /* Secondary interface's bus number and devfn 0 */
1409         parent = pdev->bus->self;
1410         while (parent != tmp) {
1411                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1412                         parent->devfn);
1413                 if (!ret)
1414                         return ret;
1415                 parent = parent->bus->self;
1416         }
1417         if (tmp->is_pcie)
1418                 return device_context_mapped(domain->iommu,
1419                         tmp->subordinate->number, 0);
1420         else
1421                 return device_context_mapped(domain->iommu,
1422                         tmp->bus->number, tmp->devfn);
1423 }
1424
1425 static int
1426 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1427                         u64 hpa, size_t size, int prot)
1428 {
1429         u64 start_pfn, end_pfn;
1430         struct dma_pte *pte;
1431         int index;
1432         int addr_width = agaw_to_width(domain->agaw);
1433
1434         hpa &= (((u64)1) << addr_width) - 1;
1435
1436         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1437                 return -EINVAL;
1438         iova &= PAGE_MASK;
1439         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1440         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1441         index = 0;
1442         while (start_pfn < end_pfn) {
1443                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1444                 if (!pte)
1445                         return -ENOMEM;
1446                 /* We don't need a lock here; nobody else
1447                  * touches this iova range
1448                  */
1449                 BUG_ON(dma_pte_addr(pte));
1450                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1451                 dma_set_pte_prot(pte, prot);
1452                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1453                 start_pfn++;
1454                 index++;
1455         }
1456         return 0;
1457 }
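
/*
 * For example (assuming the usual 4KB VT-d page size), mapping hpa = 0x5000
 * with size = 0x1800 gives start_pfn = 5 and
 * end_pfn = VTD_PAGE_ALIGN(0x6800) >> 12 = 7, so the loop above writes two
 * leaf PTEs, one for each 4KB page covering the range.
 */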
1458
1459 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1460 {
1461         clear_context_table(domain->iommu, bus, devfn);
1462         domain->iommu->flush.flush_context(domain->iommu, 0, 0, 0,
1463                                            DMA_CCMD_GLOBAL_INVL, 0);
1464         domain->iommu->flush.flush_iotlb(domain->iommu, 0, 0, 0,
1465                                          DMA_TLB_GLOBAL_FLUSH, 0);
1466 }
1467
1468 static void domain_remove_dev_info(struct dmar_domain *domain)
1469 {
1470         struct device_domain_info *info;
1471         unsigned long flags;
1472
1473         spin_lock_irqsave(&device_domain_lock, flags);
1474         while (!list_empty(&domain->devices)) {
1475                 info = list_entry(domain->devices.next,
1476                         struct device_domain_info, link);
1477                 list_del(&info->link);
1478                 list_del(&info->global);
1479                 if (info->dev)
1480                         info->dev->dev.archdata.iommu = NULL;
1481                 spin_unlock_irqrestore(&device_domain_lock, flags);
1482
1483                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1484                 free_devinfo_mem(info);
1485
1486                 spin_lock_irqsave(&device_domain_lock, flags);
1487         }
1488         spin_unlock_irqrestore(&device_domain_lock, flags);
1489 }
1490
1491 /*
1492  * find_domain
1493  * Note: struct pci_dev->dev.archdata.iommu stores the info
1494  */
1495 static struct dmar_domain *
1496 find_domain(struct pci_dev *pdev)
1497 {
1498         struct device_domain_info *info;
1499
1500         /* No lock here, assumes no domain exit in normal case */
1501         info = pdev->dev.archdata.iommu;
1502         if (info)
1503                 return info->domain;
1504         return NULL;
1505 }
1506
1507 /* domain is initialized */
1508 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1509 {
1510         struct dmar_domain *domain, *found = NULL;
1511         struct intel_iommu *iommu;
1512         struct dmar_drhd_unit *drhd;
1513         struct device_domain_info *info, *tmp;
1514         struct pci_dev *dev_tmp;
1515         unsigned long flags;
1516         int bus = 0, devfn = 0;
1517
1518         domain = find_domain(pdev);
1519         if (domain)
1520                 return domain;
1521
1522         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1523         if (dev_tmp) {
1524                 if (dev_tmp->is_pcie) {
1525                         bus = dev_tmp->subordinate->number;
1526                         devfn = 0;
1527                 } else {
1528                         bus = dev_tmp->bus->number;
1529                         devfn = dev_tmp->devfn;
1530                 }
1531                 spin_lock_irqsave(&device_domain_lock, flags);
1532                 list_for_each_entry(info, &device_domain_list, global) {
1533                         if (info->bus == bus && info->devfn == devfn) {
1534                                 found = info->domain;
1535                                 break;
1536                         }
1537                 }
1538                 spin_unlock_irqrestore(&device_domain_lock, flags);
1539                 /* pcie-pci bridge already has a domain, use it */
1540                 if (found) {
1541                         domain = found;
1542                         goto found_domain;
1543                 }
1544         }
1545
1546         /* Allocate new domain for the device */
1547         drhd = dmar_find_matched_drhd_unit(pdev);
1548         if (!drhd) {
1549                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1550                         pci_name(pdev));
1551                 return NULL;
1552         }
1553         iommu = drhd->iommu;
1554
1555         domain = iommu_alloc_domain(iommu);
1556         if (!domain)
1557                 goto error;
1558
1559         if (domain_init(domain, gaw)) {
1560                 domain_exit(domain);
1561                 goto error;
1562         }
1563
1564         /* register pcie-to-pci device */
1565         if (dev_tmp) {
1566                 info = alloc_devinfo_mem();
1567                 if (!info) {
1568                         domain_exit(domain);
1569                         goto error;
1570                 }
1571                 info->bus = bus;
1572                 info->devfn = devfn;
1573                 info->dev = NULL;
1574                 info->domain = domain;
1575                 /* This domain is shared by devices under p2p bridge */
1576                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1577
1578                 /* pcie-to-pci bridge already has a domain, use it */
1579                 found = NULL;
1580                 spin_lock_irqsave(&device_domain_lock, flags);
1581                 list_for_each_entry(tmp, &device_domain_list, global) {
1582                         if (tmp->bus == bus && tmp->devfn == devfn) {
1583                                 found = tmp->domain;
1584                                 break;
1585                         }
1586                 }
1587                 if (found) {
1588                         free_devinfo_mem(info);
1589                         domain_exit(domain);
1590                         domain = found;
1591                 } else {
1592                         list_add(&info->link, &domain->devices);
1593                         list_add(&info->global, &device_domain_list);
1594                 }
1595                 spin_unlock_irqrestore(&device_domain_lock, flags);
1596         }
1597
1598 found_domain:
1599         info = alloc_devinfo_mem();
1600         if (!info)
1601                 goto error;
1602         info->bus = pdev->bus->number;
1603         info->devfn = pdev->devfn;
1604         info->dev = pdev;
1605         info->domain = domain;
1606         spin_lock_irqsave(&device_domain_lock, flags);
1607         /* somebody else was faster and already set up the domain */
1608         found = find_domain(pdev);
1609         if (found != NULL) {
1610                 spin_unlock_irqrestore(&device_domain_lock, flags);
1611                 if (found != domain) {
1612                         domain_exit(domain);
1613                         domain = found;
1614                 }
1615                 free_devinfo_mem(info);
1616                 return domain;
1617         }
1618         list_add(&info->link, &domain->devices);
1619         list_add(&info->global, &device_domain_list);
1620         pdev->dev.archdata.iommu = info;
1621         spin_unlock_irqrestore(&device_domain_lock, flags);
1622         return domain;
1623 error:
1624         /* recheck here; another thread may have set it up meanwhile */
1625         return find_domain(pdev);
1626 }
1627
1628 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1629                                       unsigned long long start,
1630                                       unsigned long long end)
1631 {
1632         struct dmar_domain *domain;
1633         unsigned long size;
1634         unsigned long long base;
1635         int ret;
1636
1637         printk(KERN_INFO
1638                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1639                 pci_name(pdev), start, end);
1640         /* page table init */
1641         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1642         if (!domain)
1643                 return -ENOMEM;
1644
1645         /* The address might not be aligned */
1646         base = start & PAGE_MASK;
1647         size = end - base;
1648         size = PAGE_ALIGN(size);
1649         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1650                         IOVA_PFN(base + size) - 1)) {
1651                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1652                 ret = -ENOMEM;
1653                 goto error;
1654         }
1655
1656         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1657                 size, base, pci_name(pdev));
1658         /*
1659          * RMRR range might have overlap with physical memory range,
1660          * clear it first
1661          */
1662         dma_pte_clear_range(domain, base, base + size);
1663
1664         ret = domain_page_mapping(domain, base, base, size,
1665                 DMA_PTE_READ|DMA_PTE_WRITE);
1666         if (ret)
1667                 goto error;
1668
1669         /* context entry init */
1670         ret = domain_context_mapping(domain, pdev);
1671         if (!ret)
1672                 return 0;
1673 error:
1674         domain_exit(domain);
1675         return ret;
1676
1677 }
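
/*
 * Worked example (illustrative numbers, assuming 4KiB pages -- not taken
 * from a real DMAR table): for an RMRR reported as [0xfe001234, 0xfe002fff],
 * iommu_prepare_rmrr_dev() below passes end = 0xfe003000, so
 *
 *	base = 0xfe001234 & PAGE_MASK          = 0xfe001000
 *	size = PAGE_ALIGN(0xfe003000 - base)   = 0x2000
 *
 * i.e. the two pages covering the reported range get a 1:1 mapping.
 */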
1678
1679 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1680         struct pci_dev *pdev)
1681 {
1682         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1683                 return 0;
1684         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1685                 rmrr->end_address + 1);
1686 }
1687
1688 #ifdef CONFIG_DMAR_GFX_WA
1689 struct iommu_prepare_data {
1690         struct pci_dev *pdev;
1691         int ret;
1692 };
1693
1694 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1695                                          unsigned long end_pfn, void *datax)
1696 {
1697         struct iommu_prepare_data *data;
1698
1699         data = (struct iommu_prepare_data *)datax;
1700
1701         data->ret = iommu_prepare_identity_map(data->pdev,
1702                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1703         return data->ret;
1704
1705 }
1706
1707 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1708 {
1709         int nid;
1710         struct iommu_prepare_data data;
1711
1712         data.pdev = pdev;
1713         data.ret = 0;
1714
1715         for_each_online_node(nid) {
1716                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1717                 if (data.ret)
1718                         return data.ret;
1719         }
1720         return data.ret;
1721 }
1722
1723 static void __init iommu_prepare_gfx_mapping(void)
1724 {
1725         struct pci_dev *pdev = NULL;
1726         int ret;
1727
1728         for_each_pci_dev(pdev) {
1729                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1730                                 !IS_GFX_DEVICE(pdev))
1731                         continue;
1732                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1733                         pci_name(pdev));
1734                 ret = iommu_prepare_with_active_regions(pdev);
1735                 if (ret)
1736                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1737         }
1738 }
1739 #else /* !CONFIG_DMAR_GFX_WA */
1740 static inline void iommu_prepare_gfx_mapping(void)
1741 {
1742         return;
1743 }
1744 #endif
1745
1746 #ifdef CONFIG_DMAR_FLOPPY_WA
1747 static inline void iommu_prepare_isa(void)
1748 {
1749         struct pci_dev *pdev;
1750         int ret;
1751
1752         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1753         if (!pdev)
1754                 return;
1755
1756         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1757         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1758
1759         if (ret)
1760                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map; "
1761                         "floppy might not work\n");
1762
1763 }
1764 #else
1765 static inline void iommu_prepare_isa(void)
1766 {
1767         return;
1768 }
1769 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1770
1771 static int __init init_dmars(void)
1772 {
1773         struct dmar_drhd_unit *drhd;
1774         struct dmar_rmrr_unit *rmrr;
1775         struct pci_dev *pdev;
1776         struct intel_iommu *iommu;
1777         int i, ret, unit = 0;
1778
1779         /*
1780          * for each drhd
1781          *    allocate root
1782          *    initialize and program root entry to not present
1783          * endfor
1784          */
1785         for_each_drhd_unit(drhd) {
1786                 g_num_of_iommus++;
1787                 /*
1788                  * lock not needed as this is only incremented in the single
1789                  * threaded kernel __init code path all other access are read
1790                  * only
1791                  */
1792         }
1793
1794         deferred_flush = kzalloc(g_num_of_iommus *
1795                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1796         if (!deferred_flush) {
1797                 ret = -ENOMEM;
1798                 goto error;
1799         }
1800
1801         for_each_drhd_unit(drhd) {
1802                 if (drhd->ignored)
1803                         continue;
1804
1805                 iommu = drhd->iommu;
1806
1807                 ret = iommu_init_domains(iommu);
1808                 if (ret)
1809                         goto error;
1810
1811                 /*
1812                  * TBD:
1813                  * we could share the same root & context tables
1814                  * among all IOMMUs; need to split it later.
1815                  */
1816                 ret = iommu_alloc_root_entry(iommu);
1817                 if (ret) {
1818                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1819                         goto error;
1820                 }
1821         }
1822
1823         for_each_drhd_unit(drhd) {
1824                 if (drhd->ignored)
1825                         continue;
1826
1827                 iommu = drhd->iommu;
1828                 if (dmar_enable_qi(iommu)) {
1829                         /*
1830                          * Queued Invalidate not enabled, use Register Based
1831                          * Invalidate
1832                          */
1833                         iommu->flush.flush_context = __iommu_flush_context;
1834                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1835                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1836                                "invalidation\n",
1837                                (unsigned long long)drhd->reg_base_addr);
1838                 } else {
1839                         iommu->flush.flush_context = qi_flush_context;
1840                         iommu->flush.flush_iotlb = qi_flush_iotlb;
1841                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1842                                "invalidation\n",
1843                                (unsigned long long)drhd->reg_base_addr);
1844                 }
1845         }
1846
1847         /*
1848          * For each rmrr
1849          *   for each dev attached to rmrr
1850          *   do
1851          *     locate drhd for dev, alloc domain for dev
1852          *     allocate free domain
1853          *     allocate page table entries for rmrr
1854          *     if context not allocated for bus
1855          *           allocate and init context
1856          *           set present in root table for this bus
1857          *     init context with domain, translation etc
1858          *    endfor
1859          * endfor
1860          */
1861         for_each_rmrr_units(rmrr) {
1862                 for (i = 0; i < rmrr->devices_cnt; i++) {
1863                         pdev = rmrr->devices[i];
1864                         /* some BIOSes list non-existent devices in the DMAR table */
1865                         if (!pdev)
1866                                 continue;
1867                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1868                         if (ret)
1869                                 printk(KERN_ERR
1870                                  "IOMMU: mapping reserved region failed\n");
1871                 }
1872         }
1873
1874         iommu_prepare_gfx_mapping();
1875
1876         iommu_prepare_isa();
1877
1878         /*
1879          * for each drhd
1880          *   enable fault log
1881          *   global invalidate context cache
1882          *   global invalidate iotlb
1883          *   enable translation
1884          */
1885         for_each_drhd_unit(drhd) {
1886                 if (drhd->ignored)
1887                         continue;
1888                 iommu = drhd->iommu;
1889                 sprintf(iommu->name, "dmar%d", unit++);
1890
1891                 iommu_flush_write_buffer(iommu);
1892
1893                 ret = dmar_set_interrupt(iommu);
1894                 if (ret)
1895                         goto error;
1896
1897                 iommu_set_root_entry(iommu);
1898
1899                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
1900                                            0);
1901                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
1902                                          0);
1903                 iommu_disable_protect_mem_regions(iommu);
1904
1905                 ret = iommu_enable_translation(iommu);
1906                 if (ret)
1907                         goto error;
1908         }
1909
1910         return 0;
1911 error:
1912         for_each_drhd_unit(drhd) {
1913                 if (drhd->ignored)
1914                         continue;
1915                 iommu = drhd->iommu;
1916                 free_iommu(iommu);
1917         }
1918         return ret;
1919 }
1920
1921 static inline u64 aligned_size(u64 host_addr, size_t size)
1922 {
1923         u64 addr;
1924         addr = (host_addr & (~PAGE_MASK)) + size;
1925         return PAGE_ALIGN(addr);
1926 }
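
/*
 * Example (assuming 4KiB pages): aligned_size(0x1ff0, 0x20) yields
 * PAGE_ALIGN(0xff0 + 0x20) = 0x2000, i.e. the 0x20-byte buffer straddles
 * a page boundary and two pages worth of IOVA space must be covered.
 */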
1927
1928 struct iova *
1929 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1930 {
1931         struct iova *piova;
1932
1933         /* Make sure it's in range */
1934         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1935         if (!size || (IOVA_START_ADDR + size > end))
1936                 return NULL;
1937
1938         piova = alloc_iova(&domain->iovad,
1939                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
1940         return piova;
1941 }
1942
1943 static struct iova *
1944 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1945                    size_t size, u64 dma_mask)
1946 {
1947         struct pci_dev *pdev = to_pci_dev(dev);
1948         struct iova *iova = NULL;
1949
1950         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
1951                 iova = iommu_alloc_iova(domain, size, dma_mask);
1952         else {
1953                 /*
1954                  * First try to allocate an io virtual address in
1955                  * DMA_32BIT_MASK and if that fails then try allocating
1956                  * from higher range
1957                  */
1958                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1959                 if (!iova)
1960                         iova = iommu_alloc_iova(domain, size, dma_mask);
1961         }
1962
1963         if (!iova) {
1964                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
1965                 return NULL;
1966         }
1967
1968         return iova;
1969 }
1970
1971 static struct dmar_domain *
1972 get_valid_domain_for_dev(struct pci_dev *pdev)
1973 {
1974         struct dmar_domain *domain;
1975         int ret;
1976
1977         domain = get_domain_for_dev(pdev,
1978                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1979         if (!domain) {
1980                 printk(KERN_ERR
1981                         "Allocating domain for %s failed\n", pci_name(pdev));
1982                 return NULL;
1983         }
1984
1985         /* make sure context mapping is ok */
1986         if (unlikely(!domain_context_mapped(domain, pdev))) {
1987                 ret = domain_context_mapping(domain, pdev);
1988                 if (ret) {
1989                         printk(KERN_ERR
1990                                 "Domain context map for %s failed\n",
1991                                 pci_name(pdev));
1992                         return NULL;
1993                 }
1994         }
1995
1996         return domain;
1997 }
1998
1999 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2000                                      size_t size, int dir, u64 dma_mask)
2001 {
2002         struct pci_dev *pdev = to_pci_dev(hwdev);
2003         struct dmar_domain *domain;
2004         phys_addr_t start_paddr;
2005         struct iova *iova;
2006         int prot = 0;
2007         int ret;
2008
2009         BUG_ON(dir == DMA_NONE);
2010         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2011                 return paddr;
2012
2013         domain = get_valid_domain_for_dev(pdev);
2014         if (!domain)
2015                 return 0;
2016
2017         size = aligned_size((u64)paddr, size);
2018
2019         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2020         if (!iova)
2021                 goto error;
2022
2023         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2024
2025         /*
2026          * Check if DMAR supports zero-length reads on write only
2027          * mappings.
2028          */
2029         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2030                         !cap_zlr(domain->iommu->cap))
2031                 prot |= DMA_PTE_READ;
2032         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2033                 prot |= DMA_PTE_WRITE;
2034         /*
2035          * The range paddr .. paddr + size might cover only part of a page, so
2036          * map the whole page.  Note: if two parts of one page are mapped
2037          * separately, two guest addresses may map to the same host paddr, but
2038          * that is not a big problem.
2039          */
2040         ret = domain_page_mapping(domain, start_paddr,
2041                 ((u64)paddr) & PAGE_MASK, size, prot);
2042         if (ret)
2043                 goto error;
2044
2045         /* it's a non-present to present mapping */
2046         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
2047                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2048         if (ret)
2049                 iommu_flush_write_buffer(domain->iommu);
2050
2051         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2052
2053 error:
2054         if (iova)
2055                 __free_iova(&domain->iovad, iova);
2056         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2057                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2058         return 0;
2059 }
2060
2061 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2062                             size_t size, int dir)
2063 {
2064         return __intel_map_single(hwdev, paddr, size, dir,
2065                                   to_pci_dev(hwdev)->dma_mask);
2066 }
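
/*
 * Note: drivers normally reach this through the generic DMA API once
 * intel_iommu_init() installs intel_dma_ops (both below), not by calling it
 * directly.  A minimal sketch of a consumer (hedged: "pdev", "buf" and "len"
 * are the caller's own PCI device, kmalloc'ed buffer and length; error
 * handling trimmed):
 *
 *	dma_addr_t handle;
 *
 *	handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
 *	if (dma_mapping_error(&pdev->dev, handle))
 *		return -ENOMEM;
 *	... let the device DMA from "handle" ...
 *	dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
 */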
2067
2068 static void flush_unmaps(void)
2069 {
2070         int i, j;
2071
2072         timer_on = 0;
2073
2074         /* just flush them all */
2075         for (i = 0; i < g_num_of_iommus; i++) {
2076                 if (deferred_flush[i].next) {
2077                         struct intel_iommu *iommu =
2078                                 deferred_flush[i].domain[0]->iommu;
2079
2080                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2081                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2082                         for (j = 0; j < deferred_flush[i].next; j++) {
2083                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2084                                                 deferred_flush[i].iova[j]);
2085                         }
2086                         deferred_flush[i].next = 0;
2087                 }
2088         }
2089
2090         list_size = 0;
2091 }
2092
2093 static void flush_unmaps_timeout(unsigned long data)
2094 {
2095         unsigned long flags;
2096
2097         spin_lock_irqsave(&async_umap_flush_lock, flags);
2098         flush_unmaps();
2099         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2100 }
2101
2102 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2103 {
2104         unsigned long flags;
2105         int next, iommu_id;
2106
2107         spin_lock_irqsave(&async_umap_flush_lock, flags);
2108         if (list_size == HIGH_WATER_MARK)
2109                 flush_unmaps();
2110
2111         iommu_id = dom->iommu->seq_id;
2112
2113         next = deferred_flush[iommu_id].next;
2114         deferred_flush[iommu_id].domain[next] = dom;
2115         deferred_flush[iommu_id].iova[next] = iova;
2116         deferred_flush[iommu_id].next++;
2117
2118         if (!timer_on) {
2119                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2120                 timer_on = 1;
2121         }
2122         list_size++;
2123         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2124 }
2125
2126 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2127                         int dir)
2128 {
2129         struct pci_dev *pdev = to_pci_dev(dev);
2130         struct dmar_domain *domain;
2131         unsigned long start_addr;
2132         struct iova *iova;
2133
2134         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2135                 return;
2136         domain = find_domain(pdev);
2137         BUG_ON(!domain);
2138
2139         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2140         if (!iova)
2141                 return;
2142
2143         start_addr = iova->pfn_lo << PAGE_SHIFT;
2144         size = aligned_size((u64)dev_addr, size);
2145
2146         pr_debug("Device %s unmapping: %lx@%llx\n",
2147                 pci_name(pdev), size, (unsigned long long)start_addr);
2148
2149         /* clear the PTEs covering the whole range */
2150         dma_pte_clear_range(domain, start_addr, start_addr + size);
2151         /* free page tables */
2152         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2153         if (intel_iommu_strict) {
2154                 if (iommu_flush_iotlb_psi(domain->iommu,
2155                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2156                         iommu_flush_write_buffer(domain->iommu);
2157                 /* free iova */
2158                 __free_iova(&domain->iovad, iova);
2159         } else {
2160                 add_unmap(domain, iova);
2161                 /*
2162                  * Queueing the release and batching the IOTLB flush saves
2163                  * roughly 1/6th of the CPU time a per-unmap flush would cost.
2164                  */
2165         }
2166 }
2167
2168 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2169                            dma_addr_t *dma_handle, gfp_t flags)
2170 {
2171         void *vaddr;
2172         int order;
2173
2174         size = PAGE_ALIGN(size);
2175         order = get_order(size);
2176         flags &= ~(GFP_DMA | GFP_DMA32);
2177
2178         vaddr = (void *)__get_free_pages(flags, order);
2179         if (!vaddr)
2180                 return NULL;
2181         memset(vaddr, 0, size);
2182
2183         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2184                                          DMA_BIDIRECTIONAL,
2185                                          hwdev->coherent_dma_mask);
2186         if (*dma_handle)
2187                 return vaddr;
2188         free_pages((unsigned long)vaddr, order);
2189         return NULL;
2190 }
2191
2192 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2193                          dma_addr_t dma_handle)
2194 {
2195         int order;
2196
2197         size = PAGE_ALIGN(size);
2198         order = get_order(size);
2199
2200         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2201         free_pages((unsigned long)vaddr, order);
2202 }
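
/*
 * As with the streaming calls above, these are normally reached via
 * dma_alloc_coherent()/dma_free_coherent().  Sketch (hedged: "ring_bytes"
 * is the caller's own size, error handling trimmed):
 *
 *	void *ring;
 *	dma_addr_t ring_dma;
 *
 *	ring = dma_alloc_coherent(&pdev->dev, ring_bytes, &ring_dma, GFP_KERNEL);
 *	if (!ring)
 *		return -ENOMEM;
 *	... program "ring_dma" into the device ...
 *	dma_free_coherent(&pdev->dev, ring_bytes, ring, ring_dma);
 */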
2203
2204 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2205
2206 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2207                     int nelems, int dir)
2208 {
2209         int i;
2210         struct pci_dev *pdev = to_pci_dev(hwdev);
2211         struct dmar_domain *domain;
2212         unsigned long start_addr;
2213         struct iova *iova;
2214         size_t size = 0;
2215         void *addr;
2216         struct scatterlist *sg;
2217
2218         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2219                 return;
2220
2221         domain = find_domain(pdev);
2222
2223         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2224         if (!iova)
2225                 return;
2226         for_each_sg(sglist, sg, nelems, i) {
2227                 addr = SG_ENT_VIRT_ADDRESS(sg);
2228                 size += aligned_size((u64)addr, sg->length);
2229         }
2230
2231         start_addr = iova->pfn_lo << PAGE_SHIFT;
2232
2233         /* clear the PTEs covering the whole range */
2234         dma_pte_clear_range(domain, start_addr, start_addr + size);
2235         /* free page tables */
2236         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2237
2238         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2239                         size >> VTD_PAGE_SHIFT, 0))
2240                 iommu_flush_write_buffer(domain->iommu);
2241
2242         /* free iova */
2243         __free_iova(&domain->iovad, iova);
2244 }
2245
2246 static int intel_nontranslate_map_sg(struct device *hddev,
2247         struct scatterlist *sglist, int nelems, int dir)
2248 {
2249         int i;
2250         struct scatterlist *sg;
2251
2252         for_each_sg(sglist, sg, nelems, i) {
2253                 BUG_ON(!sg_page(sg));
2254                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2255                 sg->dma_length = sg->length;
2256         }
2257         return nelems;
2258 }
2259
2260 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2261                  int dir)
2262 {
2263         void *addr;
2264         int i;
2265         struct pci_dev *pdev = to_pci_dev(hwdev);
2266         struct dmar_domain *domain;
2267         size_t size = 0;
2268         int prot = 0;
2269         size_t offset = 0;
2270         struct iova *iova = NULL;
2271         int ret;
2272         struct scatterlist *sg;
2273         unsigned long start_addr;
2274
2275         BUG_ON(dir == DMA_NONE);
2276         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2277                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2278
2279         domain = get_valid_domain_for_dev(pdev);
2280         if (!domain)
2281                 return 0;
2282
2283         for_each_sg(sglist, sg, nelems, i) {
2284                 addr = SG_ENT_VIRT_ADDRESS(sg);
2285                 addr = (void *)virt_to_phys(addr);
2286                 size += aligned_size((u64)addr, sg->length);
2287         }
2288
2289         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2290         if (!iova) {
2291                 sglist->dma_length = 0;
2292                 return 0;
2293         }
2294
2295         /*
2296          * Check if DMAR supports zero-length reads on write only
2297          * mappings.
2298          */
2299         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2300                         !cap_zlr(domain->iommu->cap))
2301                 prot |= DMA_PTE_READ;
2302         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2303                 prot |= DMA_PTE_WRITE;
2304
2305         start_addr = iova->pfn_lo << PAGE_SHIFT;
2306         offset = 0;
2307         for_each_sg(sglist, sg, nelems, i) {
2308                 addr = SG_ENT_VIRT_ADDRESS(sg);
2309                 addr = (void *)virt_to_phys(addr);
2310                 size = aligned_size((u64)addr, sg->length);
2311                 ret = domain_page_mapping(domain, start_addr + offset,
2312                         ((u64)addr) & PAGE_MASK,
2313                         size, prot);
2314                 if (ret) {
2315                         /* clear the PTEs mapped so far */
2316                         dma_pte_clear_range(domain, start_addr,
2317                                   start_addr + offset);
2318                         /* free page tables */
2319                         dma_pte_free_pagetable(domain, start_addr,
2320                                   start_addr + offset);
2321                         /* free iova */
2322                         __free_iova(&domain->iovad, iova);
2323                         return 0;
2324                 }
2325                 sg->dma_address = start_addr + offset +
2326                                 ((u64)addr & (~PAGE_MASK));
2327                 sg->dma_length = sg->length;
2328                 offset += size;
2329         }
2330
2331         /* it's a non-present to present mapping */
2332         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2333                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2334                 iommu_flush_write_buffer(domain->iommu);
2335         return nelems;
2336 }
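
/*
 * Typical use is again through the generic DMA API.  Sketch (hedged:
 * "program_hw_entry" is a made-up driver helper, "nents" is the number of
 * entries the caller put into "sglist", error handling trimmed):
 *
 *	int count, i;
 *	struct scatterlist *sg;
 *
 *	count = dma_map_sg(&pdev->dev, sglist, nents, DMA_FROM_DEVICE);
 *	if (!count)
 *		return -ENOMEM;
 *	for_each_sg(sglist, sg, count, i)
 *		program_hw_entry(sg_dma_address(sg), sg_dma_len(sg));
 *	... after the transfer completes ...
 *	dma_unmap_sg(&pdev->dev, sglist, nents, DMA_FROM_DEVICE);
 */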
2337
2338 static struct dma_mapping_ops intel_dma_ops = {
2339         .alloc_coherent = intel_alloc_coherent,
2340         .free_coherent = intel_free_coherent,
2341         .map_single = intel_map_single,
2342         .unmap_single = intel_unmap_single,
2343         .map_sg = intel_map_sg,
2344         .unmap_sg = intel_unmap_sg,
2345 };
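
/*
 * Once intel_iommu_init() points the global dma_ops at this table, the arch
 * dma-mapping wrappers dispatch through it; roughly (a simplified sketch of
 * the x86 wrapper of this kernel generation, not code from this file):
 *
 *	static inline dma_addr_t
 *	dma_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
 *	{
 *		return dma_ops->map_single(hwdev, virt_to_phys(ptr), size, dir);
 *	}
 */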
2346
2347 static inline int iommu_domain_cache_init(void)
2348 {
2349         int ret = 0;
2350
2351         iommu_domain_cache = kmem_cache_create("iommu_domain",
2352                                          sizeof(struct dmar_domain),
2353                                          0,
2354                                          SLAB_HWCACHE_ALIGN,
2356                                          NULL);
2357         if (!iommu_domain_cache) {
2358                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2359                 ret = -ENOMEM;
2360         }
2361
2362         return ret;
2363 }
2364
2365 static inline int iommu_devinfo_cache_init(void)
2366 {
2367         int ret = 0;
2368
2369         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2370                                          sizeof(struct device_domain_info),
2371                                          0,
2372                                          SLAB_HWCACHE_ALIGN,
2373                                          NULL);
2374         if (!iommu_devinfo_cache) {
2375                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2376                 ret = -ENOMEM;
2377         }
2378
2379         return ret;
2380 }
2381
2382 static inline int iommu_iova_cache_init(void)
2383 {
2384         int ret = 0;
2385
2386         iommu_iova_cache = kmem_cache_create("iommu_iova",
2387                                          sizeof(struct iova),
2388                                          0,
2389                                          SLAB_HWCACHE_ALIGN,
2390                                          NULL);
2391         if (!iommu_iova_cache) {
2392                 printk(KERN_ERR "Couldn't create iova cache\n");
2393                 ret = -ENOMEM;
2394         }
2395
2396         return ret;
2397 }
2398
2399 static int __init iommu_init_mempool(void)
2400 {
2401         int ret;
2402         ret = iommu_iova_cache_init();
2403         if (ret)
2404                 return ret;
2405
2406         ret = iommu_domain_cache_init();
2407         if (ret)
2408                 goto domain_error;
2409
2410         ret = iommu_devinfo_cache_init();
2411         if (!ret)
2412                 return ret;
2413
2414         kmem_cache_destroy(iommu_domain_cache);
2415 domain_error:
2416         kmem_cache_destroy(iommu_iova_cache);
2417
2418         return -ENOMEM;
2419 }
2420
2421 static void __init iommu_exit_mempool(void)
2422 {
2423         kmem_cache_destroy(iommu_devinfo_cache);
2424         kmem_cache_destroy(iommu_domain_cache);
2425         kmem_cache_destroy(iommu_iova_cache);
2426
2427 }
2428
2429 static void __init init_no_remapping_devices(void)
2430 {
2431         struct dmar_drhd_unit *drhd;
2432
2433         for_each_drhd_unit(drhd) {
2434                 if (!drhd->include_all) {
2435                         int i;
2436                         for (i = 0; i < drhd->devices_cnt; i++)
2437                                 if (drhd->devices[i] != NULL)
2438                                         break;
2439                         /* ignore DMAR unit if no pci devices exist */
2440                         if (i == drhd->devices_cnt)
2441                                 drhd->ignored = 1;
2442                 }
2443         }
2444
2445         if (dmar_map_gfx)
2446                 return;
2447
2448         for_each_drhd_unit(drhd) {
2449                 int i;
2450                 if (drhd->ignored || drhd->include_all)
2451                         continue;
2452
2453                 for (i = 0; i < drhd->devices_cnt; i++)
2454                         if (drhd->devices[i] &&
2455                                 !IS_GFX_DEVICE(drhd->devices[i]))
2456                                 break;
2457
2458                 if (i < drhd->devices_cnt)
2459                         continue;
2460
2461                 /* bypass IOMMU if it is just for gfx devices */
2462                 drhd->ignored = 1;
2463                 for (i = 0; i < drhd->devices_cnt; i++) {
2464                         if (!drhd->devices[i])
2465                                 continue;
2466                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2467                 }
2468         }
2469 }
2470
2471 int __init intel_iommu_init(void)
2472 {
2473         int ret = 0;
2474
2475         if (dmar_table_init())
2476                 return  -ENODEV;
2477
2478         if (dmar_dev_scope_init())
2479                 return  -ENODEV;
2480
2481         /*
2482          * Check the need for DMA-remapping initialization now.
2483          * The initialization above will also be used by interrupt remapping.
2484          */
2485         if (no_iommu || swiotlb || dmar_disabled)
2486                 return -ENODEV;
2487
2488         iommu_init_mempool();
2489         dmar_init_reserved_ranges();
2490
2491         init_no_remapping_devices();
2492
2493         ret = init_dmars();
2494         if (ret) {
2495                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2496                 put_iova_domain(&reserved_iova_list);
2497                 iommu_exit_mempool();
2498                 return ret;
2499         }
2500         printk(KERN_INFO
2501         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2502
2503         init_timer(&unmap_timer);
2504         force_iommu = 1;
2505         dma_ops = &intel_dma_ops;
2506         return 0;
2507 }
2508
2509 void intel_iommu_domain_exit(struct dmar_domain *domain)
2510 {
2511         u64 end;
2512
2513         /* Domain 0 is reserved, so don't process it */
2514         if (!domain)
2515                 return;
2516
2517         end = DOMAIN_MAX_ADDR(domain->gaw);
2518         end = end & (~VTD_PAGE_MASK);
2519
2520         /* clear ptes */
2521         dma_pte_clear_range(domain, 0, end);
2522
2523         /* free page tables */
2524         dma_pte_free_pagetable(domain, 0, end);
2525
2526         iommu_free_domain(domain);
2527         free_domain_mem(domain);
2528 }
2529 EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2530
2531 struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2532 {
2533         struct dmar_drhd_unit *drhd;
2534         struct dmar_domain *domain;
2535         struct intel_iommu *iommu;
2536
2537         drhd = dmar_find_matched_drhd_unit(pdev);
2538         if (!drhd) {
2539                 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2540                 return NULL;
2541         }
2542
2543         iommu = drhd->iommu;
2544         if (!iommu) {
2545                 printk(KERN_ERR
2546                         "intel_iommu_domain_alloc: iommu == NULL\n");
2547                 return NULL;
2548         }
2549         domain = iommu_alloc_domain(iommu);
2550         if (!domain) {
2551                 printk(KERN_ERR
2552                         "intel_iommu_domain_alloc: domain == NULL\n");
2553                 return NULL;
2554         }
2555         if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2556                 printk(KERN_ERR
2557                         "intel_iommu_domain_alloc: domain_init() failed\n");
2558                 intel_iommu_domain_exit(domain);
2559                 return NULL;
2560         }
2561         return domain;
2562 }
2563 EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2564
2565 int intel_iommu_context_mapping(
2566         struct dmar_domain *domain, struct pci_dev *pdev)
2567 {
2568         int rc;
2569         rc = domain_context_mapping(domain, pdev);
2570         return rc;
2571 }
2572 EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2573
2574 int intel_iommu_page_mapping(
2575         struct dmar_domain *domain, dma_addr_t iova,
2576         u64 hpa, size_t size, int prot)
2577 {
2578         int rc;
2579         rc = domain_page_mapping(domain, iova, hpa, size, prot);
2580         return rc;
2581 }
2582 EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
2583
2584 void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2585 {
2586         detach_domain_for_dev(domain, bus, devfn);
2587 }
2588 EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2589
2590 struct dmar_domain *
2591 intel_iommu_find_domain(struct pci_dev *pdev)
2592 {
2593         return find_domain(pdev);
2594 }
2595 EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2596
2597 int intel_iommu_found(void)
2598 {
2599         return g_num_of_iommus;
2600 }
2601 EXPORT_SYMBOL_GPL(intel_iommu_found);
2602
2603 u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2604 {
2605         struct dma_pte *pte;
2606         u64 pfn;
2607
2608         pfn = 0;
2609         pte = addr_to_dma_pte(domain, iova);
2610
2611         if (pte)
2612                 pfn = dma_pte_addr(pte);
2613
2614         return pfn >> VTD_PAGE_SHIFT;
2615 }
2616 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);
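
/*
 * The exported helpers above form the interface used by external consumers
 * such as KVM device assignment.  A hedged sketch of how a consumer might
 * chain them ("assigned_dev", "gpa" and "hpa" are the caller's own values,
 * error handling trimmed):
 *
 *	struct dmar_domain *dom;
 *	u64 pfn;
 *
 *	dom = intel_iommu_domain_alloc(assigned_dev);
 *	if (!dom)
 *		return -ENODEV;
 *	intel_iommu_context_mapping(dom, assigned_dev);
 *	intel_iommu_page_mapping(dom, gpa, hpa, PAGE_SIZE,
 *				 DMA_PTE_READ | DMA_PTE_WRITE);
 *	pfn = intel_iommu_iova_to_pfn(dom, gpa);
 *	...
 *	intel_iommu_detach_dev(dom, assigned_dev->bus->number,
 *			       assigned_dev->devfn);
 *	intel_iommu_domain_exit(dom);
 */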