drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <asm/cacheflush.h>
41 #include <asm/iommu.h>
42 #include "pci.h"
43
44 #define ROOT_SIZE               VTD_PAGE_SIZE
45 #define CONTEXT_SIZE            VTD_PAGE_SIZE
46
47 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
48 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
49
50 #define IOAPIC_RANGE_START      (0xfee00000)
51 #define IOAPIC_RANGE_END        (0xfeefffff)
52 #define IOVA_START_ADDR         (0x1000)
53
54 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
55
56 #define MAX_AGAW_WIDTH 64
57
58 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
59
60 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
61 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
62 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
63
64 /* global iommu list, set NULL for ignored DMAR units */
65 static struct intel_iommu **g_iommus;
66
67 static int rwbf_quirk;
68
69 /*
70  * 0: Present
71  * 1-11: Reserved
72  * 12-63: Context Ptr (12 - (haw-1))
73  * 64-127: Reserved
74  */
75 struct root_entry {
76         u64     val;
77         u64     rsvd1;
78 };
79 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
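/*
 * The root table holds one root_entry per PCI bus (256 entries in one
 * 4KiB page); each present entry points to a context table that is
 * indexed by devfn.
 */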
80 static inline bool root_present(struct root_entry *root)
81 {
82         return (root->val & 1);
83 }
84 static inline void set_root_present(struct root_entry *root)
85 {
86         root->val |= 1;
87 }
88 static inline void set_root_value(struct root_entry *root, unsigned long value)
89 {
90         root->val |= value & VTD_PAGE_MASK;
91 }
92
93 static inline struct context_entry *
94 get_context_addr_from_root(struct root_entry *root)
95 {
96         return (struct context_entry *)
97                 (root_present(root)?phys_to_virt(
98                 root->val & VTD_PAGE_MASK) :
99                 NULL);
100 }
101
102 /*
103  * low 64 bits:
104  * 0: present
105  * 1: fault processing disable
106  * 2-3: translation type
107  * 12-63: address space root
108  * high 64 bits:
109  * 0-2: address width
110  * 3-6: avail
111  * 8-23: domain id
112  */
113 struct context_entry {
114         u64 lo;
115         u64 hi;
116 };
117
118 static inline bool context_present(struct context_entry *context)
119 {
120         return (context->lo & 1);
121 }
122 static inline void context_set_present(struct context_entry *context)
123 {
124         context->lo |= 1;
125 }
126
127 static inline void context_set_fault_enable(struct context_entry *context)
128 {
129         context->lo &= (((u64)-1) << 2) | 1;
130 }
131
132 static inline void context_set_translation_type(struct context_entry *context,
133                                                 unsigned long value)
134 {
135         context->lo &= (((u64)-1) << 4) | 3;
136         context->lo |= (value & 3) << 2;
137 }
138
139 static inline void context_set_address_root(struct context_entry *context,
140                                             unsigned long value)
141 {
142         context->lo |= value & VTD_PAGE_MASK;
143 }
144
145 static inline void context_set_address_width(struct context_entry *context,
146                                              unsigned long value)
147 {
148         context->hi |= value & 7;
149 }
150
151 static inline void context_set_domain_id(struct context_entry *context,
152                                          unsigned long value)
153 {
154         context->hi |= (value & ((1 << 16) - 1)) << 8;
155 }
156
157 static inline void context_clear_entry(struct context_entry *context)
158 {
159         context->lo = 0;
160         context->hi = 0;
161 }
162
163 /*
164  * 0: readable
165  * 1: writable
166  * 2-6: reserved
167  * 7: super page
168  * 8-10: available
169  * 11: snoop behavior
170  * 12-63: Host physical address
171  */
172 struct dma_pte {
173         u64 val;
174 };
175
176 static inline void dma_clear_pte(struct dma_pte *pte)
177 {
178         pte->val = 0;
179 }
180
181 static inline void dma_set_pte_readable(struct dma_pte *pte)
182 {
183         pte->val |= DMA_PTE_READ;
184 }
185
186 static inline void dma_set_pte_writable(struct dma_pte *pte)
187 {
188         pte->val |= DMA_PTE_WRITE;
189 }
190
191 static inline void dma_set_pte_snp(struct dma_pte *pte)
192 {
193         pte->val |= DMA_PTE_SNP;
194 }
195
196 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
197 {
198         pte->val = (pte->val & ~3) | (prot & 3);
199 }
200
201 static inline u64 dma_pte_addr(struct dma_pte *pte)
202 {
203         return (pte->val & VTD_PAGE_MASK);
204 }
205
206 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
207 {
208         pte->val |= (addr & VTD_PAGE_MASK);
209 }
210
211 static inline bool dma_pte_present(struct dma_pte *pte)
212 {
213         return (pte->val & 3) != 0;
214 }
215
216 /* devices under the same p2p bridge are owned in one domain */
217 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
218
219 /* domain represents a virtual machine; more than one device
220  * across iommus may be owned by one domain, e.g. a kvm guest.
221  */
222 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
223
224 struct dmar_domain {
225         int     id;                     /* domain id */
226         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses */
227
228         struct list_head devices;       /* all devices' list */
229         struct iova_domain iovad;       /* iova's that belong to this domain */
230
231         struct dma_pte  *pgd;           /* virtual address */
232         spinlock_t      mapping_lock;   /* page table lock */
233         int             gaw;            /* max guest address width */
234
235         /* adjusted guest address width, 0 is level 2 30-bit */
236         int             agaw;
237
238         int             flags;          /* flags to find out type of domain */
239
240         int             iommu_coherency;/* indicate coherency of iommu access */
241         int             iommu_snooping; /* indicate snooping control feature*/
242         int             iommu_count;    /* reference count of iommu */
243         spinlock_t      iommu_lock;     /* protect iommu set in domain */
244         u64             max_addr;       /* maximum mapped address */
245 };
246
247 /* PCI domain-device relationship */
248 struct device_domain_info {
249         struct list_head link;  /* link to domain siblings */
250         struct list_head global; /* link to global list */
251         int segment;            /* PCI domain */
252         u8 bus;                 /* PCI bus number */
253         u8 devfn;               /* PCI devfn number */
254         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
255         struct dmar_domain *domain; /* pointer to domain */
256 };
257
258 static void flush_unmaps_timeout(unsigned long data);
259
260 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
261
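/*
 * Deferred-unmap bookkeeping: freed iovas are queued (up to
 * HIGH_WATER_MARK per table) so their IOTLB flushes can be batched,
 * with unmap_timer draining the queue; booting with intel_iommu=strict
 * disables this batching.
 */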
262 #define HIGH_WATER_MARK 250
263 struct deferred_flush_tables {
264         int next;
265         struct iova *iova[HIGH_WATER_MARK];
266         struct dmar_domain *domain[HIGH_WATER_MARK];
267 };
268
269 static struct deferred_flush_tables *deferred_flush;
270
271 /* bitmap for indexing intel_iommus */
272 static int g_num_of_iommus;
273
274 static DEFINE_SPINLOCK(async_umap_flush_lock);
275 static LIST_HEAD(unmaps_to_do);
276
277 static int timer_on;
278 static long list_size;
279
280 static void domain_remove_dev_info(struct dmar_domain *domain);
281
282 #ifdef CONFIG_DMAR_DEFAULT_ON
283 int dmar_disabled = 0;
284 #else
285 int dmar_disabled = 1;
286 #endif /*CONFIG_DMAR_DEFAULT_ON*/
287
288 static int __initdata dmar_map_gfx = 1;
289 static int dmar_forcedac;
290 static int intel_iommu_strict;
291
292 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
293 static DEFINE_SPINLOCK(device_domain_lock);
294 static LIST_HEAD(device_domain_list);
295
296 static struct iommu_ops intel_iommu_ops;
297
298 static int __init intel_iommu_setup(char *str)
299 {
300         if (!str)
301                 return -EINVAL;
302         while (*str) {
303                 if (!strncmp(str, "on", 2)) {
304                         dmar_disabled = 0;
305                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
306                 } else if (!strncmp(str, "off", 3)) {
307                         dmar_disabled = 1;
308                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
309                 } else if (!strncmp(str, "igfx_off", 8)) {
310                         dmar_map_gfx = 0;
311                         printk(KERN_INFO
312                                 "Intel-IOMMU: disable GFX device mapping\n");
313                 } else if (!strncmp(str, "forcedac", 8)) {
314                         printk(KERN_INFO
315                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
316                         dmar_forcedac = 1;
317                 } else if (!strncmp(str, "strict", 6)) {
318                         printk(KERN_INFO
319                                 "Intel-IOMMU: disable batched IOTLB flush\n");
320                         intel_iommu_strict = 1;
321                 }
322
323                 str += strcspn(str, ",");
324                 while (*str == ',')
325                         str++;
326         }
327         return 0;
328 }
329 __setup("intel_iommu=", intel_iommu_setup);
330
331 static struct kmem_cache *iommu_domain_cache;
332 static struct kmem_cache *iommu_devinfo_cache;
333 static struct kmem_cache *iommu_iova_cache;
334
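/*
 * Allocate from @cachep with GFP_ATOMIC; PF_MEMALLOC is set temporarily
 * so the allocation may dip into emergency reserves, and the task's
 * original PF_MEMALLOC state is restored before returning.
 */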
335 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
336 {
337         unsigned int flags;
338         void *vaddr;
339
340         /* trying to avoid low memory issues */
341         flags = current->flags & PF_MEMALLOC;
342         current->flags |= PF_MEMALLOC;
343         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
344         current->flags &= (~PF_MEMALLOC | flags);
345         return vaddr;
346 }
347
348
349 static inline void *alloc_pgtable_page(void)
350 {
351         unsigned int flags;
352         void *vaddr;
353
354         /* trying to avoid low memory issues */
355         flags = current->flags & PF_MEMALLOC;
356         current->flags |= PF_MEMALLOC;
357         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
358         current->flags &= (~PF_MEMALLOC | flags);
359         return vaddr;
360 }
361
362 static inline void free_pgtable_page(void *vaddr)
363 {
364         free_page((unsigned long)vaddr);
365 }
366
367 static inline void *alloc_domain_mem(void)
368 {
369         return iommu_kmem_cache_alloc(iommu_domain_cache);
370 }
371
372 static void free_domain_mem(void *vaddr)
373 {
374         kmem_cache_free(iommu_domain_cache, vaddr);
375 }
376
377 static inline void * alloc_devinfo_mem(void)
378 {
379         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
380 }
381
382 static inline void free_devinfo_mem(void *vaddr)
383 {
384         kmem_cache_free(iommu_devinfo_cache, vaddr);
385 }
386
387 struct iova *alloc_iova_mem(void)
388 {
389         return iommu_kmem_cache_alloc(iommu_iova_cache);
390 }
391
392 void free_iova_mem(struct iova *iova)
393 {
394         kmem_cache_free(iommu_iova_cache, iova);
395 }
396
397
398 static inline int width_to_agaw(int width);
399
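/*
 * Find the widest adjusted guest address width (agaw) that is both
 * supported by this iommu (a set bit in its SAGAW capability field) and
 * no larger than @max_gaw; returns -1 if none fits.
 */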
400 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
401 {
402         unsigned long sagaw;
403         int agaw = -1;
404
405         sagaw = cap_sagaw(iommu->cap);
406         for (agaw = width_to_agaw(max_gaw);
407              agaw >= 0; agaw--) {
408                 if (test_bit(agaw, &sagaw))
409                         break;
410         }
411
412         return agaw;
413 }
414
415 /*
416  * Calculate max SAGAW for each iommu.
417  */
418 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
419 {
420         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
421 }
422
423 /*
424  * calculate agaw for each iommu.
425  * "SAGAW" may be different across iommus; use a default agaw and fall
426  * back to a smaller supported agaw for iommus that don't support it.
427  */
428 int iommu_calculate_agaw(struct intel_iommu *iommu)
429 {
430         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
431 }
432
433 /* in native case, each domain is related to only one iommu */
434 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
435 {
436         int iommu_id;
437
438         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
439
440         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
441         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
442                 return NULL;
443
444         return g_iommus[iommu_id];
445 }
446
447 static void domain_update_iommu_coherency(struct dmar_domain *domain)
448 {
449         int i;
450
451         domain->iommu_coherency = 1;
452
453         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
454         for (; i < g_num_of_iommus; ) {
455                 if (!ecap_coherent(g_iommus[i]->ecap)) {
456                         domain->iommu_coherency = 0;
457                         break;
458                 }
459                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
460         }
461 }
462
463 static void domain_update_iommu_snooping(struct dmar_domain *domain)
464 {
465         int i;
466
467         domain->iommu_snooping = 1;
468
469         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
470         for (; i < g_num_of_iommus; ) {
471                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
472                         domain->iommu_snooping = 0;
473                         break;
474                 }
475                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
476         }
477 }
478
479 /* Some capabilities may be different across iommus */
480 static void domain_update_iommu_cap(struct dmar_domain *domain)
481 {
482         domain_update_iommu_coherency(domain);
483         domain_update_iommu_snooping(domain);
484 }
485
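/*
 * Map a PCI device (@segment:@bus:@devfn) to the iommu of the DRHD unit
 * that covers it: the device is either listed explicitly in the unit's
 * device scope, sits behind a listed bridge whose bus range covers it,
 * or falls under the segment's catch-all (include_all) unit.
 */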
486 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
487 {
488         struct dmar_drhd_unit *drhd = NULL;
489         int i;
490
491         for_each_drhd_unit(drhd) {
492                 if (drhd->ignored)
493                         continue;
494                 if (segment != drhd->segment)
495                         continue;
496
497                 for (i = 0; i < drhd->devices_cnt; i++) {
498                         if (drhd->devices[i] &&
499                             drhd->devices[i]->bus->number == bus &&
500                             drhd->devices[i]->devfn == devfn)
501                                 return drhd->iommu;
502                         if (drhd->devices[i] &&
503                             drhd->devices[i]->subordinate &&
504                             drhd->devices[i]->subordinate->number <= bus &&
505                             drhd->devices[i]->subordinate->subordinate >= bus)
506                                 return drhd->iommu;
507                 }
508
509                 if (drhd->include_all)
510                         return drhd->iommu;
511         }
512
513         return NULL;
514 }
515
516 static void domain_flush_cache(struct dmar_domain *domain,
517                                void *addr, int size)
518 {
519         if (!domain->iommu_coherency)
520                 clflush_cache_range(addr, size);
521 }
522
523 /* Gets context entry for a given bus and devfn */
524 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
525                 u8 bus, u8 devfn)
526 {
527         struct root_entry *root;
528         struct context_entry *context;
529         unsigned long phy_addr;
530         unsigned long flags;
531
532         spin_lock_irqsave(&iommu->lock, flags);
533         root = &iommu->root_entry[bus];
534         context = get_context_addr_from_root(root);
535         if (!context) {
536                 context = (struct context_entry *)alloc_pgtable_page();
537                 if (!context) {
538                         spin_unlock_irqrestore(&iommu->lock, flags);
539                         return NULL;
540                 }
541                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
542                 phy_addr = virt_to_phys((void *)context);
543                 set_root_value(root, phy_addr);
544                 set_root_present(root);
545                 __iommu_flush_cache(iommu, root, sizeof(*root));
546         }
547         spin_unlock_irqrestore(&iommu->lock, flags);
548         return &context[devfn];
549 }
550
551 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
552 {
553         struct root_entry *root;
554         struct context_entry *context;
555         int ret;
556         unsigned long flags;
557
558         spin_lock_irqsave(&iommu->lock, flags);
559         root = &iommu->root_entry[bus];
560         context = get_context_addr_from_root(root);
561         if (!context) {
562                 ret = 0;
563                 goto out;
564         }
565         ret = context_present(&context[devfn]);
566 out:
567         spin_unlock_irqrestore(&iommu->lock, flags);
568         return ret;
569 }
570
571 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
572 {
573         struct root_entry *root;
574         struct context_entry *context;
575         unsigned long flags;
576
577         spin_lock_irqsave(&iommu->lock, flags);
578         root = &iommu->root_entry[bus];
579         context = get_context_addr_from_root(root);
580         if (context) {
581                 context_clear_entry(&context[devfn]);
582                 __iommu_flush_cache(iommu, &context[devfn], \
583                         sizeof(*context));
584         }
585         spin_unlock_irqrestore(&iommu->lock, flags);
586 }
587
588 static void free_context_table(struct intel_iommu *iommu)
589 {
590         struct root_entry *root;
591         int i;
592         unsigned long flags;
593         struct context_entry *context;
594
595         spin_lock_irqsave(&iommu->lock, flags);
596         if (!iommu->root_entry) {
597                 goto out;
598         }
599         for (i = 0; i < ROOT_ENTRY_NR; i++) {
600                 root = &iommu->root_entry[i];
601                 context = get_context_addr_from_root(root);
602                 if (context)
603                         free_pgtable_page(context);
604         }
605         free_pgtable_page(iommu->root_entry);
606         iommu->root_entry = NULL;
607 out:
608         spin_unlock_irqrestore(&iommu->lock, flags);
609 }
610
611 /* page table handling */
612 #define LEVEL_STRIDE            (9)
613 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
614
615 static inline int agaw_to_level(int agaw)
616 {
617         return agaw + 2;
618 }
619
620 static inline int agaw_to_width(int agaw)
621 {
622         return 30 + agaw * LEVEL_STRIDE;
623
624 }
625
626 static inline int width_to_agaw(int width)
627 {
628         return (width - 30) / LEVEL_STRIDE;
629 }
630
631 static inline unsigned int level_to_offset_bits(int level)
632 {
633         return (12 + (level - 1) * LEVEL_STRIDE);
634 }
635
636 static inline int address_level_offset(u64 addr, int level)
637 {
638         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
639 }
640
641 static inline u64 level_mask(int level)
642 {
643         return ((u64)-1 << level_to_offset_bits(level));
644 }
645
646 static inline u64 level_size(int level)
647 {
648         return ((u64)1 << level_to_offset_bits(level));
649 }
650
651 static inline u64 align_to_level(u64 addr, int level)
652 {
653         return ((addr + level_size(level) - 1) & level_mask(level));
654 }
655
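/*
 * Walk @domain's page table from the top level down to level 1,
 * allocating (and cache-flushing) intermediate page-table pages as
 * needed, and return the leaf pte that maps @addr.  Intermediate
 * entries are set read/write; the leaf pte carries the real protection.
 */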
656 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
657 {
658         int addr_width = agaw_to_width(domain->agaw);
659         struct dma_pte *parent, *pte = NULL;
660         int level = agaw_to_level(domain->agaw);
661         int offset;
662         unsigned long flags;
663
664         BUG_ON(!domain->pgd);
665
666         addr &= (((u64)1) << addr_width) - 1;
667         parent = domain->pgd;
668
669         spin_lock_irqsave(&domain->mapping_lock, flags);
670         while (level > 0) {
671                 void *tmp_page;
672
673                 offset = address_level_offset(addr, level);
674                 pte = &parent[offset];
675                 if (level == 1)
676                         break;
677
678                 if (!dma_pte_present(pte)) {
679                         tmp_page = alloc_pgtable_page();
680
681                         if (!tmp_page) {
682                                 spin_unlock_irqrestore(&domain->mapping_lock,
683                                         flags);
684                                 return NULL;
685                         }
686                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
687                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
688                         /*
689                          * high level table always sets r/w, last level page
690                          * table controls read/write
691                          */
692                         dma_set_pte_readable(pte);
693                         dma_set_pte_writable(pte);
694                         domain_flush_cache(domain, pte, sizeof(*pte));
695                 }
696                 parent = phys_to_virt(dma_pte_addr(pte));
697                 level--;
698         }
699
700         spin_unlock_irqrestore(&domain->mapping_lock, flags);
701         return pte;
702 }
703
704 /* return address's pte at specific level */
705 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
706                 int level)
707 {
708         struct dma_pte *parent, *pte = NULL;
709         int total = agaw_to_level(domain->agaw);
710         int offset;
711
712         parent = domain->pgd;
713         while (level <= total) {
714                 offset = address_level_offset(addr, total);
715                 pte = &parent[offset];
716                 if (level == total)
717                         return pte;
718
719                 if (!dma_pte_present(pte))
720                         break;
721                 parent = phys_to_virt(dma_pte_addr(pte));
722                 total--;
723         }
724         return NULL;
725 }
726
727 /* clear one page's page table */
728 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
729 {
730         struct dma_pte *pte = NULL;
731
732         /* get last level pte */
733         pte = dma_addr_level_pte(domain, addr, 1);
734
735         if (pte) {
736                 dma_clear_pte(pte);
737                 domain_flush_cache(domain, pte, sizeof(*pte));
738         }
739 }
740
741 /* clear last level pte; a tlb flush should follow */
742 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
743 {
744         int addr_width = agaw_to_width(domain->agaw);
745         int npages;
746
747         start &= (((u64)1) << addr_width) - 1;
748         end &= (((u64)1) << addr_width) - 1;
749         /* in case it's partial page */
750         start &= PAGE_MASK;
751         end = PAGE_ALIGN(end);
752         npages = (end - start) / VTD_PAGE_SIZE;
753
754         /* we don't need lock here, nobody else touches the iova range */
755         while (npages--) {
756                 dma_pte_clear_one(domain, start);
757                 start += VTD_PAGE_SIZE;
758         }
759 }
760
761 /* free page table pages. last level pte should already be cleared */
762 static void dma_pte_free_pagetable(struct dmar_domain *domain,
763         u64 start, u64 end)
764 {
765         int addr_width = agaw_to_width(domain->agaw);
766         struct dma_pte *pte;
767         int total = agaw_to_level(domain->agaw);
768         int level;
769         u64 tmp;
770
771         start &= (((u64)1) << addr_width) - 1;
772         end &= (((u64)1) << addr_width) - 1;
773
774         /* we don't need lock here, nobody else touches the iova range */
775         level = 2;
776         while (level <= total) {
777                 tmp = align_to_level(start, level);
778                 if (tmp >= end || (tmp + level_size(level) > end))
779                         return;
780
781                 while (tmp < end) {
782                         pte = dma_addr_level_pte(domain, tmp, level);
783                         if (pte) {
784                                 free_pgtable_page(
785                                         phys_to_virt(dma_pte_addr(pte)));
786                                 dma_clear_pte(pte);
787                                 domain_flush_cache(domain, pte, sizeof(*pte));
788                         }
789                         tmp += level_size(level);
790                 }
791                 level++;
792         }
793         /* free pgd */
794         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
795                 free_pgtable_page(domain->pgd);
796                 domain->pgd = NULL;
797         }
798 }
799
800 /* iommu handling */
801 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
802 {
803         struct root_entry *root;
804         unsigned long flags;
805
806         root = (struct root_entry *)alloc_pgtable_page();
807         if (!root)
808                 return -ENOMEM;
809
810         __iommu_flush_cache(iommu, root, ROOT_SIZE);
811
812         spin_lock_irqsave(&iommu->lock, flags);
813         iommu->root_entry = root;
814         spin_unlock_irqrestore(&iommu->lock, flags);
815
816         return 0;
817 }
818
819 static void iommu_set_root_entry(struct intel_iommu *iommu)
820 {
821         void *addr;
822         u32 cmd, sts;
823         unsigned long flag;
824
825         addr = iommu->root_entry;
826
827         spin_lock_irqsave(&iommu->register_lock, flag);
828         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
829
830         cmd = iommu->gcmd | DMA_GCMD_SRTP;
831         writel(cmd, iommu->reg + DMAR_GCMD_REG);
832
833         /* Make sure hardware completes it */
834         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
835                 readl, (sts & DMA_GSTS_RTPS), sts);
836
837         spin_unlock_irqrestore(&iommu->register_lock, flag);
838 }
839
840 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
841 {
842         u32 val;
843         unsigned long flag;
844
845         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
846                 return;
847         val = iommu->gcmd | DMA_GCMD_WBF;
848
849         spin_lock_irqsave(&iommu->register_lock, flag);
850         writel(val, iommu->reg + DMAR_GCMD_REG);
851
852         /* Make sure hardware completes it */
853         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
854                         readl, (!(val & DMA_GSTS_WBFS)), val);
855
856         spin_unlock_irqrestore(&iommu->register_lock, flag);
857 }
858
859 /* return value determines if we need a write buffer flush */
860 static void __iommu_flush_context(struct intel_iommu *iommu,
861                                   u16 did, u16 source_id, u8 function_mask,
862                                   u64 type)
863 {
864         u64 val = 0;
865         unsigned long flag;
866
867         switch (type) {
868         case DMA_CCMD_GLOBAL_INVL:
869                 val = DMA_CCMD_GLOBAL_INVL;
870                 break;
871         case DMA_CCMD_DOMAIN_INVL:
872                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
873                 break;
874         case DMA_CCMD_DEVICE_INVL:
875                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
876                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
877                 break;
878         default:
879                 BUG();
880         }
881         val |= DMA_CCMD_ICC;
882
883         spin_lock_irqsave(&iommu->register_lock, flag);
884         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
885
886         /* Make sure hardware completes it */
887         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
888                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
889
890         spin_unlock_irqrestore(&iommu->register_lock, flag);
891 }
892
893 /* return value determines if we need a write buffer flush */
894 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
895         u64 addr, unsigned int size_order, u64 type,
896         int non_present_entry_flush)
897 {
898         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
899         u64 val = 0, val_iva = 0;
900         unsigned long flag;
901
902         /*
903          * In the non-present entry flush case, if hardware doesn't cache
904          * non-present entries we do nothing, and if hardware does cache
905          * non-present entries, we flush entries of domain 0 (the domain id
906          * that is used to cache any non-present entries)
907          */
908         if (non_present_entry_flush) {
909                 if (!cap_caching_mode(iommu->cap))
910                         return 1;
911                 else
912                         did = 0;
913         }
914
915         switch (type) {
916         case DMA_TLB_GLOBAL_FLUSH:
917                 /* global flush doesn't need to set IVA_REG */
918                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
919                 break;
920         case DMA_TLB_DSI_FLUSH:
921                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
922                 break;
923         case DMA_TLB_PSI_FLUSH:
924                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
925                 /* Note: always flush non-leaf currently */
926                 val_iva = size_order | addr;
927                 break;
928         default:
929                 BUG();
930         }
931         /* Note: set drain read/write */
932 #if 0
933         /*
934          * This is probably meant to be extra safe. Looks like we can
935          * ignore it without any impact.
936          */
937         if (cap_read_drain(iommu->cap))
938                 val |= DMA_TLB_READ_DRAIN;
939 #endif
940         if (cap_write_drain(iommu->cap))
941                 val |= DMA_TLB_WRITE_DRAIN;
942
943         spin_lock_irqsave(&iommu->register_lock, flag);
944         /* Note: Only uses first TLB reg currently */
945         if (val_iva)
946                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
947         dmar_writeq(iommu->reg + tlb_offset + 8, val);
948
949         /* Make sure hardware completes it */
950         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
951                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
952
953         spin_unlock_irqrestore(&iommu->register_lock, flag);
954
955         /* check IOTLB invalidation granularity */
956         if (DMA_TLB_IAIG(val) == 0)
957                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
958         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
959                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
960                         (unsigned long long)DMA_TLB_IIRG(type),
961                         (unsigned long long)DMA_TLB_IAIG(val));
962         /* flush iotlb entry will implicitly flush write buffer */
963         return 0;
964 }
965
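/*
 * Page-selective IOTLB invalidation for @pages pages starting at @addr.
 * Falls back to a domain-selective flush when the hardware lacks PSI
 * support or when the power-of-two rounded range exceeds the maximum
 * address mask the hardware accepts.
 */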
966 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
967         u64 addr, unsigned int pages, int non_present_entry_flush)
968 {
969         unsigned int mask;
970
971         BUG_ON(addr & (~VTD_PAGE_MASK));
972         BUG_ON(pages == 0);
973
974         /* Fallback to domain selective flush if no PSI support */
975         if (!cap_pgsel_inv(iommu->cap))
976                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
977                                                 DMA_TLB_DSI_FLUSH,
978                                                 non_present_entry_flush);
979
980         /*
981          * PSI requires page size to be 2 ^ x, and the base address is naturally
982          * aligned to the size
983          */
984         mask = ilog2(__roundup_pow_of_two(pages));
985         /* Fallback to domain selective flush if size is too big */
986         if (mask > cap_max_amask_val(iommu->cap))
987                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
988                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
989
990         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
991                                         DMA_TLB_PSI_FLUSH,
992                                         non_present_entry_flush);
993 }
994
995 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
996 {
997         u32 pmen;
998         unsigned long flags;
999
1000         spin_lock_irqsave(&iommu->register_lock, flags);
1001         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1002         pmen &= ~DMA_PMEN_EPM;
1003         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1004
1005         /* wait for the protected region status bit to clear */
1006         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1007                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1008
1009         spin_unlock_irqrestore(&iommu->register_lock, flags);
1010 }
1011
1012 static int iommu_enable_translation(struct intel_iommu *iommu)
1013 {
1014         u32 sts;
1015         unsigned long flags;
1016
1017         spin_lock_irqsave(&iommu->register_lock, flags);
1018         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
1019
1020         /* Make sure hardware completes it */
1021         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1022                 readl, (sts & DMA_GSTS_TES), sts);
1023
1024         iommu->gcmd |= DMA_GCMD_TE;
1025         spin_unlock_irqrestore(&iommu->register_lock, flags);
1026         return 0;
1027 }
1028
1029 static int iommu_disable_translation(struct intel_iommu *iommu)
1030 {
1031         u32 sts;
1032         unsigned long flag;
1033
1034         spin_lock_irqsave(&iommu->register_lock, flag);
1035         iommu->gcmd &= ~DMA_GCMD_TE;
1036         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1037
1038         /* Make sure hardware completes it */
1039         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1040                 readl, (!(sts & DMA_GSTS_TES)), sts);
1041
1042         spin_unlock_irqrestore(&iommu->register_lock, flag);
1043         return 0;
1044 }
1045
1046
1047 static int iommu_init_domains(struct intel_iommu *iommu)
1048 {
1049         unsigned long ndomains;
1050         unsigned long nlongs;
1051
1052         ndomains = cap_ndoms(iommu->cap);
1053         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1054         nlongs = BITS_TO_LONGS(ndomains);
1055
1056         /* TBD: there might be 64K domains,
1057          * consider other allocation for future chip
1058          */
1059         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1060         if (!iommu->domain_ids) {
1061                 printk(KERN_ERR "Allocating domain id array failed\n");
1062                 return -ENOMEM;
1063         }
1064         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1065                         GFP_KERNEL);
1066         if (!iommu->domains) {
1067                 printk(KERN_ERR "Allocating domain array failed\n");
1068                 kfree(iommu->domain_ids);
1069                 return -ENOMEM;
1070         }
1071
1072         spin_lock_init(&iommu->lock);
1073
1074         /*
1075          * if Caching mode is set, then invalid translations are tagged
1076          * with domainid 0. Hence we need to pre-allocate it.
1077          */
1078         if (cap_caching_mode(iommu->cap))
1079                 set_bit(0, iommu->domain_ids);
1080         return 0;
1081 }
1082
1083
1084 static void domain_exit(struct dmar_domain *domain);
1085 static void vm_domain_exit(struct dmar_domain *domain);
1086
1087 void free_dmar_iommu(struct intel_iommu *iommu)
1088 {
1089         struct dmar_domain *domain;
1090         int i;
1091         unsigned long flags;
1092
1093         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1094         for (; i < cap_ndoms(iommu->cap); ) {
1095                 domain = iommu->domains[i];
1096                 clear_bit(i, iommu->domain_ids);
1097
1098                 spin_lock_irqsave(&domain->iommu_lock, flags);
1099                 if (--domain->iommu_count == 0) {
1100                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1101                                 vm_domain_exit(domain);
1102                         else
1103                                 domain_exit(domain);
1104                 }
1105                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1106
1107                 i = find_next_bit(iommu->domain_ids,
1108                         cap_ndoms(iommu->cap), i+1);
1109         }
1110
1111         if (iommu->gcmd & DMA_GCMD_TE)
1112                 iommu_disable_translation(iommu);
1113
1114         if (iommu->irq) {
1115                 set_irq_data(iommu->irq, NULL);
1116                 /* This will mask the irq */
1117                 free_irq(iommu->irq, iommu);
1118                 destroy_irq(iommu->irq);
1119         }
1120
1121         kfree(iommu->domains);
1122         kfree(iommu->domain_ids);
1123
1124         g_iommus[iommu->seq_id] = NULL;
1125
1126         /* if all iommus are freed, free g_iommus */
1127         for (i = 0; i < g_num_of_iommus; i++) {
1128                 if (g_iommus[i])
1129                         break;
1130         }
1131
1132         if (i == g_num_of_iommus)
1133                 kfree(g_iommus);
1134
1135         /* free context mapping */
1136         free_context_table(iommu);
1137 }
1138
1139 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1140 {
1141         unsigned long num;
1142         unsigned long ndomains;
1143         struct dmar_domain *domain;
1144         unsigned long flags;
1145
1146         domain = alloc_domain_mem();
1147         if (!domain)
1148                 return NULL;
1149
1150         ndomains = cap_ndoms(iommu->cap);
1151
1152         spin_lock_irqsave(&iommu->lock, flags);
1153         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1154         if (num >= ndomains) {
1155                 spin_unlock_irqrestore(&iommu->lock, flags);
1156                 free_domain_mem(domain);
1157                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1158                 return NULL;
1159         }
1160
1161         set_bit(num, iommu->domain_ids);
1162         domain->id = num;
1163         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1164         set_bit(iommu->seq_id, &domain->iommu_bmp);
1165         domain->flags = 0;
1166         iommu->domains[num] = domain;
1167         spin_unlock_irqrestore(&iommu->lock, flags);
1168
1169         return domain;
1170 }
1171
1172 static void iommu_free_domain(struct dmar_domain *domain)
1173 {
1174         unsigned long flags;
1175         struct intel_iommu *iommu;
1176
1177         iommu = domain_get_iommu(domain);
1178
1179         spin_lock_irqsave(&iommu->lock, flags);
1180         clear_bit(domain->id, iommu->domain_ids);
1181         spin_unlock_irqrestore(&iommu->lock, flags);
1182 }
1183
1184 static struct iova_domain reserved_iova_list;
1185 static struct lock_class_key reserved_alloc_key;
1186 static struct lock_class_key reserved_rbtree_key;
1187
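/*
 * Reserve iova ranges that must never be handed out for DMA: the IOAPIC
 * MMIO window and every PCI device's memory resources, so that DMA
 * addresses cannot alias device MMIO.
 */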
1188 static void dmar_init_reserved_ranges(void)
1189 {
1190         struct pci_dev *pdev = NULL;
1191         struct iova *iova;
1192         int i;
1193         u64 addr, size;
1194
1195         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1196
1197         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1198                 &reserved_alloc_key);
1199         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1200                 &reserved_rbtree_key);
1201
1202         /* IOAPIC ranges shouldn't be accessed by DMA */
1203         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1204                 IOVA_PFN(IOAPIC_RANGE_END));
1205         if (!iova)
1206                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1207
1208         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1209         for_each_pci_dev(pdev) {
1210                 struct resource *r;
1211
1212                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1213                         r = &pdev->resource[i];
1214                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1215                                 continue;
1216                         addr = r->start;
1217                         addr &= PAGE_MASK;
1218                         size = r->end - addr;
1219                         size = PAGE_ALIGN(size);
1220                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1221                                 IOVA_PFN(size + addr) - 1);
1222                         if (!iova)
1223                                 printk(KERN_ERR "Reserve iova failed\n");
1224                 }
1225         }
1226
1227 }
1228
1229 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1230 {
1231         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1232 }
1233
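/*
 * Round @gaw up to the next width a page-table level boundary can
 * express (12 bits plus a multiple of the 9-bit level stride), capped
 * at 64 bits.
 */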
1234 static inline int guestwidth_to_adjustwidth(int gaw)
1235 {
1236         int agaw;
1237         int r = (gaw - 12) % 9;
1238
1239         if (r == 0)
1240                 agaw = gaw;
1241         else
1242                 agaw = gaw + 9 - r;
1243         if (agaw > 64)
1244                 agaw = 64;
1245         return agaw;
1246 }
1247
1248 static int domain_init(struct dmar_domain *domain, int guest_width)
1249 {
1250         struct intel_iommu *iommu;
1251         int adjust_width, agaw;
1252         unsigned long sagaw;
1253
1254         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1255         spin_lock_init(&domain->mapping_lock);
1256         spin_lock_init(&domain->iommu_lock);
1257
1258         domain_reserve_special_ranges(domain);
1259
1260         /* calculate AGAW */
1261         iommu = domain_get_iommu(domain);
1262         if (guest_width > cap_mgaw(iommu->cap))
1263                 guest_width = cap_mgaw(iommu->cap);
1264         domain->gaw = guest_width;
1265         adjust_width = guestwidth_to_adjustwidth(guest_width);
1266         agaw = width_to_agaw(adjust_width);
1267         sagaw = cap_sagaw(iommu->cap);
1268         if (!test_bit(agaw, &sagaw)) {
1269                 /* hardware doesn't support it, choose a bigger one */
1270                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1271                 agaw = find_next_bit(&sagaw, 5, agaw);
1272                 if (agaw >= 5)
1273                         return -ENODEV;
1274         }
1275         domain->agaw = agaw;
1276         INIT_LIST_HEAD(&domain->devices);
1277
1278         if (ecap_coherent(iommu->ecap))
1279                 domain->iommu_coherency = 1;
1280         else
1281                 domain->iommu_coherency = 0;
1282
1283         if (ecap_sc_support(iommu->ecap))
1284                 domain->iommu_snooping = 1;
1285         else
1286                 domain->iommu_snooping = 0;
1287
1288         domain->iommu_count = 1;
1289
1290         /* always allocate the top pgd */
1291         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1292         if (!domain->pgd)
1293                 return -ENOMEM;
1294         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1295         return 0;
1296 }
1297
1298 static void domain_exit(struct dmar_domain *domain)
1299 {
1300         u64 end;
1301
1302         /* Domain 0 is reserved, so don't process it */
1303         if (!domain)
1304                 return;
1305
1306         domain_remove_dev_info(domain);
1307         /* destroy iovas */
1308         put_iova_domain(&domain->iovad);
1309         end = DOMAIN_MAX_ADDR(domain->gaw);
1310         end = end & (~PAGE_MASK);
1311
1312         /* clear ptes */
1313         dma_pte_clear_range(domain, 0, end);
1314
1315         /* free page tables */
1316         dma_pte_free_pagetable(domain, 0, end);
1317
1318         iommu_free_domain(domain);
1319         free_domain_mem(domain);
1320 }
1321
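/*
 * Program the context entry for @bus/@devfn on the iommu covering
 * @segment so that it points at @domain's page tables (or, for
 * pass-through, only carries the address width), then flush the context
 * and IOTLB caches as the caching mode requires.
 */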
1322 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1323                                  u8 bus, u8 devfn, int translation)
1324 {
1325         struct context_entry *context;
1326         unsigned long flags;
1327         struct intel_iommu *iommu;
1328         struct dma_pte *pgd;
1329         unsigned long num;
1330         unsigned long ndomains;
1331         int id;
1332         int agaw;
1333
1334         pr_debug("Set context mapping for %02x:%02x.%d\n",
1335                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1336
1337         BUG_ON(!domain->pgd);
1338         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1339                translation != CONTEXT_TT_MULTI_LEVEL);
1340
1341         iommu = device_to_iommu(segment, bus, devfn);
1342         if (!iommu)
1343                 return -ENODEV;
1344
1345         context = device_to_context_entry(iommu, bus, devfn);
1346         if (!context)
1347                 return -ENOMEM;
1348         spin_lock_irqsave(&iommu->lock, flags);
1349         if (context_present(context)) {
1350                 spin_unlock_irqrestore(&iommu->lock, flags);
1351                 return 0;
1352         }
1353
1354         id = domain->id;
1355         pgd = domain->pgd;
1356
1357         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1358                 int found = 0;
1359
1360                 /* find an available domain id for this device in iommu */
1361                 ndomains = cap_ndoms(iommu->cap);
1362                 num = find_first_bit(iommu->domain_ids, ndomains);
1363                 for (; num < ndomains; ) {
1364                         if (iommu->domains[num] == domain) {
1365                                 id = num;
1366                                 found = 1;
1367                                 break;
1368                         }
1369                         num = find_next_bit(iommu->domain_ids,
1370                                             cap_ndoms(iommu->cap), num+1);
1371                 }
1372
1373                 if (found == 0) {
1374                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1375                         if (num >= ndomains) {
1376                                 spin_unlock_irqrestore(&iommu->lock, flags);
1377                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1378                                 return -EFAULT;
1379                         }
1380
1381                         set_bit(num, iommu->domain_ids);
1382                         iommu->domains[num] = domain;
1383                         id = num;
1384                 }
1385
1386                 /* Skip top levels of page tables for
1387                  * iommus which have a smaller agaw than the default.
1388                  */
1389                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1390                         pgd = phys_to_virt(dma_pte_addr(pgd));
1391                         if (!dma_pte_present(pgd)) {
1392                                 spin_unlock_irqrestore(&iommu->lock, flags);
1393                                 return -ENOMEM;
1394                         }
1395                 }
1396         }
1397
1398         context_set_domain_id(context, id);
1399
1400         /*
1401          * In pass through mode, AW must be programmed to indicate the largest
1402          * AGAW value supported by hardware. And ASR is ignored by hardware.
1403          */
1404         if (likely(translation == CONTEXT_TT_MULTI_LEVEL)) {
1405                 context_set_address_width(context, iommu->agaw);
1406                 context_set_address_root(context, virt_to_phys(pgd));
1407         } else
1408                 context_set_address_width(context, iommu->msagaw);
1409
1410         context_set_translation_type(context, translation);
1411         context_set_fault_enable(context);
1412         context_set_present(context);
1413         domain_flush_cache(domain, context, sizeof(*context));
1414
1415         /*
1416          * It's a non-present to present mapping. If hardware doesn't cache
1417          * non-present entries we only need to flush the write-buffer. If it
1418          * _does_ cache non-present entries, then it does so in the special
1419          * domain #0, which we have to flush:
1420          */
1421         if (cap_caching_mode(iommu->cap)) {
1422                 iommu->flush.flush_context(iommu, 0,
1423                                            (((u16)bus) << 8) | devfn,
1424                                            DMA_CCMD_MASK_NOBIT,
1425                                            DMA_CCMD_DEVICE_INVL);
1426                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1427         } else {
1428                 iommu_flush_write_buffer(iommu);
1429         }
1430         spin_unlock_irqrestore(&iommu->lock, flags);
1431
1432         spin_lock_irqsave(&domain->iommu_lock, flags);
1433         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1434                 domain->iommu_count++;
1435                 domain_update_iommu_cap(domain);
1436         }
1437         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1438         return 0;
1439 }
1440
1441 static int
1442 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1443                         int translation)
1444 {
1445         int ret;
1446         struct pci_dev *tmp, *parent;
1447
1448         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1449                                          pdev->bus->number, pdev->devfn,
1450                                          translation);
1451         if (ret)
1452                 return ret;
1453
1454         /* dependent device mapping */
1455         tmp = pci_find_upstream_pcie_bridge(pdev);
1456         if (!tmp)
1457                 return 0;
1458         /* Secondary interface's bus number and devfn 0 */
1459         parent = pdev->bus->self;
1460         while (parent != tmp) {
1461                 ret = domain_context_mapping_one(domain,
1462                                                  pci_domain_nr(parent->bus),
1463                                                  parent->bus->number,
1464                                                  parent->devfn, translation);
1465                 if (ret)
1466                         return ret;
1467                 parent = parent->bus->self;
1468         }
1469         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1470                 return domain_context_mapping_one(domain,
1471                                         pci_domain_nr(tmp->subordinate),
1472                                         tmp->subordinate->number, 0,
1473                                         translation);
1474         else /* this is a legacy PCI bridge */
1475                 return domain_context_mapping_one(domain,
1476                                                   pci_domain_nr(tmp->bus),
1477                                                   tmp->bus->number,
1478                                                   tmp->devfn,
1479                                                   translation);
1480 }
1481
1482 static int domain_context_mapped(struct pci_dev *pdev)
1483 {
1484         int ret;
1485         struct pci_dev *tmp, *parent;
1486         struct intel_iommu *iommu;
1487
1488         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1489                                 pdev->devfn);
1490         if (!iommu)
1491                 return -ENODEV;
1492
1493         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1494         if (!ret)
1495                 return ret;
1496         /* dependent device mapping */
1497         tmp = pci_find_upstream_pcie_bridge(pdev);
1498         if (!tmp)
1499                 return ret;
1500         /* Secondary interface's bus number and devfn 0 */
1501         parent = pdev->bus->self;
1502         while (parent != tmp) {
1503                 ret = device_context_mapped(iommu, parent->bus->number,
1504                                             parent->devfn);
1505                 if (!ret)
1506                         return ret;
1507                 parent = parent->bus->self;
1508         }
1509         if (tmp->is_pcie)
1510                 return device_context_mapped(iommu, tmp->subordinate->number,
1511                                              0);
1512         else
1513                 return device_context_mapped(iommu, tmp->bus->number,
1514                                              tmp->devfn);
1515 }
1516
1517 static int
1518 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1519                         u64 hpa, size_t size, int prot)
1520 {
1521         u64 start_pfn, end_pfn;
1522         struct dma_pte *pte;
1523         int index;
1524         int addr_width = agaw_to_width(domain->agaw);
1525
1526         hpa &= (((u64)1) << addr_width) - 1;
1527
1528         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1529                 return -EINVAL;
1530         iova &= PAGE_MASK;
1531         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1532         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1533         index = 0;
1534         while (start_pfn < end_pfn) {
1535                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1536                 if (!pte)
1537                         return -ENOMEM;
1538                 /* We don't need lock here, nobody else
1539                  * touches the iova range
1540                  */
1541                 BUG_ON(dma_pte_addr(pte));
1542                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1543                 dma_set_pte_prot(pte, prot);
1544                 if (prot & DMA_PTE_SNP)
1545                         dma_set_pte_snp(pte);
1546                 domain_flush_cache(domain, pte, sizeof(*pte));
1547                 start_pfn++;
1548                 index++;
1549         }
1550         return 0;
1551 }
1552
1553 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1554 {
1555         if (!iommu)
1556                 return;
1557
1558         clear_context_table(iommu, bus, devfn);
1559         iommu->flush.flush_context(iommu, 0, 0, 0,
1560                                            DMA_CCMD_GLOBAL_INVL);
1561         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1562                                          DMA_TLB_GLOBAL_FLUSH, 0);
1563 }
1564
1565 static void domain_remove_dev_info(struct dmar_domain *domain)
1566 {
1567         struct device_domain_info *info;
1568         unsigned long flags;
1569         struct intel_iommu *iommu;
1570
1571         spin_lock_irqsave(&device_domain_lock, flags);
1572         while (!list_empty(&domain->devices)) {
1573                 info = list_entry(domain->devices.next,
1574                         struct device_domain_info, link);
1575                 list_del(&info->link);
1576                 list_del(&info->global);
1577                 if (info->dev)
1578                         info->dev->dev.archdata.iommu = NULL;
1579                 spin_unlock_irqrestore(&device_domain_lock, flags);
1580
1581                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1582                 iommu_detach_dev(iommu, info->bus, info->devfn);
1583                 free_devinfo_mem(info);
1584
1585                 spin_lock_irqsave(&device_domain_lock, flags);
1586         }
1587         spin_unlock_irqrestore(&device_domain_lock, flags);
1588 }
1589
1590 /*
1591  * find_domain
1592  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1593  */
1594 static struct dmar_domain *
1595 find_domain(struct pci_dev *pdev)
1596 {
1597         struct device_domain_info *info;
1598
1599         /* No lock here, assumes no domain exit in normal case */
1600         info = pdev->dev.archdata.iommu;
1601         if (info)
1602                 return info->domain;
1603         return NULL;
1604 }
1605
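/*
 * Find the domain for a device, allocating and initializing one if
 * needed.  Devices behind the same PCIe-to-PCI bridge share a single
 * domain, keyed on the bridge's secondary bus number (or the bridge
 * itself for a legacy PCI bridge).  If another thread installs a
 * domain for the device first, the freshly allocated one is torn down
 * and the existing domain is returned instead.
 */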
1606 /* domain is initialized */
1607 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1608 {
1609         struct dmar_domain *domain, *found = NULL;
1610         struct intel_iommu *iommu;
1611         struct dmar_drhd_unit *drhd;
1612         struct device_domain_info *info, *tmp;
1613         struct pci_dev *dev_tmp;
1614         unsigned long flags;
1615         int bus = 0, devfn = 0;
1616         int segment;
1617
1618         domain = find_domain(pdev);
1619         if (domain)
1620                 return domain;
1621
1622         segment = pci_domain_nr(pdev->bus);
1623
1624         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1625         if (dev_tmp) {
1626                 if (dev_tmp->is_pcie) {
1627                         bus = dev_tmp->subordinate->number;
1628                         devfn = 0;
1629                 } else {
1630                         bus = dev_tmp->bus->number;
1631                         devfn = dev_tmp->devfn;
1632                 }
1633                 spin_lock_irqsave(&device_domain_lock, flags);
1634                 list_for_each_entry(info, &device_domain_list, global) {
1635                         if (info->segment == segment &&
1636                             info->bus == bus && info->devfn == devfn) {
1637                                 found = info->domain;
1638                                 break;
1639                         }
1640                 }
1641                 spin_unlock_irqrestore(&device_domain_lock, flags);
1642                 /* pcie-to-pci bridge already has a domain, use it */
1643                 if (found) {
1644                         domain = found;
1645                         goto found_domain;
1646                 }
1647         }
1648
1649         /* Allocate new domain for the device */
1650         drhd = dmar_find_matched_drhd_unit(pdev);
1651         if (!drhd) {
1652                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1653                         pci_name(pdev));
1654                 return NULL;
1655         }
1656         iommu = drhd->iommu;
1657
1658         domain = iommu_alloc_domain(iommu);
1659         if (!domain)
1660                 goto error;
1661
1662         if (domain_init(domain, gaw)) {
1663                 domain_exit(domain);
1664                 goto error;
1665         }
1666
1667         /* register pcie-to-pci device */
1668         if (dev_tmp) {
1669                 info = alloc_devinfo_mem();
1670                 if (!info) {
1671                         domain_exit(domain);
1672                         goto error;
1673                 }
1674                 info->segment = segment;
1675                 info->bus = bus;
1676                 info->devfn = devfn;
1677                 info->dev = NULL;
1678                 info->domain = domain;
1679                 /* This domain is shared by devices under p2p bridge */
1680                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1681
1682                 /* pcie-to-pci bridge already has a domain, use it */
1683                 found = NULL;
1684                 spin_lock_irqsave(&device_domain_lock, flags);
1685                 list_for_each_entry(tmp, &device_domain_list, global) {
1686                         if (tmp->segment == segment &&
1687                             tmp->bus == bus && tmp->devfn == devfn) {
1688                                 found = tmp->domain;
1689                                 break;
1690                         }
1691                 }
1692                 if (found) {
1693                         free_devinfo_mem(info);
1694                         domain_exit(domain);
1695                         domain = found;
1696                 } else {
1697                         list_add(&info->link, &domain->devices);
1698                         list_add(&info->global, &device_domain_list);
1699                 }
1700                 spin_unlock_irqrestore(&device_domain_lock, flags);
1701         }
1702
1703 found_domain:
1704         info = alloc_devinfo_mem();
1705         if (!info)
1706                 goto error;
1707         info->segment = segment;
1708         info->bus = pdev->bus->number;
1709         info->devfn = pdev->devfn;
1710         info->dev = pdev;
1711         info->domain = domain;
1712         spin_lock_irqsave(&device_domain_lock, flags);
1713         /* somebody else was faster and already set the domain */
1714         found = find_domain(pdev);
1715         if (found != NULL) {
1716                 spin_unlock_irqrestore(&device_domain_lock, flags);
1717                 if (found != domain) {
1718                         domain_exit(domain);
1719                         domain = found;
1720                 }
1721                 free_devinfo_mem(info);
1722                 return domain;
1723         }
1724         list_add(&info->link, &domain->devices);
1725         list_add(&info->global, &device_domain_list);
1726         pdev->dev.archdata.iommu = info;
1727         spin_unlock_irqrestore(&device_domain_lock, flags);
1728         return domain;
1729 error:
1730         /* recheck here; somebody else may have set it meanwhile */
1731         return find_domain(pdev);
1732 }
1733
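/*
 * Set up a 1:1 (identity) mapping of [start, end) on behalf of pdev:
 * reserve the matching iova range, clear any stale ptes, map the pages
 * read/write and install the context entry for the device.
 */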
1734 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1735                                       unsigned long long start,
1736                                       unsigned long long end)
1737 {
1738         struct dmar_domain *domain;
1739         unsigned long size;
1740         unsigned long long base;
1741         int ret;
1742
1743         printk(KERN_INFO
1744                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1745                 pci_name(pdev), start, end);
1746         /* page table init */
1747         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1748         if (!domain)
1749                 return -ENOMEM;
1750
1751         /* The address might not be aligned */
1752         base = start & PAGE_MASK;
1753         size = end - base;
1754         size = PAGE_ALIGN(size);
1755         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1756                         IOVA_PFN(base + size) - 1)) {
1757                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1758                 ret = -ENOMEM;
1759                 goto error;
1760         }
1761
1762         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1763                 size, base, pci_name(pdev));
1764         /*
1765          * RMRR range might have overlap with physical memory range,
1766          * clear it first
1767          */
1768         dma_pte_clear_range(domain, base, base + size);
1769
1770         ret = domain_page_mapping(domain, base, base, size,
1771                 DMA_PTE_READ|DMA_PTE_WRITE);
1772         if (ret)
1773                 goto error;
1774
1775         /* context entry init */
1776         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
1777         if (!ret)
1778                 return 0;
1779 error:
1780         domain_exit(domain);
1781         return ret;
1782
1783 }
1784
1785 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1786         struct pci_dev *pdev)
1787 {
1788         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1789                 return 0;
1790         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1791                 rmrr->end_address + 1);
1792 }
1793
1794 #ifdef CONFIG_DMAR_GFX_WA
1795 struct iommu_prepare_data {
1796         struct pci_dev *pdev;
1797         int ret;
1798 };
1799
1800 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1801                                          unsigned long end_pfn, void *datax)
1802 {
1803         struct iommu_prepare_data *data;
1804
1805         data = (struct iommu_prepare_data *)datax;
1806
1807         data->ret = iommu_prepare_identity_map(data->pdev,
1808                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1809         return data->ret;
1810
1811 }
1812
1813 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1814 {
1815         int nid;
1816         struct iommu_prepare_data data;
1817
1818         data.pdev = pdev;
1819         data.ret = 0;
1820
1821         for_each_online_node(nid) {
1822                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1823                 if (data.ret)
1824                         return data.ret;
1825         }
1826         return data.ret;
1827 }
1828
1829 static void __init iommu_prepare_gfx_mapping(void)
1830 {
1831         struct pci_dev *pdev = NULL;
1832         int ret;
1833
1834         for_each_pci_dev(pdev) {
1835                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1836                                 !IS_GFX_DEVICE(pdev))
1837                         continue;
1838                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1839                         pci_name(pdev));
1840                 ret = iommu_prepare_with_active_regions(pdev);
1841                 if (ret)
1842                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1843         }
1844 }
1845 #else /* !CONFIG_DMAR_GFX_WA */
1846 static inline void iommu_prepare_gfx_mapping(void)
1847 {
1848         return;
1849 }
1850 #endif
1851
1852 #ifdef CONFIG_DMAR_FLOPPY_WA
1853 static inline void iommu_prepare_isa(void)
1854 {
1855         struct pci_dev *pdev;
1856         int ret;
1857
1858         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1859         if (!pdev)
1860                 return;
1861
1862         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1863         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1864
1865         if (ret)
1866                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1867                         "floppy might not work\n");
1868
1869 }
1870 #else
1871 static inline void iommu_prepare_isa(void)
1872 {
1873         return;
1874 }
1875 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1876
1877 /* Initialize each context entry as pass through. */
1878 static int __init init_context_pass_through(void)
1879 {
1880         struct pci_dev *pdev = NULL;
1881         struct dmar_domain *domain;
1882         int ret;
1883
1884         for_each_pci_dev(pdev) {
1885                 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1886                 ret = domain_context_mapping(domain, pdev,
1887                                              CONTEXT_TT_PASS_THROUGH);
1888                 if (ret)
1889                         return ret;
1890         }
1891         return 0;
1892 }
1893
1894 static int __init init_dmars(void)
1895 {
1896         struct dmar_drhd_unit *drhd;
1897         struct dmar_rmrr_unit *rmrr;
1898         struct pci_dev *pdev;
1899         struct intel_iommu *iommu;
1900         int i, ret;
1901         int pass_through = 1;
1902
1903         /*
1904          * for each drhd
1905          *    allocate root
1906          *    initialize and program root entry to not present
1907          * endfor
1908          */
1909         for_each_drhd_unit(drhd) {
1910                 g_num_of_iommus++;
1911                 /*
1912                  * lock not needed as this is only incremented in the
1913                  * single-threaded kernel __init code path; all other
1914                  * accesses are read-only
1915                  */
1916         }
1917
1918         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1919                         GFP_KERNEL);
1920         if (!g_iommus) {
1921                 printk(KERN_ERR "Allocating global iommu array failed\n");
1922                 ret = -ENOMEM;
1923                 goto error;
1924         }
1925
1926         deferred_flush = kzalloc(g_num_of_iommus *
1927                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1928         if (!deferred_flush) {
1929                 kfree(g_iommus);
1930                 ret = -ENOMEM;
1931                 goto error;
1932         }
1933
1934         for_each_drhd_unit(drhd) {
1935                 if (drhd->ignored)
1936                         continue;
1937
1938                 iommu = drhd->iommu;
1939                 g_iommus[iommu->seq_id] = iommu;
1940
1941                 ret = iommu_init_domains(iommu);
1942                 if (ret)
1943                         goto error;
1944
1945                 /*
1946                  * TBD:
1947                  * we could share the same root & context tables
1948                  * among all IOMMUs; need to split it later.
1949                  */
1950                 ret = iommu_alloc_root_entry(iommu);
1951                 if (ret) {
1952                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1953                         goto error;
1954                 }
1955                 if (!ecap_pass_through(iommu->ecap))
1956                         pass_through = 0;
1957         }
1958         if (iommu_pass_through)
1959                 if (!pass_through) {
1960                         printk(KERN_INFO
1961                                "Pass Through is not supported by hardware.\n");
1962                         iommu_pass_through = 0;
1963                 }
1964
1965         /*
1966          * Start from a sane IOMMU hardware state.
1967          */
1968         for_each_drhd_unit(drhd) {
1969                 if (drhd->ignored)
1970                         continue;
1971
1972                 iommu = drhd->iommu;
1973
1974                 /*
1975                  * If the queued invalidation is already initialized by us
1976                  * (for example, while enabling interrupt-remapping) then
1977                  * things are already rolling from a sane state.
1978                  */
1979                 if (iommu->qi)
1980                         continue;
1981
1982                 /*
1983                  * Clear any previous faults.
1984                  */
1985                 dmar_fault(-1, iommu);
1986                 /*
1987                  * Disable queued invalidation if supported and already enabled
1988                  * before OS handover.
1989                  */
1990                 dmar_disable_qi(iommu);
1991         }
1992
1993         for_each_drhd_unit(drhd) {
1994                 if (drhd->ignored)
1995                         continue;
1996
1997                 iommu = drhd->iommu;
1998
1999                 if (dmar_enable_qi(iommu)) {
2000                         /*
2001                          * Queued Invalidate not enabled, use Register Based
2002                          * Invalidate
2003                          */
2004                         iommu->flush.flush_context = __iommu_flush_context;
2005                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2006                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2007                                "invalidation\n",
2008                                (unsigned long long)drhd->reg_base_addr);
2009                 } else {
2010                         iommu->flush.flush_context = qi_flush_context;
2011                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2012                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2013                                "invalidation\n",
2014                                (unsigned long long)drhd->reg_base_addr);
2015                 }
2016         }
2017
2018 #ifdef CONFIG_INTR_REMAP
2019         if (!intr_remapping_enabled) {
2020                 ret = enable_intr_remapping(0);
2021                 if (ret)
2022                         printk(KERN_ERR
2023                                "IOMMU: enable interrupt remapping failed\n");
2024         }
2025 #endif
2026         /*
2027          * If pass through is set and enabled, context entries of all PCI
2028          * devices are initialized with the pass-through translation type.
2029          */
2030         if (iommu_pass_through) {
2031                 ret = init_context_pass_through();
2032                 if (ret) {
2033                         printk(KERN_ERR "IOMMU: Pass through init failed.\n");
2034                         iommu_pass_through = 0;
2035                 }
2036         }
2037
2038         /*
2039          * If pass through is not set or not enabled, setup context entries for
2040          * identity mappings for rmrr, gfx, and isa.
2041          */
2042         if (!iommu_pass_through) {
2043                 /*
2044                  * For each rmrr
2045                  *   for each dev attached to rmrr
2046                  *   do
2047                  *     locate drhd for dev, alloc domain for dev
2048                  *     allocate free domain
2049                  *     allocate page table entries for rmrr
2050                  *     if context not allocated for bus
2051                  *           allocate and init context
2052                  *           set present in root table for this bus
2053                  *     init context with domain, translation etc
2054                  *    endfor
2055                  * endfor
2056                  */
2057                 for_each_rmrr_units(rmrr) {
2058                         for (i = 0; i < rmrr->devices_cnt; i++) {
2059                                 pdev = rmrr->devices[i];
2060                                 /*
2061                                  * some BIOSes list non-existent devices
2062                                  * in the DMAR table.
2063                                  */
2064                                 if (!pdev)
2065                                         continue;
2066                                 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2067                                 if (ret)
2068                                         printk(KERN_ERR
2069                                  "IOMMU: mapping reserved region failed\n");
2070                         }
2071                 }
2072
2073                 iommu_prepare_gfx_mapping();
2074
2075                 iommu_prepare_isa();
2076         }
2077
2078         /*
2079          * for each drhd
2080          *   enable fault log
2081          *   global invalidate context cache
2082          *   global invalidate iotlb
2083          *   enable translation
2084          */
2085         for_each_drhd_unit(drhd) {
2086                 if (drhd->ignored)
2087                         continue;
2088                 iommu = drhd->iommu;
2089
2090                 iommu_flush_write_buffer(iommu);
2091
2092                 ret = dmar_set_interrupt(iommu);
2093                 if (ret)
2094                         goto error;
2095
2096                 iommu_set_root_entry(iommu);
2097
2098                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2099                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2100                                          0);
2101                 iommu_disable_protect_mem_regions(iommu);
2102
2103                 ret = iommu_enable_translation(iommu);
2104                 if (ret)
2105                         goto error;
2106         }
2107
2108         return 0;
2109 error:
2110         for_each_drhd_unit(drhd) {
2111                 if (drhd->ignored)
2112                         continue;
2113                 iommu = drhd->iommu;
2114                 free_iommu(iommu);
2115         }
2116         kfree(g_iommus);
2117         return ret;
2118 }
2119
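/*
 * Round a (host_addr, size) pair up to whole pages, accounting for the
 * offset of host_addr within its first page.  For example, with 4KiB
 * pages, aligned_size(0x1001, 0x1000) spans two pages and returns
 * 0x2000.
 */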
2120 static inline u64 aligned_size(u64 host_addr, size_t size)
2121 {
2122         u64 addr;
2123         addr = (host_addr & (~PAGE_MASK)) + size;
2124         return PAGE_ALIGN(addr);
2125 }
2126
2127 struct iova *
2128 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2129 {
2130         struct iova *piova;
2131
2132         /* Make sure it's in range */
2133         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2134         if (!size || (IOVA_START_ADDR + size > end))
2135                 return NULL;
2136
2137         piova = alloc_iova(&domain->iovad,
2138                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2139         return piova;
2140 }
2141
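/*
 * Allocate an iova range of 'size' bytes for the device.  If the
 * device is limited to 32-bit DMA or forcedac is set, allocate
 * straight from the given mask; otherwise prefer an address below
 * 4GiB and only fall back to the full dma_mask if that fails.
 */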
2142 static struct iova *
2143 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2144                    size_t size, u64 dma_mask)
2145 {
2146         struct pci_dev *pdev = to_pci_dev(dev);
2147         struct iova *iova = NULL;
2148
2149         if (dma_mask <= DMA_BIT_MASK(32) || dmar_forcedac)
2150                 iova = iommu_alloc_iova(domain, size, dma_mask);
2151         else {
2152                 /*
2153                  * First try to allocate an io virtual address in
2154                  * DMA_BIT_MASK(32) and if that fails then try allocating
2155                  * from higher range
2156                  */
2157                 iova = iommu_alloc_iova(domain, size, DMA_BIT_MASK(32));
2158                 if (!iova)
2159                         iova = iommu_alloc_iova(domain, size, dma_mask);
2160         }
2161
2162         if (!iova) {
2163                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2164                 return NULL;
2165         }
2166
2167         return iova;
2168 }
2169
2170 static struct dmar_domain *
2171 get_valid_domain_for_dev(struct pci_dev *pdev)
2172 {
2173         struct dmar_domain *domain;
2174         int ret;
2175
2176         domain = get_domain_for_dev(pdev,
2177                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2178         if (!domain) {
2179                 printk(KERN_ERR
2180                         "Allocating domain for %s failed\n", pci_name(pdev));
2181                 return NULL;
2182         }
2183
2184         /* make sure context mapping is ok */
2185         if (unlikely(!domain_context_mapped(pdev))) {
2186                 ret = domain_context_mapping(domain, pdev,
2187                                              CONTEXT_TT_MULTI_LEVEL);
2188                 if (ret) {
2189                         printk(KERN_ERR
2190                                 "Domain context map for %s failed\n",
2191                                 pci_name(pdev));
2192                         return NULL;
2193                 }
2194         }
2195
2196         return domain;
2197 }
2198
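/*
 * Core of the streaming DMA map path: look up (or create) the device's
 * domain, allocate an iova range, build page-table entries with
 * protection bits derived from the DMA direction, then flush the IOTLB
 * for the new mapping (falling back to a write-buffer flush) and return
 * the resulting bus address.
 */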
2199 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2200                                      size_t size, int dir, u64 dma_mask)
2201 {
2202         struct pci_dev *pdev = to_pci_dev(hwdev);
2203         struct dmar_domain *domain;
2204         phys_addr_t start_paddr;
2205         struct iova *iova;
2206         int prot = 0;
2207         int ret;
2208         struct intel_iommu *iommu;
2209
2210         BUG_ON(dir == DMA_NONE);
2211         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2212                 return paddr;
2213
2214         domain = get_valid_domain_for_dev(pdev);
2215         if (!domain)
2216                 return 0;
2217
2218         iommu = domain_get_iommu(domain);
2219         size = aligned_size((u64)paddr, size);
2220
2221         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2222         if (!iova)
2223                 goto error;
2224
2225         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2226
2227         /*
2228          * Check if DMAR supports zero-length reads on write-only
2229          * mappings.
2230          */
2231         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2232                         !cap_zlr(iommu->cap))
2233                 prot |= DMA_PTE_READ;
2234         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2235                 prot |= DMA_PTE_WRITE;
2236         /*
2237          * paddr to (paddr + size) might span a partial page; we should map the
2238          * whole page.  Note: if two parts of one page are mapped separately, we
2239          * might have two guest addresses mapping to the same host paddr, but
2240          * this is not a big problem
2241          */
2242         ret = domain_page_mapping(domain, start_paddr,
2243                 ((u64)paddr) & PAGE_MASK, size, prot);
2244         if (ret)
2245                 goto error;
2246
2247         /* it's a non-present to present mapping */
2248         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2249                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2250         if (ret)
2251                 iommu_flush_write_buffer(iommu);
2252
2253         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2254
2255 error:
2256         if (iova)
2257                 __free_iova(&domain->iovad, iova);
2258         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2259                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2260         return 0;
2261 }
2262
2263 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2264                                  unsigned long offset, size_t size,
2265                                  enum dma_data_direction dir,
2266                                  struct dma_attrs *attrs)
2267 {
2268         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2269                                   dir, to_pci_dev(dev)->dma_mask);
2270 }
2271
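/*
 * Deferred-unmap machinery: instead of flushing the IOTLB on every
 * unmap, freed iovas are queued per-iommu in deferred_flush[] and
 * released in batches here after a single global IOTLB flush per
 * iommu.  Callers hold async_umap_flush_lock.
 */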
2272 static void flush_unmaps(void)
2273 {
2274         int i, j;
2275
2276         timer_on = 0;
2277
2278         /* just flush them all */
2279         for (i = 0; i < g_num_of_iommus; i++) {
2280                 struct intel_iommu *iommu = g_iommus[i];
2281                 if (!iommu)
2282                         continue;
2283
2284                 if (deferred_flush[i].next) {
2285                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2286                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2287                         for (j = 0; j < deferred_flush[i].next; j++) {
2288                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2289                                                 deferred_flush[i].iova[j]);
2290                         }
2291                         deferred_flush[i].next = 0;
2292                 }
2293         }
2294
2295         list_size = 0;
2296 }
2297
2298 static void flush_unmaps_timeout(unsigned long data)
2299 {
2300         unsigned long flags;
2301
2302         spin_lock_irqsave(&async_umap_flush_lock, flags);
2303         flush_unmaps();
2304         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2305 }
2306
2307 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2308 {
2309         unsigned long flags;
2310         int next, iommu_id;
2311         struct intel_iommu *iommu;
2312
2313         spin_lock_irqsave(&async_umap_flush_lock, flags);
2314         if (list_size == HIGH_WATER_MARK)
2315                 flush_unmaps();
2316
2317         iommu = domain_get_iommu(dom);
2318         iommu_id = iommu->seq_id;
2319
2320         next = deferred_flush[iommu_id].next;
2321         deferred_flush[iommu_id].domain[next] = dom;
2322         deferred_flush[iommu_id].iova[next] = iova;
2323         deferred_flush[iommu_id].next++;
2324
2325         if (!timer_on) {
2326                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2327                 timer_on = 1;
2328         }
2329         list_size++;
2330         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2331 }
2332
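/*
 * Unmap a previously mapped range: look up the iova, clear the ptes
 * and free the page tables covering it.  In strict mode the IOTLB is
 * flushed and the iova freed immediately; otherwise the iova is queued
 * via add_unmap() and reclaimed lazily by the flush timer.
 */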
2333 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2334                              size_t size, enum dma_data_direction dir,
2335                              struct dma_attrs *attrs)
2336 {
2337         struct pci_dev *pdev = to_pci_dev(dev);
2338         struct dmar_domain *domain;
2339         unsigned long start_addr;
2340         struct iova *iova;
2341         struct intel_iommu *iommu;
2342
2343         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2344                 return;
2345         domain = find_domain(pdev);
2346         BUG_ON(!domain);
2347
2348         iommu = domain_get_iommu(domain);
2349
2350         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2351         if (!iova)
2352                 return;
2353
2354         start_addr = iova->pfn_lo << PAGE_SHIFT;
2355         size = aligned_size((u64)dev_addr, size);
2356
2357         pr_debug("Device %s unmapping: %zx@%llx\n",
2358                 pci_name(pdev), size, (unsigned long long)start_addr);
2359
2360         /* clear the whole mapped range */
2361         dma_pte_clear_range(domain, start_addr, start_addr + size);
2362         /* free page tables */
2363         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2364         if (intel_iommu_strict) {
2365                 if (iommu_flush_iotlb_psi(iommu,
2366                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2367                         iommu_flush_write_buffer(iommu);
2368                 /* free iova */
2369                 __free_iova(&domain->iovad, iova);
2370         } else {
2371                 add_unmap(domain, iova);
2372                 /*
2373                  * queue up the release of the unmap to save the 1/6th of the
2374                  * CPU time used up by the iotlb flush operation...
2375                  */
2376         }
2377 }
2378
2379 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2380                                int dir)
2381 {
2382         intel_unmap_page(dev, dev_addr, size, dir, NULL);
2383 }
2384
2385 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2386                                   dma_addr_t *dma_handle, gfp_t flags)
2387 {
2388         void *vaddr;
2389         int order;
2390
2391         size = PAGE_ALIGN(size);
2392         order = get_order(size);
2393         flags &= ~(GFP_DMA | GFP_DMA32);
2394
2395         vaddr = (void *)__get_free_pages(flags, order);
2396         if (!vaddr)
2397                 return NULL;
2398         memset(vaddr, 0, size);
2399
2400         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2401                                          DMA_BIDIRECTIONAL,
2402                                          hwdev->coherent_dma_mask);
2403         if (*dma_handle)
2404                 return vaddr;
2405         free_pages((unsigned long)vaddr, order);
2406         return NULL;
2407 }
2408
2409 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2410                                 dma_addr_t dma_handle)
2411 {
2412         int order;
2413
2414         size = PAGE_ALIGN(size);
2415         order = get_order(size);
2416
2417         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2418         free_pages((unsigned long)vaddr, order);
2419 }
2420
2421 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2422                            int nelems, enum dma_data_direction dir,
2423                            struct dma_attrs *attrs)
2424 {
2425         int i;
2426         struct pci_dev *pdev = to_pci_dev(hwdev);
2427         struct dmar_domain *domain;
2428         unsigned long start_addr;
2429         struct iova *iova;
2430         size_t size = 0;
2431         phys_addr_t addr;
2432         struct scatterlist *sg;
2433         struct intel_iommu *iommu;
2434
2435         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2436                 return;
2437
2438         domain = find_domain(pdev);
2439         BUG_ON(!domain);
2440
2441         iommu = domain_get_iommu(domain);
2442
2443         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2444         if (!iova)
2445                 return;
2446         for_each_sg(sglist, sg, nelems, i) {
2447                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2448                 size += aligned_size((u64)addr, sg->length);
2449         }
2450
2451         start_addr = iova->pfn_lo << PAGE_SHIFT;
2452
2453         /* clear the whole mapped range */
2454         dma_pte_clear_range(domain, start_addr, start_addr + size);
2455         /* free page tables */
2456         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2457
2458         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2459                         size >> VTD_PAGE_SHIFT, 0))
2460                 iommu_flush_write_buffer(iommu);
2461
2462         /* free iova */
2463         __free_iova(&domain->iovad, iova);
2464 }
2465
2466 static int intel_nontranslate_map_sg(struct device *hddev,
2467         struct scatterlist *sglist, int nelems, int dir)
2468 {
2469         int i;
2470         struct scatterlist *sg;
2471
2472         for_each_sg(sglist, sg, nelems, i) {
2473                 BUG_ON(!sg_page(sg));
2474                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2475                 sg->dma_length = sg->length;
2476         }
2477         return nelems;
2478 }
2479
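/*
 * Map a scatterlist: the total (page-aligned) size is computed first so
 * that a single contiguous iova region can be allocated, then each
 * segment is mapped at its running offset within that region.  On
 * failure the partially built mapping is torn down and 0 is returned.
 */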
2480 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2481                         enum dma_data_direction dir, struct dma_attrs *attrs)
2482 {
2483         phys_addr_t addr;
2484         int i;
2485         struct pci_dev *pdev = to_pci_dev(hwdev);
2486         struct dmar_domain *domain;
2487         size_t size = 0;
2488         int prot = 0;
2489         size_t offset = 0;
2490         struct iova *iova = NULL;
2491         int ret;
2492         struct scatterlist *sg;
2493         unsigned long start_addr;
2494         struct intel_iommu *iommu;
2495
2496         BUG_ON(dir == DMA_NONE);
2497         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2498                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2499
2500         domain = get_valid_domain_for_dev(pdev);
2501         if (!domain)
2502                 return 0;
2503
2504         iommu = domain_get_iommu(domain);
2505
2506         for_each_sg(sglist, sg, nelems, i) {
2507                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2508                 size += aligned_size((u64)addr, sg->length);
2509         }
2510
2511         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2512         if (!iova) {
2513                 sglist->dma_length = 0;
2514                 return 0;
2515         }
2516
2517         /*
2518          * Check if DMAR supports zero-length reads on write-only
2519          * mappings.
2520          */
2521         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2522                         !cap_zlr(iommu->cap))
2523                 prot |= DMA_PTE_READ;
2524         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2525                 prot |= DMA_PTE_WRITE;
2526
2527         start_addr = iova->pfn_lo << PAGE_SHIFT;
2528         offset = 0;
2529         for_each_sg(sglist, sg, nelems, i) {
2530                 addr = page_to_phys(sg_page(sg)) + sg->offset;
2531                 size = aligned_size((u64)addr, sg->length);
2532                 ret = domain_page_mapping(domain, start_addr + offset,
2533                         ((u64)addr) & PAGE_MASK,
2534                         size, prot);
2535                 if (ret) {
2536                         /* clear the pages mapped so far */
2537                         dma_pte_clear_range(domain, start_addr,
2538                                   start_addr + offset);
2539                         /* free page tables */
2540                         dma_pte_free_pagetable(domain, start_addr,
2541                                   start_addr + offset);
2542                         /* free iova */
2543                         __free_iova(&domain->iovad, iova);
2544                         return 0;
2545                 }
2546                 sg->dma_address = start_addr + offset +
2547                                 ((u64)addr & (~PAGE_MASK));
2548                 sg->dma_length = sg->length;
2549                 offset += size;
2550         }
2551
2552         /* it's a non-present to present mapping */
2553         if (iommu_flush_iotlb_psi(iommu, domain->id,
2554                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2555                 iommu_flush_write_buffer(iommu);
2556         return nelems;
2557 }
2558
2559 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2560 {
2561         return !dma_addr;
2562 }
2563
2564 struct dma_map_ops intel_dma_ops = {
2565         .alloc_coherent = intel_alloc_coherent,
2566         .free_coherent = intel_free_coherent,
2567         .map_sg = intel_map_sg,
2568         .unmap_sg = intel_unmap_sg,
2569         .map_page = intel_map_page,
2570         .unmap_page = intel_unmap_page,
2571         .mapping_error = intel_mapping_error,
2572 };
2573
2574 static inline int iommu_domain_cache_init(void)
2575 {
2576         int ret = 0;
2577
2578         iommu_domain_cache = kmem_cache_create("iommu_domain",
2579                                          sizeof(struct dmar_domain),
2580                                          0,
2581                                          SLAB_HWCACHE_ALIGN,
2583                                          NULL);
2584         if (!iommu_domain_cache) {
2585                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2586                 ret = -ENOMEM;
2587         }
2588
2589         return ret;
2590 }
2591
2592 static inline int iommu_devinfo_cache_init(void)
2593 {
2594         int ret = 0;
2595
2596         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2597                                          sizeof(struct device_domain_info),
2598                                          0,
2599                                          SLAB_HWCACHE_ALIGN,
2600                                          NULL);
2601         if (!iommu_devinfo_cache) {
2602                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2603                 ret = -ENOMEM;
2604         }
2605
2606         return ret;
2607 }
2608
2609 static inline int iommu_iova_cache_init(void)
2610 {
2611         int ret = 0;
2612
2613         iommu_iova_cache = kmem_cache_create("iommu_iova",
2614                                          sizeof(struct iova),
2615                                          0,
2616                                          SLAB_HWCACHE_ALIGN,
2617                                          NULL);
2618         if (!iommu_iova_cache) {
2619                 printk(KERN_ERR "Couldn't create iova cache\n");
2620                 ret = -ENOMEM;
2621         }
2622
2623         return ret;
2624 }
2625
2626 static int __init iommu_init_mempool(void)
2627 {
2628         int ret;
2629         ret = iommu_iova_cache_init();
2630         if (ret)
2631                 return ret;
2632
2633         ret = iommu_domain_cache_init();
2634         if (ret)
2635                 goto domain_error;
2636
2637         ret = iommu_devinfo_cache_init();
2638         if (!ret)
2639                 return ret;
2640
2641         kmem_cache_destroy(iommu_domain_cache);
2642 domain_error:
2643         kmem_cache_destroy(iommu_iova_cache);
2644
2645         return -ENOMEM;
2646 }
2647
2648 static void __init iommu_exit_mempool(void)
2649 {
2650         kmem_cache_destroy(iommu_devinfo_cache);
2651         kmem_cache_destroy(iommu_domain_cache);
2652         kmem_cache_destroy(iommu_iova_cache);
2653
2654 }
2655
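/*
 * Mark DMAR units that cover no PCI devices as ignored.  If gfx
 * mapping is disabled, also ignore units that cover only graphics
 * devices and tag those devices with DUMMY_DEVICE_DOMAIN_INFO so the
 * DMA ops bypass translation for them.
 */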
2656 static void __init init_no_remapping_devices(void)
2657 {
2658         struct dmar_drhd_unit *drhd;
2659
2660         for_each_drhd_unit(drhd) {
2661                 if (!drhd->include_all) {
2662                         int i;
2663                         for (i = 0; i < drhd->devices_cnt; i++)
2664                                 if (drhd->devices[i] != NULL)
2665                                         break;
2666                         /* ignore DMAR unit if no pci devices exist */
2667                         if (i == drhd->devices_cnt)
2668                                 drhd->ignored = 1;
2669                 }
2670         }
2671
2672         if (dmar_map_gfx)
2673                 return;
2674
2675         for_each_drhd_unit(drhd) {
2676                 int i;
2677                 if (drhd->ignored || drhd->include_all)
2678                         continue;
2679
2680                 for (i = 0; i < drhd->devices_cnt; i++)
2681                         if (drhd->devices[i] &&
2682                                 !IS_GFX_DEVICE(drhd->devices[i]))
2683                                 break;
2684
2685                 if (i < drhd->devices_cnt)
2686                         continue;
2687
2688                 /* bypass IOMMU if it is just for gfx devices */
2689                 drhd->ignored = 1;
2690                 for (i = 0; i < drhd->devices_cnt; i++) {
2691                         if (!drhd->devices[i])
2692                                 continue;
2693                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2694                 }
2695         }
2696 }
2697
2698 #ifdef CONFIG_SUSPEND
2699 static int init_iommu_hw(void)
2700 {
2701         struct dmar_drhd_unit *drhd;
2702         struct intel_iommu *iommu = NULL;
2703
2704         for_each_active_iommu(iommu, drhd)
2705                 if (iommu->qi)
2706                         dmar_reenable_qi(iommu);
2707
2708         for_each_active_iommu(iommu, drhd) {
2709                 iommu_flush_write_buffer(iommu);
2710
2711                 iommu_set_root_entry(iommu);
2712
2713                 iommu->flush.flush_context(iommu, 0, 0, 0,
2714                                                 DMA_CCMD_GLOBAL_INVL);
2715                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2716                                                 DMA_TLB_GLOBAL_FLUSH, 0);
2717                 iommu_disable_protect_mem_regions(iommu);
2718                 iommu_enable_translation(iommu);
2719         }
2720
2721         return 0;
2722 }
2723
2724 static void iommu_flush_all(void)
2725 {
2726         struct dmar_drhd_unit *drhd;
2727         struct intel_iommu *iommu;
2728
2729         for_each_active_iommu(iommu, drhd) {
2730                 iommu->flush.flush_context(iommu, 0, 0, 0,
2731                                                 DMA_CCMD_GLOBAL_INVL);
2732                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2733                                                 DMA_TLB_GLOBAL_FLUSH, 0);
2734         }
2735 }
2736
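/*
 * Suspend support: flush everything, disable translation and save the
 * fault-event registers of each active iommu so that iommu_resume()
 * can restore them after init_iommu_hw() re-enables translation.
 */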
2737 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
2738 {
2739         struct dmar_drhd_unit *drhd;
2740         struct intel_iommu *iommu = NULL;
2741         unsigned long flag;
2742
2743         for_each_active_iommu(iommu, drhd) {
2744                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
2745                                                  GFP_ATOMIC);
2746                 if (!iommu->iommu_state)
2747                         goto nomem;
2748         }
2749
2750         iommu_flush_all();
2751
2752         for_each_active_iommu(iommu, drhd) {
2753                 iommu_disable_translation(iommu);
2754
2755                 spin_lock_irqsave(&iommu->register_lock, flag);
2756
2757                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2758                         readl(iommu->reg + DMAR_FECTL_REG);
2759                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2760                         readl(iommu->reg + DMAR_FEDATA_REG);
2761                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2762                         readl(iommu->reg + DMAR_FEADDR_REG);
2763                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2764                         readl(iommu->reg + DMAR_FEUADDR_REG);
2765
2766                 spin_unlock_irqrestore(&iommu->register_lock, flag);
2767         }
2768         return 0;
2769
2770 nomem:
2771         for_each_active_iommu(iommu, drhd)
2772                 kfree(iommu->iommu_state);
2773
2774         return -ENOMEM;
2775 }
2776
2777 static int iommu_resume(struct sys_device *dev)
2778 {
2779         struct dmar_drhd_unit *drhd;
2780         struct intel_iommu *iommu = NULL;
2781         unsigned long flag;
2782
2783         if (init_iommu_hw()) {
2784                 WARN(1, "IOMMU setup failed, DMAR cannot resume!\n");
2785                 return -EIO;
2786         }
2787
2788         for_each_active_iommu(iommu, drhd) {
2789
2790                 spin_lock_irqsave(&iommu->register_lock, flag);
2791
2792                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2793                         iommu->reg + DMAR_FECTL_REG);
2794                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2795                         iommu->reg + DMAR_FEDATA_REG);
2796                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2797                         iommu->reg + DMAR_FEADDR_REG);
2798                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2799                         iommu->reg + DMAR_FEUADDR_REG);
2800
2801                 spin_unlock_irqrestore(&iommu->register_lock, flag);
2802         }
2803
2804         for_each_active_iommu(iommu, drhd)
2805                 kfree(iommu->iommu_state);
2806
2807         return 0;
2808 }
2809
2810 static struct sysdev_class iommu_sysclass = {
2811         .name           = "iommu",
2812         .resume         = iommu_resume,
2813         .suspend        = iommu_suspend,
2814 };
2815
2816 static struct sys_device device_iommu = {
2817         .cls    = &iommu_sysclass,
2818 };
2819
2820 static int __init init_iommu_sysfs(void)
2821 {
2822         int error;
2823
2824         error = sysdev_class_register(&iommu_sysclass);
2825         if (error)
2826                 return error;
2827
2828         error = sysdev_register(&device_iommu);
2829         if (error)
2830                 sysdev_class_unregister(&iommu_sysclass);
2831
2832         return error;
2833 }
2834
2835 #else
2836 static int __init init_iommu_sysfs(void)
2837 {
2838         return 0;
2839 }
2840 #endif  /* CONFIG_SUSPEND */
2841
2842 int __init intel_iommu_init(void)
2843 {
2844         int ret = 0;
2845
2846         if (dmar_table_init())
2847                 return  -ENODEV;
2848
2849         if (dmar_dev_scope_init())
2850                 return  -ENODEV;
2851
2852         /*
2853          * Check the need for DMA-remapping initialization now.
2854          * Above initialization will also be used by Interrupt-remapping.
2855          */
2856         if (no_iommu || (swiotlb && !iommu_pass_through) || dmar_disabled)
2857                 return -ENODEV;
2858
2859         iommu_init_mempool();
2860         dmar_init_reserved_ranges();
2861
2862         init_no_remapping_devices();
2863
2864         ret = init_dmars();
2865         if (ret) {
2866                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2867                 put_iova_domain(&reserved_iova_list);
2868                 iommu_exit_mempool();
2869                 return ret;
2870         }
2871         printk(KERN_INFO
2872         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2873
2874         init_timer(&unmap_timer);
2875         force_iommu = 1;
2876
2877         if (!iommu_pass_through) {
2878                 printk(KERN_INFO
2879                        "Multi-level page-table translation for DMAR.\n");
2880                 dma_ops = &intel_dma_ops;
2881         } else
2882                 printk(KERN_INFO
2883                        "DMAR: Pass through translation for DMAR.\n");
2884
2885         init_iommu_sysfs();
2886
2887         register_iommu(&intel_iommu_ops);
2888
2889         return 0;
2890 }
2891
2892 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2893                                   struct pci_dev *pdev)
2894 {
2895         struct device_domain_info *info;
2896         unsigned long flags;
2897
2898         info = alloc_devinfo_mem();
2899         if (!info)
2900                 return -ENOMEM;
2901
2902         info->segment = pci_domain_nr(pdev->bus);
2903         info->bus = pdev->bus->number;
2904         info->devfn = pdev->devfn;
2905         info->dev = pdev;
2906         info->domain = domain;
2907
2908         spin_lock_irqsave(&device_domain_lock, flags);
2909         list_add(&info->link, &domain->devices);
2910         list_add(&info->global, &device_domain_list);
2911         pdev->dev.archdata.iommu = info;
2912         spin_unlock_irqrestore(&device_domain_lock, flags);
2913
2914         return 0;
2915 }
2916
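/*
 * If the device sits behind a PCIe-to-PCI bridge, clear the context
 * entries programmed for every bridge on the path as well: each parent
 * bridge's (bus, devfn), and the bridge's secondary bus (devfn 0) for
 * a PCIe bridge or the bridge itself for a legacy PCI bridge.
 */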
2917 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
2918                                            struct pci_dev *pdev)
2919 {
2920         struct pci_dev *tmp, *parent;
2921
2922         if (!iommu || !pdev)
2923                 return;
2924
2925         /* dependent device detach */
2926         tmp = pci_find_upstream_pcie_bridge(pdev);
2927         /* Secondary interface's bus number and devfn 0 */
2928         if (tmp) {
2929                 parent = pdev->bus->self;
2930                 while (parent != tmp) {
2931                         iommu_detach_dev(iommu, parent->bus->number,
2932                                          parent->devfn);
2933                         parent = parent->bus->self;
2934                 }
2935                 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
2936                         iommu_detach_dev(iommu,
2937                                 tmp->subordinate->number, 0);
2938                 else /* this is a legacy PCI bridge */
2939                         iommu_detach_dev(iommu, tmp->bus->number,
2940                                          tmp->devfn);
2941         }
2942 }
2943
2944 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2945                                           struct pci_dev *pdev)
2946 {
2947         struct device_domain_info *info;
2948         struct intel_iommu *iommu;
2949         unsigned long flags;
2950         int found = 0;
2951         struct list_head *entry, *tmp;
2952
2953         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
2954                                 pdev->devfn);
2955         if (!iommu)
2956                 return;
2957
2958         spin_lock_irqsave(&device_domain_lock, flags);
2959         list_for_each_safe(entry, tmp, &domain->devices) {
2960                 info = list_entry(entry, struct device_domain_info, link);
2961                 /* No need to compare PCI domain; it has to be the same */
2962                 if (info->bus == pdev->bus->number &&
2963                     info->devfn == pdev->devfn) {
2964                         list_del(&info->link);
2965                         list_del(&info->global);
2966                         if (info->dev)
2967                                 info->dev->dev.archdata.iommu = NULL;
2968                         spin_unlock_irqrestore(&device_domain_lock, flags);
2969
2970                         iommu_detach_dev(iommu, info->bus, info->devfn);
2971                         iommu_detach_dependent_devices(iommu, pdev);
2972                         free_devinfo_mem(info);
2973
2974                         spin_lock_irqsave(&device_domain_lock, flags);
2975
2976                         if (found)
2977                                 break;
2978                         else
2979                                 continue;
2980                 }
2981
2982                 /* if there are no other devices under the same iommu
2983                  * owned by this domain, clear this iommu in iommu_bmp,
2984                  * update iommu count and coherency
2985                  */
2986                 if (iommu == device_to_iommu(info->segment, info->bus,
2987                                             info->devfn))
2988                         found = 1;
2989         }
2990
2991         if (found == 0) {
2992                 unsigned long tmp_flags;
2993                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2994                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2995                 domain->iommu_count--;
2996                 domain_update_iommu_cap(domain);
2997                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2998         }
2999
3000         spin_unlock_irqrestore(&device_domain_lock, flags);
3001 }
3002
3003 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3004 {
3005         struct device_domain_info *info;
3006         struct intel_iommu *iommu;
3007         unsigned long flags1, flags2;
3008
3009         spin_lock_irqsave(&device_domain_lock, flags1);
3010         while (!list_empty(&domain->devices)) {
3011                 info = list_entry(domain->devices.next,
3012                         struct device_domain_info, link);
3013                 list_del(&info->link);
3014                 list_del(&info->global);
3015                 if (info->dev)
3016                         info->dev->dev.archdata.iommu = NULL;
3017
3018                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3019
3020                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3021                 iommu_detach_dev(iommu, info->bus, info->devfn);
3022                 iommu_detach_dependent_devices(iommu, info->dev);
3023
3024                 /* clear this iommu in iommu_bmp, update iommu count
3025                  * and capabilities
3026                  */
3027                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3028                 if (test_and_clear_bit(iommu->seq_id,
3029                                        &domain->iommu_bmp)) {
3030                         domain->iommu_count--;
3031                         domain_update_iommu_cap(domain);
3032                 }
3033                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3034
3035                 free_devinfo_mem(info);
3036                 spin_lock_irqsave(&device_domain_lock, flags1);
3037         }
3038         spin_unlock_irqrestore(&device_domain_lock, flags1);
3039 }
3040
3041 /* domain id for virtual machine; it won't be set in context entries */
3042 static unsigned long vm_domid;
3043
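/*
 * Return the smallest adjusted guest address width (agaw) among the
 * iommus set in this domain's iommu_bmp, starting from the domain's
 * own agaw.
 */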
3044 static int vm_domain_min_agaw(struct dmar_domain *domain)
3045 {
3046         int i;
3047         int min_agaw = domain->agaw;
3048
3049         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
3050         for (; i < g_num_of_iommus; ) {
3051                 if (min_agaw > g_iommus[i]->agaw)
3052                         min_agaw = g_iommus[i]->agaw;
3053
3054                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
3055         }
3056
3057         return min_agaw;
3058 }
3059
3060 static struct dmar_domain *iommu_alloc_vm_domain(void)
3061 {
3062         struct dmar_domain *domain;
3063
3064         domain = alloc_domain_mem();
3065         if (!domain)
3066                 return NULL;
3067
3068         domain->id = vm_domid++;
3069         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3070         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3071
3072         return domain;
3073 }
3074
3075 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
3076 {
3077         int adjust_width;
3078
3079         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3080         spin_lock_init(&domain->mapping_lock);
3081         spin_lock_init(&domain->iommu_lock);
3082
3083         domain_reserve_special_ranges(domain);
3084
3085         /* calculate AGAW */
3086         domain->gaw = guest_width;
3087         adjust_width = guestwidth_to_adjustwidth(guest_width);
3088         domain->agaw = width_to_agaw(adjust_width);
3089
3090         INIT_LIST_HEAD(&domain->devices);
3091
3092         domain->iommu_count = 0;
3093         domain->iommu_coherency = 0;
3094         domain->max_addr = 0;
3095
3096         /* always allocate the top pgd */
3097         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
3098         if (!domain->pgd)
3099                 return -ENOMEM;
3100         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3101         return 0;
3102 }
3103
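     /*
      * Release the domain id this VM domain occupies on each hardware
      * IOMMU so that the id can be reused.
      */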
3104 static void iommu_free_vm_domain(struct dmar_domain *domain)
3105 {
3106         unsigned long flags;
3107         struct dmar_drhd_unit *drhd;
3108         struct intel_iommu *iommu;
3109         unsigned long i;
3110         unsigned long ndomains;
3111
3112         for_each_drhd_unit(drhd) {
3113                 if (drhd->ignored)
3114                         continue;
3115                 iommu = drhd->iommu;
3116
3117                 ndomains = cap_ndoms(iommu->cap);
3118                 i = find_first_bit(iommu->domain_ids, ndomains);
3119                 for (; i < ndomains; ) {
3120                         if (iommu->domains[i] == domain) {
3121                                 spin_lock_irqsave(&iommu->lock, flags);
3122                                 clear_bit(i, iommu->domain_ids);
3123                                 iommu->domains[i] = NULL;
3124                                 spin_unlock_irqrestore(&iommu->lock, flags);
3125                                 break;
3126                         }
3127                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3128                 }
3129         }
3130 }
3131
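     /*
      * Tear down a VM domain completely: detach all devices, destroy its
      * iova allocator, clear and free its page tables, release its domain
      * ids and finally free the domain structure itself.
      */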
3132 static void vm_domain_exit(struct dmar_domain *domain)
3133 {
3134         u64 end;
3135
3136         /* Domain 0 is reserved, so don't process it */
3137         if (!domain)
3138                 return;
3139
3140         vm_domain_remove_all_dev_info(domain);
3141         /* destroy iovas */
3142         put_iova_domain(&domain->iovad);
3143         end = DOMAIN_MAX_ADDR(domain->gaw);
3144         end = end & VTD_PAGE_MASK;
3145
3146         /* clear ptes */
3147         dma_pte_clear_range(domain, 0, end);
3148
3149         /* free page tables */
3150         dma_pte_free_pagetable(domain, 0, end);
3151
3152         iommu_free_vm_domain(domain);
3153         free_domain_mem(domain);
3154 }
3155
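     /*
      * iommu_ops callback: back a generic iommu_domain with a newly
      * allocated and initialized VM dmar_domain.
      */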
3156 static int intel_iommu_domain_init(struct iommu_domain *domain)
3157 {
3158         struct dmar_domain *dmar_domain;
3159
3160         dmar_domain = iommu_alloc_vm_domain();
3161         if (!dmar_domain) {
3162                 printk(KERN_ERR
3163                         "intel_iommu_domain_init: failed to allocate dmar_domain\n");
3164                 return -ENOMEM;
3165         }
3166         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3167                 printk(KERN_ERR
3168                         "intel_iommu_domain_init: vm_domain_init failed\n");
3169                 vm_domain_exit(dmar_domain);
3170                 return -ENOMEM;
3171         }
3172         domain->priv = dmar_domain;
3173
3174         return 0;
3175 }
3176
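     /* iommu_ops callback: tear down the dmar_domain behind this iommu_domain. */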
3177 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3178 {
3179         struct dmar_domain *dmar_domain = domain->priv;
3180
3181         domain->priv = NULL;
3182         vm_domain_exit(dmar_domain);
3183 }
3184
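     /*
      * iommu_ops callback: attach a PCI device to a VM domain.  Any domain
      * the device was previously attached to is torn down first, the IOMMU
      * serving the device is checked for a sufficient address width, and
      * the context mapping and device info are then established.
      */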
3185 static int intel_iommu_attach_device(struct iommu_domain *domain,
3186                                      struct device *dev)
3187 {
3188         struct dmar_domain *dmar_domain = domain->priv;
3189         struct pci_dev *pdev = to_pci_dev(dev);
3190         struct intel_iommu *iommu;
3191         int addr_width;
3192         u64 end;
3193         int ret;
3194
3195         /* normally pdev is not mapped */
3196         if (unlikely(domain_context_mapped(pdev))) {
3197                 struct dmar_domain *old_domain;
3198
3199                 old_domain = find_domain(pdev);
3200                 if (old_domain) {
3201                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3202                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3203                         else
3204                                 domain_remove_dev_info(old_domain);
3205                 }
3206         }
3207
3208         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3209                                 pdev->devfn);
3210         if (!iommu)
3211                 return -ENODEV;
3212
3213         /* check if this iommu agaw is sufficient for max mapped address */
3214         addr_width = agaw_to_width(iommu->agaw);
3215         end = DOMAIN_MAX_ADDR(addr_width);
3216         end = end & VTD_PAGE_MASK;
3217         if (end < dmar_domain->max_addr) {
3218                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3219                        "sufficient for the mapped address (%llx)\n",
3220                        __func__, iommu->agaw, dmar_domain->max_addr);
3221                 return -EFAULT;
3222         }
3223
3224         ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3225         if (ret)
3226                 return ret;
3227
3228         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3229         return ret;
3230 }
3231
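     /* iommu_ops callback: detach a PCI device from its VM domain. */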
3232 static void intel_iommu_detach_device(struct iommu_domain *domain,
3233                                       struct device *dev)
3234 {
3235         struct dmar_domain *dmar_domain = domain->priv;
3236         struct pci_dev *pdev = to_pci_dev(dev);
3237
3238         vm_domain_remove_one_dev_info(dmar_domain, pdev);
3239 }
3240
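     /*
      * iommu_ops callback: map [iova, iova + size) to the host physical
      * range starting at hpa.  The IOMMU_* protection flags are translated
      * into DMA pte bits, and the request is refused if it would exceed
      * what the most restrictive IOMMU serving this domain can address.
      */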
3241 static int intel_iommu_map_range(struct iommu_domain *domain,
3242                                  unsigned long iova, phys_addr_t hpa,
3243                                  size_t size, int iommu_prot)
3244 {
3245         struct dmar_domain *dmar_domain = domain->priv;
3246         u64 max_addr;
3247         int addr_width;
3248         int prot = 0;
3249         int ret;
3250
3251         if (iommu_prot & IOMMU_READ)
3252                 prot |= DMA_PTE_READ;
3253         if (iommu_prot & IOMMU_WRITE)
3254                 prot |= DMA_PTE_WRITE;
3255         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3256                 prot |= DMA_PTE_SNP;
3257
3258         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3259         if (dmar_domain->max_addr < max_addr) {
3260                 int min_agaw;
3261                 u64 end;
3262
3263                 /* check if minimum agaw is sufficient for mapped address */
3264                 min_agaw = vm_domain_min_agaw(dmar_domain);
3265                 addr_width = agaw_to_width(min_agaw);
3266                 end = DOMAIN_MAX_ADDR(addr_width);
3267                 end = end & VTD_PAGE_MASK;
3268                 if (end < max_addr) {
3269                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3270                                "sufficient for the mapped address (%llx)\n",
3271                                __func__, min_agaw, max_addr);
3272                         return -EFAULT;
3273                 }
3274                 dmar_domain->max_addr = max_addr;
3275         }
3276
3277         ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3278         return ret;
3279 }
3280
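     /*
      * iommu_ops callback: clear the ptes covering [iova, iova + size)
      * after aligning the range to VT-d page boundaries.
      */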
3281 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3282                                     unsigned long iova, size_t size)
3283 {
3284         struct dmar_domain *dmar_domain = domain->priv;
3285         dma_addr_t base;
3286
3287         /* The address might not be aligned */
3288         base = iova & VTD_PAGE_MASK;
3289         size = VTD_PAGE_ALIGN(size);
3290         dma_pte_clear_range(dmar_domain, base, base + size);
3291
3292         if (dmar_domain->max_addr == base + size)
3293                 dmar_domain->max_addr = base;
3294 }
3295
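     /*
      * iommu_ops callback: translate an io virtual address into the host
      * physical address recorded in its pte, or 0 if nothing is mapped.
      */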
3296 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3297                                             unsigned long iova)
3298 {
3299         struct dmar_domain *dmar_domain = domain->priv;
3300         struct dma_pte *pte;
3301         u64 phys = 0;
3302
3303         pte = addr_to_dma_pte(dmar_domain, iova);
3304         if (pte)
3305                 phys = dma_pte_addr(pte);
3306
3307         return phys;
3308 }
3309
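     /*
      * iommu_ops callback: report domain capabilities; only cache
      * coherency (snoop control) is currently reported.
      */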
3310 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3311                                       unsigned long cap)
3312 {
3313         struct dmar_domain *dmar_domain = domain->priv;
3314
3315         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3316                 return dmar_domain->iommu_snooping;
3317
3318         return 0;
3319 }
3320
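     /*
      * VT-d implementation of the generic IOMMU API; this table is
      * registered with the IOMMU core during driver initialization.
      */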
3321 static struct iommu_ops intel_iommu_ops = {
3322         .domain_init    = intel_iommu_domain_init,
3323         .domain_destroy = intel_iommu_domain_destroy,
3324         .attach_dev     = intel_iommu_attach_device,
3325         .detach_dev     = intel_iommu_detach_device,
3326         .map            = intel_iommu_map_range,
3327         .unmap          = intel_iommu_unmap_range,
3328         .iova_to_phys   = intel_iommu_iova_to_phys,
3329         .domain_has_cap = intel_iommu_domain_has_cap,
3330 };
3331
3332 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3333 {
3334         /*
3335          * Mobile 4 Series Chipset neglects to set RWBF capability,
3336          * but needs it:
3337          */
3338         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3339         rwbf_quirk = 1;
3340 }
3341
3342 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);