intel-iommu: move root entry defs from dma_remapping.h
[firefly-linux-kernel-4.4.55.git] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
40 #include "pci.h"
41
42 #define ROOT_SIZE               VTD_PAGE_SIZE
43 #define CONTEXT_SIZE            VTD_PAGE_SIZE
44
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
47
48 #define IOAPIC_RANGE_START      (0xfee00000)
49 #define IOAPIC_RANGE_END        (0xfeefffff)
50 #define IOVA_START_ADDR         (0x1000)
51
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
55
56 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
57 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
58 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
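/*
 * A rough worked example, assuming 4KiB pages (PAGE_SHIFT == 12):
 *   IOVA_PFN(0xfee00000) == 0xfee00
 *   DMA_32BIT_PFN        == 0xfffff
 * i.e. the page-frame limit passed to init_iova_domain() further down.
 */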
59
60 /*
61  * 0: Present
62  * 1-11: Reserved
63  * 12-63: Context Ptr (12 - (haw-1))
64  * 64-127: Reserved
65  */
66 struct root_entry {
67         u64     val;
68         u64     rsvd1;
69 };
70 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
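/*
 * With a 4KiB VTD_PAGE_SIZE and 16-byte root entries this works out to
 * ROOT_ENTRY_NR == 4096 / 16 == 256, i.e. one root entry per PCI bus
 * number. Each present root entry points to a 4KiB context table of
 * 256 entries, one per devfn on that bus.
 */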
71 static inline bool root_present(struct root_entry *root)
72 {
73         return (root->val & 1);
74 }
75 static inline void set_root_present(struct root_entry *root)
76 {
77         root->val |= 1;
78 }
79 static inline void set_root_value(struct root_entry *root, unsigned long value)
80 {
81         root->val |= value & VTD_PAGE_MASK;
82 }
83
84 static inline struct context_entry *
85 get_context_addr_from_root(struct root_entry *root)
86 {
87         return (struct context_entry *)
88                 (root_present(root) ? phys_to_virt(
89                 root->val & VTD_PAGE_MASK) :
90                 NULL);
91 }
92
93 static void flush_unmaps_timeout(unsigned long data);
94
95 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
96
97 #define HIGH_WATER_MARK 250
98 struct deferred_flush_tables {
99         int next;
100         struct iova *iova[HIGH_WATER_MARK];
101         struct dmar_domain *domain[HIGH_WATER_MARK];
102 };
103
104 static struct deferred_flush_tables *deferred_flush;
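/*
 * Roughly: instead of flushing the IOTLB on every unmap, unmapped iovas
 * are queued in these per-iommu tables (up to HIGH_WATER_MARK entries)
 * and released in batches from the unmap_timer, or earlier when a table
 * fills up. Booting with intel_iommu=strict bypasses the batching.
 */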
105
106 /* number of IOMMUs (DRHD units) present in the system */
107 static int g_num_of_iommus;
108
109 static DEFINE_SPINLOCK(async_umap_flush_lock);
110 static LIST_HEAD(unmaps_to_do);
111
112 static int timer_on;
113 static long list_size;
114
115 static void domain_remove_dev_info(struct dmar_domain *domain);
116
117 int dmar_disabled;
118 static int __initdata dmar_map_gfx = 1;
119 static int dmar_forcedac;
120 static int intel_iommu_strict;
121
122 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
123 static DEFINE_SPINLOCK(device_domain_lock);
124 static LIST_HEAD(device_domain_list);
125
126 static int __init intel_iommu_setup(char *str)
127 {
128         if (!str)
129                 return -EINVAL;
130         while (*str) {
131                 if (!strncmp(str, "off", 3)) {
132                         dmar_disabled = 1;
133                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
134                 } else if (!strncmp(str, "igfx_off", 8)) {
135                         dmar_map_gfx = 0;
136                         printk(KERN_INFO
137                                 "Intel-IOMMU: disable GFX device mapping\n");
138                 } else if (!strncmp(str, "forcedac", 8)) {
139                         printk(KERN_INFO
140                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
141                         dmar_forcedac = 1;
142                 } else if (!strncmp(str, "strict", 6)) {
143                         printk(KERN_INFO
144                                 "Intel-IOMMU: disable batched IOTLB flush\n");
145                         intel_iommu_strict = 1;
146                 }
147
148                 str += strcspn(str, ",");
149                 while (*str == ',')
150                         str++;
151         }
152         return 0;
153 }
154 __setup("intel_iommu=", intel_iommu_setup);
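/*
 * Example boot parameters handled by the parser above (options may be
 * combined with commas, e.g. intel_iommu=igfx_off,strict):
 *   intel_iommu=off        disable Intel IOMMU (DMA remapping) support
 *   intel_iommu=igfx_off   disable GFX device mapping
 *   intel_iommu=forcedac   force DAC (64-bit DMA addressing) for PCI devices
 *   intel_iommu=strict     disable batched IOTLB flushing
 */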
155
156 static struct kmem_cache *iommu_domain_cache;
157 static struct kmem_cache *iommu_devinfo_cache;
158 static struct kmem_cache *iommu_iova_cache;
159
160 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
161 {
162         unsigned int flags;
163         void *vaddr;
164
165         /* trying to avoid low memory issues */
166         flags = current->flags & PF_MEMALLOC;
167         current->flags |= PF_MEMALLOC;
168         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
169         current->flags &= (~PF_MEMALLOC | flags);
170         return vaddr;
171 }
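/*
 * Note on the PF_MEMALLOC juggling above: the flag temporarily lets the
 * GFP_ATOMIC allocation dip into the memory reserves, and the final
 * "&= (~PF_MEMALLOC | flags)" restores the caller's original state: it
 * is a no-op when the bit was already set and clears it otherwise.
 */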
172
173
174 static inline void *alloc_pgtable_page(void)
175 {
176         unsigned int flags;
177         void *vaddr;
178
179         /* trying to avoid low memory issues */
180         flags = current->flags & PF_MEMALLOC;
181         current->flags |= PF_MEMALLOC;
182         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
183         current->flags &= (~PF_MEMALLOC | flags);
184         return vaddr;
185 }
186
187 static inline void free_pgtable_page(void *vaddr)
188 {
189         free_page((unsigned long)vaddr);
190 }
191
192 static inline void *alloc_domain_mem(void)
193 {
194         return iommu_kmem_cache_alloc(iommu_domain_cache);
195 }
196
197 static void free_domain_mem(void *vaddr)
198 {
199         kmem_cache_free(iommu_domain_cache, vaddr);
200 }
201
202 static inline void * alloc_devinfo_mem(void)
203 {
204         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
205 }
206
207 static inline void free_devinfo_mem(void *vaddr)
208 {
209         kmem_cache_free(iommu_devinfo_cache, vaddr);
210 }
211
212 struct iova *alloc_iova_mem(void)
213 {
214         return iommu_kmem_cache_alloc(iommu_iova_cache);
215 }
216
217 void free_iova_mem(struct iova *iova)
218 {
219         kmem_cache_free(iommu_iova_cache, iova);
220 }
221
222 /* Gets context entry for a given bus and devfn */
223 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
224                 u8 bus, u8 devfn)
225 {
226         struct root_entry *root;
227         struct context_entry *context;
228         unsigned long phy_addr;
229         unsigned long flags;
230
231         spin_lock_irqsave(&iommu->lock, flags);
232         root = &iommu->root_entry[bus];
233         context = get_context_addr_from_root(root);
234         if (!context) {
235                 context = (struct context_entry *)alloc_pgtable_page();
236                 if (!context) {
237                         spin_unlock_irqrestore(&iommu->lock, flags);
238                         return NULL;
239                 }
240                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
241                 phy_addr = virt_to_phys((void *)context);
242                 set_root_value(root, phy_addr);
243                 set_root_present(root);
244                 __iommu_flush_cache(iommu, root, sizeof(*root));
245         }
246         spin_unlock_irqrestore(&iommu->lock, flags);
247         return &context[devfn];
248 }
249
250 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
251 {
252         struct root_entry *root;
253         struct context_entry *context;
254         int ret;
255         unsigned long flags;
256
257         spin_lock_irqsave(&iommu->lock, flags);
258         root = &iommu->root_entry[bus];
259         context = get_context_addr_from_root(root);
260         if (!context) {
261                 ret = 0;
262                 goto out;
263         }
264         ret = context_present(context[devfn]);
265 out:
266         spin_unlock_irqrestore(&iommu->lock, flags);
267         return ret;
268 }
269
270 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
271 {
272         struct root_entry *root;
273         struct context_entry *context;
274         unsigned long flags;
275
276         spin_lock_irqsave(&iommu->lock, flags);
277         root = &iommu->root_entry[bus];
278         context = get_context_addr_from_root(root);
279         if (context) {
280                 context_clear_entry(context[devfn]);
281                 __iommu_flush_cache(iommu, &context[devfn],
282                         sizeof(*context));
283         }
284         spin_unlock_irqrestore(&iommu->lock, flags);
285 }
286
287 static void free_context_table(struct intel_iommu *iommu)
288 {
289         struct root_entry *root;
290         int i;
291         unsigned long flags;
292         struct context_entry *context;
293
294         spin_lock_irqsave(&iommu->lock, flags);
295         if (!iommu->root_entry) {
296                 goto out;
297         }
298         for (i = 0; i < ROOT_ENTRY_NR; i++) {
299                 root = &iommu->root_entry[i];
300                 context = get_context_addr_from_root(root);
301                 if (context)
302                         free_pgtable_page(context);
303         }
304         free_pgtable_page(iommu->root_entry);
305         iommu->root_entry = NULL;
306 out:
307         spin_unlock_irqrestore(&iommu->lock, flags);
308 }
309
310 /* page table handling */
311 #define LEVEL_STRIDE            (9)
312 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
313
314 static inline int agaw_to_level(int agaw)
315 {
316         return agaw + 2;
317 }
318
319 static inline int agaw_to_width(int agaw)
320 {
321         return 30 + agaw * LEVEL_STRIDE;
322
323 }
324
325 static inline int width_to_agaw(int width)
326 {
327         return (width - 30) / LEVEL_STRIDE;
328 }
329
330 static inline unsigned int level_to_offset_bits(int level)
331 {
332         return (12 + (level - 1) * LEVEL_STRIDE);
333 }
334
335 static inline int address_level_offset(u64 addr, int level)
336 {
337         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
338 }
339
340 static inline u64 level_mask(int level)
341 {
342         return ((u64)-1 << level_to_offset_bits(level));
343 }
344
345 static inline u64 level_size(int level)
346 {
347         return ((u64)1 << level_to_offset_bits(level));
348 }
349
350 static inline u64 align_to_level(u64 addr, int level)
351 {
352         return ((addr + level_size(level) - 1) & level_mask(level));
353 }
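/*
 * Worked example for the helpers above, using the default 48-bit
 * domain address width: width_to_agaw(48) == 2, agaw_to_level(2) == 4,
 * so the domain uses a 4-level table and level_to_offset_bits() yields
 * 12, 21, 30 and 39 for levels 1-4 (9 address bits per level).
 */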
354
355 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
356 {
357         int addr_width = agaw_to_width(domain->agaw);
358         struct dma_pte *parent, *pte = NULL;
359         int level = agaw_to_level(domain->agaw);
360         int offset;
361         unsigned long flags;
362
363         BUG_ON(!domain->pgd);
364
365         addr &= (((u64)1) << addr_width) - 1;
366         parent = domain->pgd;
367
368         spin_lock_irqsave(&domain->mapping_lock, flags);
369         while (level > 0) {
370                 void *tmp_page;
371
372                 offset = address_level_offset(addr, level);
373                 pte = &parent[offset];
374                 if (level == 1)
375                         break;
376
377                 if (!dma_pte_present(*pte)) {
378                         tmp_page = alloc_pgtable_page();
379
380                         if (!tmp_page) {
381                                 spin_unlock_irqrestore(&domain->mapping_lock,
382                                         flags);
383                                 return NULL;
384                         }
385                         __iommu_flush_cache(domain->iommu, tmp_page,
386                                         PAGE_SIZE);
387                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
388                         /*
389                          * high level table always sets r/w, last level page
390                          * table control read/write
391                          */
392                         dma_set_pte_readable(*pte);
393                         dma_set_pte_writable(*pte);
394                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
395                 }
396                 parent = phys_to_virt(dma_pte_addr(*pte));
397                 level--;
398         }
399
400         spin_unlock_irqrestore(&domain->mapping_lock, flags);
401         return pte;
402 }
403
404 /* return address's pte at specific level */
405 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
406                 int level)
407 {
408         struct dma_pte *parent, *pte = NULL;
409         int total = agaw_to_level(domain->agaw);
410         int offset;
411
412         parent = domain->pgd;
413         while (level <= total) {
414                 offset = address_level_offset(addr, total);
415                 pte = &parent[offset];
416                 if (level == total)
417                         return pte;
418
419                 if (!dma_pte_present(*pte))
420                         break;
421                 parent = phys_to_virt(dma_pte_addr(*pte));
422                 total--;
423         }
424         return NULL;
425 }
426
427 /* clear the last level pte that maps one page */
428 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
429 {
430         struct dma_pte *pte = NULL;
431
432         /* get last level pte */
433         pte = dma_addr_level_pte(domain, addr, 1);
434
435         if (pte) {
436                 dma_clear_pte(*pte);
437                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
438         }
439 }
440
441 /* clear last level ptes; the caller must flush the IOTLB afterwards */
442 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
443 {
444         int addr_width = agaw_to_width(domain->agaw);
445
446         start &= (((u64)1) << addr_width) - 1;
447         end &= (((u64)1) << addr_width) - 1;
448         /* in case the range is not page aligned */
449         start = PAGE_ALIGN(start);
450         end &= PAGE_MASK;
451
452         /* we don't need lock here, nobody else touches the iova range */
453         while (start < end) {
454                 dma_pte_clear_one(domain, start);
455                 start += VTD_PAGE_SIZE;
456         }
457 }
458
459 /* free page table pages. last level pte should already be cleared */
460 static void dma_pte_free_pagetable(struct dmar_domain *domain,
461         u64 start, u64 end)
462 {
463         int addr_width = agaw_to_width(domain->agaw);
464         struct dma_pte *pte;
465         int total = agaw_to_level(domain->agaw);
466         int level;
467         u64 tmp;
468
469         start &= (((u64)1) << addr_width) - 1;
470         end &= (((u64)1) << addr_width) - 1;
471
472         /* we don't need lock here, nobody else touches the iova range */
473         level = 2;
474         while (level <= total) {
475                 tmp = align_to_level(start, level);
476                 if (tmp >= end || (tmp + level_size(level) > end))
477                         return;
478
479                 while (tmp < end) {
480                         pte = dma_addr_level_pte(domain, tmp, level);
481                         if (pte) {
482                                 free_pgtable_page(
483                                         phys_to_virt(dma_pte_addr(*pte)));
484                                 dma_clear_pte(*pte);
485                                 __iommu_flush_cache(domain->iommu,
486                                                 pte, sizeof(*pte));
487                         }
488                         tmp += level_size(level);
489                 }
490                 level++;
491         }
492         /* free pgd */
493         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
494                 free_pgtable_page(domain->pgd);
495                 domain->pgd = NULL;
496         }
497 }
498
499 /* iommu handling */
500 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
501 {
502         struct root_entry *root;
503         unsigned long flags;
504
505         root = (struct root_entry *)alloc_pgtable_page();
506         if (!root)
507                 return -ENOMEM;
508
509         __iommu_flush_cache(iommu, root, ROOT_SIZE);
510
511         spin_lock_irqsave(&iommu->lock, flags);
512         iommu->root_entry = root;
513         spin_unlock_irqrestore(&iommu->lock, flags);
514
515         return 0;
516 }
517
518 static void iommu_set_root_entry(struct intel_iommu *iommu)
519 {
520         void *addr;
521         u32 cmd, sts;
522         unsigned long flag;
523
524         addr = iommu->root_entry;
525
526         spin_lock_irqsave(&iommu->register_lock, flag);
527         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
528
529         cmd = iommu->gcmd | DMA_GCMD_SRTP;
530         writel(cmd, iommu->reg + DMAR_GCMD_REG);
531
532         /* Make sure hardware completes it */
533         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
534                 readl, (sts & DMA_GSTS_RTPS), sts);
535
536         spin_unlock_irqrestore(&iommu->register_lock, flag);
537 }
538
539 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
540 {
541         u32 val;
542         unsigned long flag;
543
544         if (!cap_rwbf(iommu->cap))
545                 return;
546         val = iommu->gcmd | DMA_GCMD_WBF;
547
548         spin_lock_irqsave(&iommu->register_lock, flag);
549         writel(val, iommu->reg + DMAR_GCMD_REG);
550
551         /* Make sure hardware completes it */
552         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
553                         readl, (!(val & DMA_GSTS_WBFS)), val);
554
555         spin_unlock_irqrestore(&iommu->register_lock, flag);
556 }
557
558 /* return value determines whether we need a write buffer flush */
559 static int __iommu_flush_context(struct intel_iommu *iommu,
560         u16 did, u16 source_id, u8 function_mask, u64 type,
561         int non_present_entry_flush)
562 {
563         u64 val = 0;
564         unsigned long flag;
565
566         /*
567          * In the non-present entry flush case: if the hardware doesn't
568          * cache non-present entries we do nothing; if it does, we flush
569          * the entries of domain 0 (that domain id is used to tag any
570          * cached non-present entries)
571          */
572         if (non_present_entry_flush) {
573                 if (!cap_caching_mode(iommu->cap))
574                         return 1;
575                 else
576                         did = 0;
577         }
578
579         switch (type) {
580         case DMA_CCMD_GLOBAL_INVL:
581                 val = DMA_CCMD_GLOBAL_INVL;
582                 break;
583         case DMA_CCMD_DOMAIN_INVL:
584                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
585                 break;
586         case DMA_CCMD_DEVICE_INVL:
587                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
588                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
589                 break;
590         default:
591                 BUG();
592         }
593         val |= DMA_CCMD_ICC;
594
595         spin_lock_irqsave(&iommu->register_lock, flag);
596         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
597
598         /* Make sure hardware completes it */
599         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
600                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
601
602         spin_unlock_irqrestore(&iommu->register_lock, flag);
603
604         /* flushing a context entry implicitly flushes the write buffer */
605         return 0;
606 }
607
608 /* return value determines whether we need a write buffer flush */
609 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
610         u64 addr, unsigned int size_order, u64 type,
611         int non_present_entry_flush)
612 {
613         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
614         u64 val = 0, val_iva = 0;
615         unsigned long flag;
616
617         /*
618          * In the non-present entry flush case: if the hardware doesn't
619          * cache non-present entries we do nothing; if it does, we flush
620          * the entries of domain 0 (that domain id is used to tag any
621          * cached non-present entries)
622          */
623         if (non_present_entry_flush) {
624                 if (!cap_caching_mode(iommu->cap))
625                         return 1;
626                 else
627                         did = 0;
628         }
629
630         switch (type) {
631         case DMA_TLB_GLOBAL_FLUSH:
632                 /* a global flush doesn't need to set the IVA_REG */
633                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
634                 break;
635         case DMA_TLB_DSI_FLUSH:
636                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
637                 break;
638         case DMA_TLB_PSI_FLUSH:
639                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
640                 /* Note: always flush non-leaf currently */
641                 val_iva = size_order | addr;
642                 break;
643         default:
644                 BUG();
645         }
646         /* Note: set drain read/write */
647 #if 0
648         /*
649          * This is probably just being extra safe; it looks like we can
650          * ignore it without any impact.
651          */
652         if (cap_read_drain(iommu->cap))
653                 val |= DMA_TLB_READ_DRAIN;
654 #endif
655         if (cap_write_drain(iommu->cap))
656                 val |= DMA_TLB_WRITE_DRAIN;
657
658         spin_lock_irqsave(&iommu->register_lock, flag);
659         /* Note: Only uses first TLB reg currently */
660         if (val_iva)
661                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
662         dmar_writeq(iommu->reg + tlb_offset + 8, val);
663
664         /* Make sure hardware completes it */
665         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
666                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
667
668         spin_unlock_irqrestore(&iommu->register_lock, flag);
669
670         /* check IOTLB invalidation granularity */
671         if (DMA_TLB_IAIG(val) == 0)
672                 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
673         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
674                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
675                         (unsigned long long)DMA_TLB_IIRG(type),
676                         (unsigned long long)DMA_TLB_IAIG(val));
677         /* flushing an iotlb entry implicitly flushes the write buffer */
678         return 0;
679 }
680
681 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
682         u64 addr, unsigned int pages, int non_present_entry_flush)
683 {
684         unsigned int mask;
685
686         BUG_ON(addr & (~VTD_PAGE_MASK));
687         BUG_ON(pages == 0);
688
689         /* Fallback to domain selective flush if no PSI support */
690         if (!cap_pgsel_inv(iommu->cap))
691                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
692                                                 DMA_TLB_DSI_FLUSH,
693                                                 non_present_entry_flush);
694
695         /*
696          * PSI requires page size to be 2 ^ x, and the base address is naturally
697          * aligned to the size
698          */
699         mask = ilog2(__roundup_pow_of_two(pages));
700         /* Fallback to domain selective flush if size is too big */
701         if (mask > cap_max_amask_val(iommu->cap))
702                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
703                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
704
705         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
706                                         DMA_TLB_PSI_FLUSH,
707                                         non_present_entry_flush);
708 }
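/*
 * Example of the PSI mask calculation above: flushing 9 pages rounds up
 * to 16, so mask == ilog2(16) == 4 and the hardware invalidates a
 * naturally aligned 16-page (64KiB with 4KiB pages) region around addr.
 */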
709
710 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
711 {
712         u32 pmen;
713         unsigned long flags;
714
715         spin_lock_irqsave(&iommu->register_lock, flags);
716         pmen = readl(iommu->reg + DMAR_PMEN_REG);
717         pmen &= ~DMA_PMEN_EPM;
718         writel(pmen, iommu->reg + DMAR_PMEN_REG);
719
720         /* wait for the protected region status bit to clear */
721         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
722                 readl, !(pmen & DMA_PMEN_PRS), pmen);
723
724         spin_unlock_irqrestore(&iommu->register_lock, flags);
725 }
726
727 static int iommu_enable_translation(struct intel_iommu *iommu)
728 {
729         u32 sts;
730         unsigned long flags;
731
732         spin_lock_irqsave(&iommu->register_lock, flags);
733         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
734
735         /* Make sure hardware completes it */
736         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
737                 readl, (sts & DMA_GSTS_TES), sts);
738
739         iommu->gcmd |= DMA_GCMD_TE;
740         spin_unlock_irqrestore(&iommu->register_lock, flags);
741         return 0;
742 }
743
744 static int iommu_disable_translation(struct intel_iommu *iommu)
745 {
746         u32 sts;
747         unsigned long flag;
748
749         spin_lock_irqsave(&iommu->register_lock, flag);
750         iommu->gcmd &= ~DMA_GCMD_TE;
751         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
752
753         /* Make sure hardware completes it */
754         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
755                 readl, (!(sts & DMA_GSTS_TES)), sts);
756
757         spin_unlock_irqrestore(&iommu->register_lock, flag);
758         return 0;
759 }
760
761 /* iommu interrupt handling. Most of it is MSI-like. */
762
763 static const char *fault_reason_strings[] =
764 {
765         "Software",
766         "Present bit in root entry is clear",
767         "Present bit in context entry is clear",
768         "Invalid context entry",
769         "Access beyond MGAW",
770         "PTE Write access is not set",
771         "PTE Read access is not set",
772         "Next page table ptr is invalid",
773         "Root table address invalid",
774         "Context table ptr is invalid",
775         "non-zero reserved fields in RTP",
776         "non-zero reserved fields in CTP",
777         "non-zero reserved fields in PTE",
778 };
779 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
780
781 const char *dmar_get_fault_reason(u8 fault_reason)
782 {
783         if (fault_reason > MAX_FAULT_REASON_IDX)
784                 return "Unknown";
785         else
786                 return fault_reason_strings[fault_reason];
787 }
788
789 void dmar_msi_unmask(unsigned int irq)
790 {
791         struct intel_iommu *iommu = get_irq_data(irq);
792         unsigned long flag;
793
794         /* unmask it */
795         spin_lock_irqsave(&iommu->register_lock, flag);
796         writel(0, iommu->reg + DMAR_FECTL_REG);
797         /* Read a reg to force flush the post write */
798         readl(iommu->reg + DMAR_FECTL_REG);
799         spin_unlock_irqrestore(&iommu->register_lock, flag);
800 }
801
802 void dmar_msi_mask(unsigned int irq)
803 {
804         unsigned long flag;
805         struct intel_iommu *iommu = get_irq_data(irq);
806
807         /* mask it */
808         spin_lock_irqsave(&iommu->register_lock, flag);
809         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
810         /* Read a reg to force flush the post write */
811         readl(iommu->reg + DMAR_FECTL_REG);
812         spin_unlock_irqrestore(&iommu->register_lock, flag);
813 }
814
815 void dmar_msi_write(int irq, struct msi_msg *msg)
816 {
817         struct intel_iommu *iommu = get_irq_data(irq);
818         unsigned long flag;
819
820         spin_lock_irqsave(&iommu->register_lock, flag);
821         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
822         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
823         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
824         spin_unlock_irqrestore(&iommu->register_lock, flag);
825 }
826
827 void dmar_msi_read(int irq, struct msi_msg *msg)
828 {
829         struct intel_iommu *iommu = get_irq_data(irq);
830         unsigned long flag;
831
832         spin_lock_irqsave(&iommu->register_lock, flag);
833         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
834         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
835         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
836         spin_unlock_irqrestore(&iommu->register_lock, flag);
837 }
838
839 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
840                 u8 fault_reason, u16 source_id, unsigned long long addr)
841 {
842         const char *reason;
843
844         reason = dmar_get_fault_reason(fault_reason);
845
846         printk(KERN_ERR
847                 "DMAR:[%s] Request device [%02x:%02x.%d] "
848                 "fault addr %llx\n"
849                 "DMAR:[fault reason %02d] %s\n",
850                 (type ? "DMA Read" : "DMA Write"),
851                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
852                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
853         return 0;
854 }
855
856 #define PRIMARY_FAULT_REG_LEN (16)
857 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
858 {
859         struct intel_iommu *iommu = dev_id;
860         int reg, fault_index;
861         u32 fault_status;
862         unsigned long flag;
863
864         spin_lock_irqsave(&iommu->register_lock, flag);
865         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
866
867         /* TBD: ignore advanced fault log currently */
868         if (!(fault_status & DMA_FSTS_PPF))
869                 goto clear_overflow;
870
871         fault_index = dma_fsts_fault_record_index(fault_status);
872         reg = cap_fault_reg_offset(iommu->cap);
873         while (1) {
874                 u8 fault_reason;
875                 u16 source_id;
876                 u64 guest_addr;
877                 int type;
878                 u32 data;
879
880                 /* highest 32 bits */
881                 data = readl(iommu->reg + reg +
882                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
883                 if (!(data & DMA_FRCD_F))
884                         break;
885
886                 fault_reason = dma_frcd_fault_reason(data);
887                 type = dma_frcd_type(data);
888
889                 data = readl(iommu->reg + reg +
890                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
891                 source_id = dma_frcd_source_id(data);
892
893                 guest_addr = dmar_readq(iommu->reg + reg +
894                                 fault_index * PRIMARY_FAULT_REG_LEN);
895                 guest_addr = dma_frcd_page_addr(guest_addr);
896                 /* clear the fault */
897                 writel(DMA_FRCD_F, iommu->reg + reg +
898                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
899
900                 spin_unlock_irqrestore(&iommu->register_lock, flag);
901
902                 iommu_page_fault_do_one(iommu, type, fault_reason,
903                                 source_id, guest_addr);
904
905                 fault_index++;
906                 if (fault_index >= cap_num_fault_regs(iommu->cap))
907                         fault_index = 0;
908                 spin_lock_irqsave(&iommu->register_lock, flag);
909         }
910 clear_overflow:
911         /* clear primary fault overflow */
912         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
913         if (fault_status & DMA_FSTS_PFO)
914                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
915
916         spin_unlock_irqrestore(&iommu->register_lock, flag);
917         return IRQ_HANDLED;
918 }
919
920 int dmar_set_interrupt(struct intel_iommu *iommu)
921 {
922         int irq, ret;
923
924         irq = create_irq();
925         if (!irq) {
926                 printk(KERN_ERR "IOMMU: no free vectors\n");
927                 return -EINVAL;
928         }
929
930         set_irq_data(irq, iommu);
931         iommu->irq = irq;
932
933         ret = arch_setup_dmar_msi(irq);
934         if (ret) {
935                 set_irq_data(irq, NULL);
936                 iommu->irq = 0;
937                 destroy_irq(irq);
938                 return ret;
939         }
940
941         /* Make sure any pending faults are cleared before requesting the irq */
942         iommu_page_fault(irq, iommu);
943
944         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
945         if (ret)
946                 printk(KERN_ERR "IOMMU: can't request irq\n");
947         return ret;
948 }
949
950 static int iommu_init_domains(struct intel_iommu *iommu)
951 {
952         unsigned long ndomains;
953         unsigned long nlongs;
954
955         ndomains = cap_ndoms(iommu->cap);
956         pr_debug("Number of Domains supported <%ld>\n", ndomains);
957         nlongs = BITS_TO_LONGS(ndomains);
958
959         /* TBD: there might be 64K domains,
960          * consider a different allocation scheme for future chips
961          */
962         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
963         if (!iommu->domain_ids) {
964                 printk(KERN_ERR "Allocating domain id array failed\n");
965                 return -ENOMEM;
966         }
967         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
968                         GFP_KERNEL);
969         if (!iommu->domains) {
970                 printk(KERN_ERR "Allocating domain array failed\n");
971                 kfree(iommu->domain_ids);
972                 return -ENOMEM;
973         }
974
975         spin_lock_init(&iommu->lock);
976
977         /*
978          * if Caching mode is set, then invalid translations are tagged
979          * with domainid 0. Hence we need to pre-allocate it.
980          */
981         if (cap_caching_mode(iommu->cap))
982                 set_bit(0, iommu->domain_ids);
983         return 0;
984 }
985
986
987 static void domain_exit(struct dmar_domain *domain);
988
989 void free_dmar_iommu(struct intel_iommu *iommu)
990 {
991         struct dmar_domain *domain;
992         int i;
993
994         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
995         for (; i < cap_ndoms(iommu->cap); ) {
996                 domain = iommu->domains[i];
997                 clear_bit(i, iommu->domain_ids);
998                 domain_exit(domain);
999                 i = find_next_bit(iommu->domain_ids,
1000                         cap_ndoms(iommu->cap), i+1);
1001         }
1002
1003         if (iommu->gcmd & DMA_GCMD_TE)
1004                 iommu_disable_translation(iommu);
1005
1006         if (iommu->irq) {
1007                 set_irq_data(iommu->irq, NULL);
1008                 /* This will mask the irq */
1009                 free_irq(iommu->irq, iommu);
1010                 destroy_irq(iommu->irq);
1011         }
1012
1013         kfree(iommu->domains);
1014         kfree(iommu->domain_ids);
1015
1016         /* free context mapping */
1017         free_context_table(iommu);
1018 }
1019
1020 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1021 {
1022         unsigned long num;
1023         unsigned long ndomains;
1024         struct dmar_domain *domain;
1025         unsigned long flags;
1026
1027         domain = alloc_domain_mem();
1028         if (!domain)
1029                 return NULL;
1030
1031         ndomains = cap_ndoms(iommu->cap);
1032
1033         spin_lock_irqsave(&iommu->lock, flags);
1034         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1035         if (num >= ndomains) {
1036                 spin_unlock_irqrestore(&iommu->lock, flags);
1037                 free_domain_mem(domain);
1038                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1039                 return NULL;
1040         }
1041
1042         set_bit(num, iommu->domain_ids);
1043         domain->id = num;
1044         domain->iommu = iommu;
1045         iommu->domains[num] = domain;
1046         spin_unlock_irqrestore(&iommu->lock, flags);
1047
1048         return domain;
1049 }
1050
1051 static void iommu_free_domain(struct dmar_domain *domain)
1052 {
1053         unsigned long flags;
1054
1055         spin_lock_irqsave(&domain->iommu->lock, flags);
1056         clear_bit(domain->id, domain->iommu->domain_ids);
1057         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1058 }
1059
1060 static struct iova_domain reserved_iova_list;
1061 static struct lock_class_key reserved_alloc_key;
1062 static struct lock_class_key reserved_rbtree_key;
1063
1064 static void dmar_init_reserved_ranges(void)
1065 {
1066         struct pci_dev *pdev = NULL;
1067         struct iova *iova;
1068         int i;
1069         u64 addr, size;
1070
1071         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1072
1073         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1074                 &reserved_alloc_key);
1075         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1076                 &reserved_rbtree_key);
1077
1078         /* IOAPIC ranges shouldn't be accessed by DMA */
1079         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1080                 IOVA_PFN(IOAPIC_RANGE_END));
1081         if (!iova)
1082                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1083
1084         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1085         for_each_pci_dev(pdev) {
1086                 struct resource *r;
1087
1088                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1089                         r = &pdev->resource[i];
1090                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1091                                 continue;
1092                         addr = r->start;
1093                         addr &= PAGE_MASK;
1094                         size = r->end - addr;
1095                         size = PAGE_ALIGN(size);
1096                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1097                                 IOVA_PFN(size + addr) - 1);
1098                         if (!iova)
1099                                 printk(KERN_ERR "Reserve iova failed\n");
1100                 }
1101         }
1102
1103 }
1104
1105 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1106 {
1107         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1108 }
1109
1110 static inline int guestwidth_to_adjustwidth(int gaw)
1111 {
1112         int agaw;
1113         int r = (gaw - 12) % 9;
1114
1115         if (r == 0)
1116                 agaw = gaw;
1117         else
1118                 agaw = gaw + 9 - r;
1119         if (agaw > 64)
1120                 agaw = 64;
1121         return agaw;
1122 }
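/*
 * guestwidth_to_adjustwidth() rounds the guest address width up so that
 * (gaw - 12) is a multiple of the 9-bit level stride, e.g. 39 -> 39 and
 * 48 -> 48 but 40 -> 48; the result is then fed to width_to_agaw() when
 * the domain is initialized.
 */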
1123
1124 static int domain_init(struct dmar_domain *domain, int guest_width)
1125 {
1126         struct intel_iommu *iommu;
1127         int adjust_width, agaw;
1128         unsigned long sagaw;
1129
1130         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1131         spin_lock_init(&domain->mapping_lock);
1132
1133         domain_reserve_special_ranges(domain);
1134
1135         /* calculate AGAW */
1136         iommu = domain->iommu;
1137         if (guest_width > cap_mgaw(iommu->cap))
1138                 guest_width = cap_mgaw(iommu->cap);
1139         domain->gaw = guest_width;
1140         adjust_width = guestwidth_to_adjustwidth(guest_width);
1141         agaw = width_to_agaw(adjust_width);
1142         sagaw = cap_sagaw(iommu->cap);
1143         if (!test_bit(agaw, &sagaw)) {
1144                 /* hardware doesn't support it, choose a bigger one */
1145                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1146                 agaw = find_next_bit(&sagaw, 5, agaw);
1147                 if (agaw >= 5)
1148                         return -ENODEV;
1149         }
1150         domain->agaw = agaw;
1151         INIT_LIST_HEAD(&domain->devices);
1152
1153         /* always allocate the top pgd */
1154         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1155         if (!domain->pgd)
1156                 return -ENOMEM;
1157         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1158         return 0;
1159 }
1160
1161 static void domain_exit(struct dmar_domain *domain)
1162 {
1163         u64 end;
1164
1165         /* Domain 0 is reserved, so don't process it */
1166         if (!domain)
1167                 return;
1168
1169         domain_remove_dev_info(domain);
1170         /* destroy iovas */
1171         put_iova_domain(&domain->iovad);
1172         end = DOMAIN_MAX_ADDR(domain->gaw);
1173         end = end & (~PAGE_MASK);
1174
1175         /* clear ptes */
1176         dma_pte_clear_range(domain, 0, end);
1177
1178         /* free page tables */
1179         dma_pte_free_pagetable(domain, 0, end);
1180
1181         iommu_free_domain(domain);
1182         free_domain_mem(domain);
1183 }
1184
1185 static int domain_context_mapping_one(struct dmar_domain *domain,
1186                 u8 bus, u8 devfn)
1187 {
1188         struct context_entry *context;
1189         struct intel_iommu *iommu = domain->iommu;
1190         unsigned long flags;
1191
1192         pr_debug("Set context mapping for %02x:%02x.%d\n",
1193                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1194         BUG_ON(!domain->pgd);
1195         context = device_to_context_entry(iommu, bus, devfn);
1196         if (!context)
1197                 return -ENOMEM;
1198         spin_lock_irqsave(&iommu->lock, flags);
1199         if (context_present(*context)) {
1200                 spin_unlock_irqrestore(&iommu->lock, flags);
1201                 return 0;
1202         }
1203
1204         context_set_domain_id(*context, domain->id);
1205         context_set_address_width(*context, domain->agaw);
1206         context_set_address_root(*context, virt_to_phys(domain->pgd));
1207         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1208         context_set_fault_enable(*context);
1209         context_set_present(*context);
1210         __iommu_flush_cache(iommu, context, sizeof(*context));
1211
1212         /* it's a non-present to present mapping */
1213         if (iommu->flush.flush_context(iommu, domain->id,
1214                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1215                 DMA_CCMD_DEVICE_INVL, 1))
1216                 iommu_flush_write_buffer(iommu);
1217         else
1218                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1219
1220         spin_unlock_irqrestore(&iommu->lock, flags);
1221         return 0;
1222 }
1223
1224 static int
1225 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1226 {
1227         int ret;
1228         struct pci_dev *tmp, *parent;
1229
1230         ret = domain_context_mapping_one(domain, pdev->bus->number,
1231                 pdev->devfn);
1232         if (ret)
1233                 return ret;
1234
1235         /* dependent device mapping */
1236         tmp = pci_find_upstream_pcie_bridge(pdev);
1237         if (!tmp)
1238                 return 0;
1239         /* Secondary interface's bus number and devfn 0 */
1240         parent = pdev->bus->self;
1241         while (parent != tmp) {
1242                 ret = domain_context_mapping_one(domain, parent->bus->number,
1243                         parent->devfn);
1244                 if (ret)
1245                         return ret;
1246                 parent = parent->bus->self;
1247         }
1248         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1249                 return domain_context_mapping_one(domain,
1250                         tmp->subordinate->number, 0);
1251         else /* this is a legacy PCI bridge */
1252                 return domain_context_mapping_one(domain,
1253                         tmp->bus->number, tmp->devfn);
1254 }
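/*
 * DMA from devices behind a PCIe-to-PCI(-X) bridge is typically tagged
 * with the bridge's (or its secondary bus's) requester id rather than
 * the device's own, which is why the code above also installs context
 * entries for every bridge between the device and the PCIe root.
 */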
1255
1256 static int domain_context_mapped(struct dmar_domain *domain,
1257         struct pci_dev *pdev)
1258 {
1259         int ret;
1260         struct pci_dev *tmp, *parent;
1261
1262         ret = device_context_mapped(domain->iommu,
1263                 pdev->bus->number, pdev->devfn);
1264         if (!ret)
1265                 return ret;
1266         /* dependent device mapping */
1267         tmp = pci_find_upstream_pcie_bridge(pdev);
1268         if (!tmp)
1269                 return ret;
1270         /* Secondary interface's bus number and devfn 0 */
1271         parent = pdev->bus->self;
1272         while (parent != tmp) {
1273                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1274                         parent->devfn);
1275                 if (!ret)
1276                         return ret;
1277                 parent = parent->bus->self;
1278         }
1279         if (tmp->is_pcie)
1280                 return device_context_mapped(domain->iommu,
1281                         tmp->subordinate->number, 0);
1282         else
1283                 return device_context_mapped(domain->iommu,
1284                         tmp->bus->number, tmp->devfn);
1285 }
1286
1287 static int
1288 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1289                         u64 hpa, size_t size, int prot)
1290 {
1291         u64 start_pfn, end_pfn;
1292         struct dma_pte *pte;
1293         int index;
1294         int addr_width = agaw_to_width(domain->agaw);
1295
1296         hpa &= (((u64)1) << addr_width) - 1;
1297
1298         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1299                 return -EINVAL;
1300         iova &= PAGE_MASK;
1301         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1302         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1303         index = 0;
1304         while (start_pfn < end_pfn) {
1305                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1306                 if (!pte)
1307                         return -ENOMEM;
1308                 /* We don't need lock here, nobody else
1309                  * touches the iova range
1310                  */
1311                 BUG_ON(dma_pte_addr(*pte));
1312                 dma_set_pte_addr(*pte, start_pfn << VTD_PAGE_SHIFT);
1313                 dma_set_pte_prot(*pte, prot);
1314                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1315                 start_pfn++;
1316                 index++;
1317         }
1318         return 0;
1319 }
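/*
 * Example: mapping an 8KiB buffer with 4KiB VT-d pages runs the loop
 * above twice, installing two leaf ptes that translate iova and
 * iova + 4KiB to the corresponding host physical pages.
 */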
1320
1321 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1322 {
1323         clear_context_table(domain->iommu, bus, devfn);
1324         domain->iommu->flush.flush_context(domain->iommu, 0, 0, 0,
1325                                            DMA_CCMD_GLOBAL_INVL, 0);
1326         domain->iommu->flush.flush_iotlb(domain->iommu, 0, 0, 0,
1327                                          DMA_TLB_GLOBAL_FLUSH, 0);
1328 }
1329
1330 static void domain_remove_dev_info(struct dmar_domain *domain)
1331 {
1332         struct device_domain_info *info;
1333         unsigned long flags;
1334
1335         spin_lock_irqsave(&device_domain_lock, flags);
1336         while (!list_empty(&domain->devices)) {
1337                 info = list_entry(domain->devices.next,
1338                         struct device_domain_info, link);
1339                 list_del(&info->link);
1340                 list_del(&info->global);
1341                 if (info->dev)
1342                         info->dev->dev.archdata.iommu = NULL;
1343                 spin_unlock_irqrestore(&device_domain_lock, flags);
1344
1345                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1346                 free_devinfo_mem(info);
1347
1348                 spin_lock_irqsave(&device_domain_lock, flags);
1349         }
1350         spin_unlock_irqrestore(&device_domain_lock, flags);
1351 }
1352
1353 /*
1354  * find_domain
1355  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1356  */
1357 static struct dmar_domain *
1358 find_domain(struct pci_dev *pdev)
1359 {
1360         struct device_domain_info *info;
1361
1362         /* No lock here, assumes no domain exit in normal case */
1363         info = pdev->dev.archdata.iommu;
1364         if (info)
1365                 return info->domain;
1366         return NULL;
1367 }
1368
1369 /* domain is initialized */
1370 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1371 {
1372         struct dmar_domain *domain, *found = NULL;
1373         struct intel_iommu *iommu;
1374         struct dmar_drhd_unit *drhd;
1375         struct device_domain_info *info, *tmp;
1376         struct pci_dev *dev_tmp;
1377         unsigned long flags;
1378         int bus = 0, devfn = 0;
1379
1380         domain = find_domain(pdev);
1381         if (domain)
1382                 return domain;
1383
1384         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1385         if (dev_tmp) {
1386                 if (dev_tmp->is_pcie) {
1387                         bus = dev_tmp->subordinate->number;
1388                         devfn = 0;
1389                 } else {
1390                         bus = dev_tmp->bus->number;
1391                         devfn = dev_tmp->devfn;
1392                 }
1393                 spin_lock_irqsave(&device_domain_lock, flags);
1394                 list_for_each_entry(info, &device_domain_list, global) {
1395                         if (info->bus == bus && info->devfn == devfn) {
1396                                 found = info->domain;
1397                                 break;
1398                         }
1399                 }
1400                 spin_unlock_irqrestore(&device_domain_lock, flags);
1401                 /* the pcie-pci bridge already has a domain, use it */
1402                 if (found) {
1403                         domain = found;
1404                         goto found_domain;
1405                 }
1406         }
1407
1408         /* Allocate new domain for the device */
1409         drhd = dmar_find_matched_drhd_unit(pdev);
1410         if (!drhd) {
1411                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1412                         pci_name(pdev));
1413                 return NULL;
1414         }
1415         iommu = drhd->iommu;
1416
1417         domain = iommu_alloc_domain(iommu);
1418         if (!domain)
1419                 goto error;
1420
1421         if (domain_init(domain, gaw)) {
1422                 domain_exit(domain);
1423                 goto error;
1424         }
1425
1426         /* register pcie-to-pci device */
1427         if (dev_tmp) {
1428                 info = alloc_devinfo_mem();
1429                 if (!info) {
1430                         domain_exit(domain);
1431                         goto error;
1432                 }
1433                 info->bus = bus;
1434                 info->devfn = devfn;
1435                 info->dev = NULL;
1436                 info->domain = domain;
1437                 /* This domain is shared by devices under p2p bridge */
1438                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1439
1440                 /* the pcie-to-pci bridge already has a domain, use it */
1441                 found = NULL;
1442                 spin_lock_irqsave(&device_domain_lock, flags);
1443                 list_for_each_entry(tmp, &device_domain_list, global) {
1444                         if (tmp->bus == bus && tmp->devfn == devfn) {
1445                                 found = tmp->domain;
1446                                 break;
1447                         }
1448                 }
1449                 if (found) {
1450                         free_devinfo_mem(info);
1451                         domain_exit(domain);
1452                         domain = found;
1453                 } else {
1454                         list_add(&info->link, &domain->devices);
1455                         list_add(&info->global, &device_domain_list);
1456                 }
1457                 spin_unlock_irqrestore(&device_domain_lock, flags);
1458         }
1459
1460 found_domain:
1461         info = alloc_devinfo_mem();
1462         if (!info)
1463                 goto error;
1464         info->bus = pdev->bus->number;
1465         info->devfn = pdev->devfn;
1466         info->dev = pdev;
1467         info->domain = domain;
1468         spin_lock_irqsave(&device_domain_lock, flags);
1469         /* somebody is fast */
1470         found = find_domain(pdev);
1471         if (found != NULL) {
1472                 spin_unlock_irqrestore(&device_domain_lock, flags);
1473                 if (found != domain) {
1474                         domain_exit(domain);
1475                         domain = found;
1476                 }
1477                 free_devinfo_mem(info);
1478                 return domain;
1479         }
1480         list_add(&info->link, &domain->devices);
1481         list_add(&info->global, &device_domain_list);
1482         pdev->dev.archdata.iommu = info;
1483         spin_unlock_irqrestore(&device_domain_lock, flags);
1484         return domain;
1485 error:
1486         /* recheck it here, maybe others set it */
1487         return find_domain(pdev);
1488 }
1489
1490 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1491                                       unsigned long long start,
1492                                       unsigned long long end)
1493 {
1494         struct dmar_domain *domain;
1495         unsigned long size;
1496         unsigned long long base;
1497         int ret;
1498
1499         printk(KERN_INFO
1500                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1501                 pci_name(pdev), start, end);
1502         /* page table init */
1503         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1504         if (!domain)
1505                 return -ENOMEM;
1506
1507         /* The address might not be aligned */
1508         base = start & PAGE_MASK;
1509         size = end - base;
1510         size = PAGE_ALIGN(size);
1511         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1512                         IOVA_PFN(base + size) - 1)) {
1513                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1514                 ret = -ENOMEM;
1515                 goto error;
1516         }
1517
1518         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1519                 size, base, pci_name(pdev));
1520         /*
1521          * RMRR range might have overlap with physical memory range,
1522          * clear it first
1523          */
1524         dma_pte_clear_range(domain, base, base + size);
1525
1526         ret = domain_page_mapping(domain, base, base, size,
1527                 DMA_PTE_READ|DMA_PTE_WRITE);
1528         if (ret)
1529                 goto error;
1530
1531         /* context entry init */
1532         ret = domain_context_mapping(domain, pdev);
1533         if (!ret)
1534                 return 0;
1535 error:
1536         domain_exit(domain);
1537         return ret;
1538
1539 }
1540
1541 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1542         struct pci_dev *pdev)
1543 {
1544         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1545                 return 0;
1546         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1547                 rmrr->end_address + 1);
1548 }
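/*
 * RMRRs (Reserved Memory Region Reporting structures in the DMAR ACPI
 * table) describe memory that the firmware expects certain devices to
 * keep DMAing into (USB legacy keyboard emulation is the classic case),
 * so those ranges get an identity mapping before translation is enabled.
 */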
1549
1550 #ifdef CONFIG_DMAR_GFX_WA
1551 struct iommu_prepare_data {
1552         struct pci_dev *pdev;
1553         int ret;
1554 };
1555
1556 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1557                                          unsigned long end_pfn, void *datax)
1558 {
1559         struct iommu_prepare_data *data;
1560
1561         data = (struct iommu_prepare_data *)datax;
1562
1563         data->ret = iommu_prepare_identity_map(data->pdev,
1564                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1565         return data->ret;
1566
1567 }
1568
1569 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1570 {
1571         int nid;
1572         struct iommu_prepare_data data;
1573
1574         data.pdev = pdev;
1575         data.ret = 0;
1576
1577         for_each_online_node(nid) {
1578                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1579                 if (data.ret)
1580                         return data.ret;
1581         }
1582         return data.ret;
1583 }
1584
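/*
 * Graphics workaround (CONFIG_DMAR_GFX_WA): build a 1:1 mapping of all
 * active memory regions for every graphics device that is not already
 * set up to bypass the IOMMU.
 */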
1585 static void __init iommu_prepare_gfx_mapping(void)
1586 {
1587         struct pci_dev *pdev = NULL;
1588         int ret;
1589
1590         for_each_pci_dev(pdev) {
1591                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1592                                 !IS_GFX_DEVICE(pdev))
1593                         continue;
1594                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1595                         pci_name(pdev));
1596                 ret = iommu_prepare_with_active_regions(pdev);
1597                 if (ret)
1598                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1599         }
1600 }
1601 #endif
1602
1603 #ifdef CONFIG_DMAR_FLOPPY_WA
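/*
 * Floppy workaround: the legacy floppy controller does ISA DMA through
 * the LPC/ISA bridge, so give that bridge device an identity map of the
 * first 16MB of memory.
 */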
1604 static inline void iommu_prepare_isa(void)
1605 {
1606         struct pci_dev *pdev;
1607         int ret;
1608
1609         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1610         if (!pdev)
1611                 return;
1612
1613         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1614         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1615
1616         if (ret)
1617                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1618                         "floppy might not work\n");
1619
1620 }
1621 #else
1622 static inline void iommu_prepare_isa(void)
1623 {
1624         return;
1625 }
1626 #endif /* CONFIG_DMAR_FLOPPY_WA */
1627
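/*
 * One-time DMA-remapping bring-up: count the IOMMUs and allocate the
 * deferred-flush tables, initialize per-IOMMU domain state and root
 * entries, choose queued vs. register-based invalidation, build the
 * RMRR, graphics and ISA identity maps, and finally program the root
 * entries, flush the caches and enable translation on every unit.
 */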
1628 static int __init init_dmars(void)
1629 {
1630         struct dmar_drhd_unit *drhd;
1631         struct dmar_rmrr_unit *rmrr;
1632         struct pci_dev *pdev;
1633         struct intel_iommu *iommu;
1634         int i, ret, unit = 0;
1635
1636         /*
1637          * for each drhd
1638          *    allocate root
1639          *    initialize and program root entry to not present
1640          * endfor
1641          */
1642         for_each_drhd_unit(drhd) {
1643                 g_num_of_iommus++;
1644                 /*
1645                  * No lock needed: the count is only incremented here, in the
1646                  * single-threaded kernel __init code path; all other accesses
1647                  * are read-only.
1648                  */
1649         }
1650
1651         deferred_flush = kzalloc(g_num_of_iommus *
1652                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1653         if (!deferred_flush) {
1654                 ret = -ENOMEM;
1655                 goto error;
1656         }
1657
1658         for_each_drhd_unit(drhd) {
1659                 if (drhd->ignored)
1660                         continue;
1661
1662                 iommu = drhd->iommu;
1663
1664                 ret = iommu_init_domains(iommu);
1665                 if (ret)
1666                         goto error;
1667
1668                 /*
1669                  * TBD:
1670                  * we could share the same root and context tables
1671                  * among all IOMMUs.  Needs to be split out later.
1672                  */
1673                 ret = iommu_alloc_root_entry(iommu);
1674                 if (ret) {
1675                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1676                         goto error;
1677                 }
1678         }
1679
1680         for_each_drhd_unit(drhd) {
1681                 if (drhd->ignored)
1682                         continue;
1683
1684                 iommu = drhd->iommu;
1685                 if (dmar_enable_qi(iommu)) {
1686                         /*
1687                          * Queued Invalidate not enabled, use Register Based
1688                          * Invalidate
1689                          */
1690                         iommu->flush.flush_context = __iommu_flush_context;
1691                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1692                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1693                                "invalidation\n",
1694                                (unsigned long long)drhd->reg_base_addr);
1695                 } else {
1696                         iommu->flush.flush_context = qi_flush_context;
1697                         iommu->flush.flush_iotlb = qi_flush_iotlb;
1698                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1699                                "invalidation\n",
1700                                (unsigned long long)drhd->reg_base_addr);
1701                 }
1702         }
1703
1704         /*
1705          * For each rmrr
1706          *   for each dev attached to rmrr
1707          *   do
1708          *     locate drhd for dev, alloc domain for dev
1709          *     allocate free domain
1710          *     allocate page table entries for rmrr
1711          *     if context not allocated for bus
1712          *           allocate and init context
1713          *           set present in root table for this bus
1714          *     init context with domain, translation etc
1715          *    endfor
1716          * endfor
1717          */
1718         for_each_rmrr_units(rmrr) {
1719                 for (i = 0; i < rmrr->devices_cnt; i++) {
1720                         pdev = rmrr->devices[i];
1721                         /* some BIOSes list non-existent devices in the DMAR table */
1722                         if (!pdev)
1723                                 continue;
1724                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1725                         if (ret)
1726                                 printk(KERN_ERR
1727                                  "IOMMU: mapping reserved region failed\n");
1728                 }
1729         }
1730
1731         iommu_prepare_gfx_mapping();
1732
1733         iommu_prepare_isa();
1734
1735         /*
1736          * for each drhd
1737          *   enable fault log
1738          *   global invalidate context cache
1739          *   global invalidate iotlb
1740          *   enable translation
1741          */
1742         for_each_drhd_unit(drhd) {
1743                 if (drhd->ignored)
1744                         continue;
1745                 iommu = drhd->iommu;
1746                 sprintf(iommu->name, "dmar%d", unit++);
1747
1748                 iommu_flush_write_buffer(iommu);
1749
1750                 ret = dmar_set_interrupt(iommu);
1751                 if (ret)
1752                         goto error;
1753
1754                 iommu_set_root_entry(iommu);
1755
1756                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
1757                                            0);
1758                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
1759                                          0);
1760                 iommu_disable_protect_mem_regions(iommu);
1761
1762                 ret = iommu_enable_translation(iommu);
1763                 if (ret)
1764                         goto error;
1765         }
1766
1767         return 0;
1768 error:
1769         for_each_drhd_unit(drhd) {
1770                 if (drhd->ignored)
1771                         continue;
1772                 iommu = drhd->iommu;
1773                 free_iommu(iommu);
1774         }
1775         return ret;
1776 }
1777
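/*
 * Number of bytes that must be mapped to cover @size bytes starting at
 * @host_addr, rounded up to a whole number of pages (the page offset of
 * the start address is included).
 */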
1778 static inline u64 aligned_size(u64 host_addr, size_t size)
1779 {
1780         u64 addr;
1781         addr = (host_addr & (~PAGE_MASK)) + size;
1782         return PAGE_ALIGN(addr);
1783 }
1784
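/*
 * Allocate an IO virtual address range of @size bytes from the domain's
 * IOVA allocator, below @end (clamped to the domain's guest address
 * width).  Returns NULL on failure.
 */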
1785 struct iova *
1786 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1787 {
1788         struct iova *piova;
1789
1790         /* Make sure it's in range */
1791         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1792         if (!size || (IOVA_START_ADDR + size > end))
1793                 return NULL;
1794
1795         piova = alloc_iova(&domain->iovad,
1796                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
1797         return piova;
1798 }
1799
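/*
 * IOVA allocation policy: devices limited to 32-bit DMA (or any device
 * when dmar_forcedac is set) allocate directly against their mask;
 * 64-bit capable devices try the 32-bit range first and fall back to
 * the full mask only if that range is exhausted.
 */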
1800 static struct iova *
1801 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1802                    size_t size, u64 dma_mask)
1803 {
1804         struct pci_dev *pdev = to_pci_dev(dev);
1805         struct iova *iova = NULL;
1806
1807         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
1808                 iova = iommu_alloc_iova(domain, size, dma_mask);
1809         else {
1810                 /*
1811                  * First try to allocate an io virtual address in
1812                  * DMA_32BIT_MASK and if that fails then try allocating
1813                  * from higher range
1814                  */
1815                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1816                 if (!iova)
1817                         iova = iommu_alloc_iova(domain, size, dma_mask);
1818         }
1819
1820         if (!iova) {
1821                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
1822                 return NULL;
1823         }
1824
1825         return iova;
1826 }
1827
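/*
 * Return the DMA-remapping domain for @pdev, allocating the domain and
 * installing the context-table entry on first use.  Returns NULL if
 * either step fails.
 */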
1828 static struct dmar_domain *
1829 get_valid_domain_for_dev(struct pci_dev *pdev)
1830 {
1831         struct dmar_domain *domain;
1832         int ret;
1833
1834         domain = get_domain_for_dev(pdev,
1835                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1836         if (!domain) {
1837                 printk(KERN_ERR
1838                         "Allocating domain for %s failed\n", pci_name(pdev));
1839                 return NULL;
1840         }
1841
1842         /* make sure context mapping is ok */
1843         if (unlikely(!domain_context_mapped(domain, pdev))) {
1844                 ret = domain_context_mapping(domain, pdev);
1845                 if (ret) {
1846                         printk(KERN_ERR
1847                                 "Domain context map for %s failed\n",
1848                                 pci_name(pdev));
1849                         return NULL;
1850                 }
1851         }
1852
1853         return domain;
1854 }
1855
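/*
 * Core single-buffer map path: pass-through devices get the physical
 * address back unchanged; otherwise allocate an IOVA, build page-table
 * entries with read/write permission derived from the DMA direction,
 * and flush the IOTLB (or the write buffer) for the new mapping.
 * Returns the bus address, or 0 on failure.
 */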
1856 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
1857                                      size_t size, int dir, u64 dma_mask)
1858 {
1859         struct pci_dev *pdev = to_pci_dev(hwdev);
1860         struct dmar_domain *domain;
1861         phys_addr_t start_paddr;
1862         struct iova *iova;
1863         int prot = 0;
1864         int ret;
1865
1866         BUG_ON(dir == DMA_NONE);
1867         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1868                 return paddr;
1869
1870         domain = get_valid_domain_for_dev(pdev);
1871         if (!domain)
1872                 return 0;
1873
1874         size = aligned_size((u64)paddr, size);
1875
1876         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
1877         if (!iova)
1878                 goto error;
1879
1880         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
1881
1882         /*
1883          * Check if DMAR supports zero-length reads on write only
1884          * Check whether the DMAR hardware supports zero-length reads on
1885          * write-only mappings.
1886         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
1887                         !cap_zlr(domain->iommu->cap))
1888                 prot |= DMA_PTE_READ;
1889         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1890                 prot |= DMA_PTE_WRITE;
1891         /*
1892          * paddr..(paddr + size) might cover only part of a page, so map the
1893          * whole page.  Note: if two parts of one page are mapped separately,
1894          * two guest addresses may map to the same host paddr; this is not a
1895          * big problem.
1896          */
1897         ret = domain_page_mapping(domain, start_paddr,
1898                 ((u64)paddr) & PAGE_MASK, size, prot);
1899         if (ret)
1900                 goto error;
1901
1902         /* it's a non-present to present mapping */
1903         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1904                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
1905         if (ret)
1906                 iommu_flush_write_buffer(domain->iommu);
1907
1908         return start_paddr + ((u64)paddr & (~PAGE_MASK));
1909
1910 error:
1911         if (iova)
1912                 __free_iova(&domain->iovad, iova);
1913         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
1914                 pci_name(pdev), size, (unsigned long long)paddr, dir);
1915         return 0;
1916 }
1917
1918 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
1919                             size_t size, int dir)
1920 {
1921         return __intel_map_single(hwdev, paddr, size, dir,
1922                                   to_pci_dev(hwdev)->dma_mask);
1923 }
1924
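/*
 * Drain the deferred-unmap tables: issue one global IOTLB flush per
 * IOMMU that has pending entries, then release the queued IOVAs.
 * Called with async_umap_flush_lock held.
 */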
1925 static void flush_unmaps(void)
1926 {
1927         int i, j;
1928
1929         timer_on = 0;
1930
1931         /* just flush them all */
1932         for (i = 0; i < g_num_of_iommus; i++) {
1933                 if (deferred_flush[i].next) {
1934                         struct intel_iommu *iommu =
1935                                 deferred_flush[i].domain[0]->iommu;
1936
1937                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1938                                                  DMA_TLB_GLOBAL_FLUSH, 0);
1939                         for (j = 0; j < deferred_flush[i].next; j++) {
1940                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
1941                                                 deferred_flush[i].iova[j]);
1942                         }
1943                         deferred_flush[i].next = 0;
1944                 }
1945         }
1946
1947         list_size = 0;
1948 }
1949
1950 static void flush_unmaps_timeout(unsigned long data)
1951 {
1952         unsigned long flags;
1953
1954         spin_lock_irqsave(&async_umap_flush_lock, flags);
1955         flush_unmaps();
1956         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1957 }
1958
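/*
 * Defer the release of an unmapped IOVA: queue it on the owning IOMMU's
 * deferred-flush table and arm a timer so the batch is flushed within
 * about 10ms, or immediately once the queue reaches HIGH_WATER_MARK.
 */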
1959 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
1960 {
1961         unsigned long flags;
1962         int next, iommu_id;
1963
1964         spin_lock_irqsave(&async_umap_flush_lock, flags);
1965         if (list_size == HIGH_WATER_MARK)
1966                 flush_unmaps();
1967
1968         iommu_id = dom->iommu->seq_id;
1969
1970         next = deferred_flush[iommu_id].next;
1971         deferred_flush[iommu_id].domain[next] = dom;
1972         deferred_flush[iommu_id].iova[next] = iova;
1973         deferred_flush[iommu_id].next++;
1974
1975         if (!timer_on) {
1976                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
1977                 timer_on = 1;
1978         }
1979         list_size++;
1980         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1981 }
1982
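/*
 * Tear down a single-buffer mapping: clear and free the page tables
 * backing it, then either flush the IOTLB and free the IOVA right away
 * (intel_iommu_strict) or queue the IOVA for a batched flush.
 */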
1983 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
1984                         int dir)
1985 {
1986         struct pci_dev *pdev = to_pci_dev(dev);
1987         struct dmar_domain *domain;
1988         unsigned long start_addr;
1989         struct iova *iova;
1990
1991         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1992                 return;
1993         domain = find_domain(pdev);
1994         BUG_ON(!domain);
1995
1996         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1997         if (!iova)
1998                 return;
1999
2000         start_addr = iova->pfn_lo << PAGE_SHIFT;
2001         size = aligned_size((u64)dev_addr, size);
2002
2003         pr_debug("Device %s unmapping: %zx@%llx\n",
2004                 pci_name(pdev), size, (unsigned long long)start_addr);
2005
2006         /* clear the page table entries for the whole range */
2007         dma_pte_clear_range(domain, start_addr, start_addr + size);
2008         /* free page tables */
2009         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2010         if (intel_iommu_strict) {
2011                 if (iommu_flush_iotlb_psi(domain->iommu,
2012                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2013                         iommu_flush_write_buffer(domain->iommu);
2014                 /* free iova */
2015                 __free_iova(&domain->iovad, iova);
2016         } else {
2017                 add_unmap(domain, iova);
2018                 /*
2019                  * Queue up the IOVA release; batching the IOTLB flush saves
2020                  * the roughly 1/6th of the CPU otherwise spent in the flush.
2021                  */
2022         }
2023 }
2024
2025 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2026                            dma_addr_t *dma_handle, gfp_t flags)
2027 {
2028         void *vaddr;
2029         int order;
2030
2031         size = PAGE_ALIGN(size);
2032         order = get_order(size);
2033         flags &= ~(GFP_DMA | GFP_DMA32);
2034
2035         vaddr = (void *)__get_free_pages(flags, order);
2036         if (!vaddr)
2037                 return NULL;
2038         memset(vaddr, 0, size);
2039
2040         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2041                                          DMA_BIDIRECTIONAL,
2042                                          hwdev->coherent_dma_mask);
2043         if (*dma_handle)
2044                 return vaddr;
2045         free_pages((unsigned long)vaddr, order);
2046         return NULL;
2047 }
2048
2049 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2050                          dma_addr_t dma_handle)
2051 {
2052         int order;
2053
2054         size = PAGE_ALIGN(size);
2055         order = get_order(size);
2056
2057         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2058         free_pages((unsigned long)vaddr, order);
2059 }
2060
2061 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2062
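/*
 * Tear down a scatterlist mapping: recompute the total mapped size from
 * the entries, clear and free the page tables covering the range, flush
 * the IOTLB and release the IOVA.
 */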
2063 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2064                     int nelems, int dir)
2065 {
2066         int i;
2067         struct pci_dev *pdev = to_pci_dev(hwdev);
2068         struct dmar_domain *domain;
2069         unsigned long start_addr;
2070         struct iova *iova;
2071         size_t size = 0;
2072         void *addr;
2073         struct scatterlist *sg;
2074
2075         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2076                 return;
2077
2078         domain = find_domain(pdev);
2079
2080         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2081         if (!iova)
2082                 return;
2083         for_each_sg(sglist, sg, nelems, i) {
2084                 addr = SG_ENT_VIRT_ADDRESS(sg);
2085                 size += aligned_size((u64)addr, sg->length);
2086         }
2087
2088         start_addr = iova->pfn_lo << PAGE_SHIFT;
2089
2090         /* clear the page table entries for the whole range */
2091         dma_pte_clear_range(domain, start_addr, start_addr + size);
2092         /* free page tables */
2093         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2094
2095         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2096                         size >> VTD_PAGE_SHIFT, 0))
2097                 iommu_flush_write_buffer(domain->iommu);
2098
2099         /* free iova */
2100         __free_iova(&domain->iovad, iova);
2101 }
2102
2103 static int intel_nontranslate_map_sg(struct device *hddev,
2104         struct scatterlist *sglist, int nelems, int dir)
2105 {
2106         int i;
2107         struct scatterlist *sg;
2108
2109         for_each_sg(sglist, sg, nelems, i) {
2110                 BUG_ON(!sg_page(sg));
2111                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2112                 sg->dma_length = sg->length;
2113         }
2114         return nelems;
2115 }
2116
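/*
 * Map a scatterlist: pass-through devices simply use the physical
 * addresses; otherwise allocate a single IOVA range large enough for
 * all entries and map each entry contiguously into it, undoing the
 * partial mapping if any step fails.  Returns the number of mapped
 * entries, or 0 on failure.
 */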
2117 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2118                  int dir)
2119 {
2120         void *addr;
2121         int i;
2122         struct pci_dev *pdev = to_pci_dev(hwdev);
2123         struct dmar_domain *domain;
2124         size_t size = 0;
2125         int prot = 0;
2126         size_t offset = 0;
2127         struct iova *iova = NULL;
2128         int ret;
2129         struct scatterlist *sg;
2130         unsigned long start_addr;
2131
2132         BUG_ON(dir == DMA_NONE);
2133         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2134                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2135
2136         domain = get_valid_domain_for_dev(pdev);
2137         if (!domain)
2138                 return 0;
2139
2140         for_each_sg(sglist, sg, nelems, i) {
2141                 addr = SG_ENT_VIRT_ADDRESS(sg);
2142                 addr = (void *)virt_to_phys(addr);
2143                 size += aligned_size((u64)addr, sg->length);
2144         }
2145
2146         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2147         if (!iova) {
2148                 sglist->dma_length = 0;
2149                 return 0;
2150         }
2151
2152         /*
2153          * Check whether the DMAR hardware supports zero-length reads on
2154          * write-only mappings.
2155          */
2156         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2157                         !cap_zlr(domain->iommu->cap))
2158                 prot |= DMA_PTE_READ;
2159         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2160                 prot |= DMA_PTE_WRITE;
2161
2162         start_addr = iova->pfn_lo << PAGE_SHIFT;
2163         offset = 0;
2164         for_each_sg(sglist, sg, nelems, i) {
2165                 addr = SG_ENT_VIRT_ADDRESS(sg);
2166                 addr = (void *)virt_to_phys(addr);
2167                 size = aligned_size((u64)addr, sg->length);
2168                 ret = domain_page_mapping(domain, start_addr + offset,
2169                         ((u64)addr) & PAGE_MASK,
2170                         size, prot);
2171                 if (ret) {
2172                         /* clear the pages mapped so far */
2173                         dma_pte_clear_range(domain, start_addr,
2174                                   start_addr + offset);
2175                         /* free page tables */
2176                         dma_pte_free_pagetable(domain, start_addr,
2177                                   start_addr + offset);
2178                         /* free iova */
2179                         __free_iova(&domain->iovad, iova);
2180                         return 0;
2181                 }
2182                 sg->dma_address = start_addr + offset +
2183                                 ((u64)addr & (~PAGE_MASK));
2184                 sg->dma_length = sg->length;
2185                 offset += size;
2186         }
2187
2188         /* it's a non-present to present mapping */
2189         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2190                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2191                 iommu_flush_write_buffer(domain->iommu);
2192         return nelems;
2193 }
2194
2195 static struct dma_mapping_ops intel_dma_ops = {
2196         .alloc_coherent = intel_alloc_coherent,
2197         .free_coherent = intel_free_coherent,
2198         .map_single = intel_map_single,
2199         .unmap_single = intel_unmap_single,
2200         .map_sg = intel_map_sg,
2201         .unmap_sg = intel_unmap_sg,
2202 };
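/*
 * Drivers normally do not call these entry points directly; they use
 * the generic DMA API, which dispatches through dma_ops once
 * intel_iommu_init() has installed this structure.  A minimal sketch of
 * typical driver usage (hypothetical names: pdev, buf and len belong to
 * the caller and are not defined here):
 *
 *	dma_addr_t bus;
 *
 *	bus = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
 *	if (dma_mapping_error(&pdev->dev, bus))
 *		return -EIO;
 *	... let the device DMA to/from 'bus' ...
 *	dma_unmap_single(&pdev->dev, bus, len, DMA_TO_DEVICE);
 */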
2203
2204 static inline int iommu_domain_cache_init(void)
2205 {
2206         int ret = 0;
2207
2208         iommu_domain_cache = kmem_cache_create("iommu_domain",
2209                                          sizeof(struct dmar_domain),
2210                                          0,
2211                                          SLAB_HWCACHE_ALIGN,
2213                                          NULL);
2214         if (!iommu_domain_cache) {
2215                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2216                 ret = -ENOMEM;
2217         }
2218
2219         return ret;
2220 }
2221
2222 static inline int iommu_devinfo_cache_init(void)
2223 {
2224         int ret = 0;
2225
2226         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2227                                          sizeof(struct device_domain_info),
2228                                          0,
2229                                          SLAB_HWCACHE_ALIGN,
2230                                          NULL);
2231         if (!iommu_devinfo_cache) {
2232                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2233                 ret = -ENOMEM;
2234         }
2235
2236         return ret;
2237 }
2238
2239 static inline int iommu_iova_cache_init(void)
2240 {
2241         int ret = 0;
2242
2243         iommu_iova_cache = kmem_cache_create("iommu_iova",
2244                                          sizeof(struct iova),
2245                                          0,
2246                                          SLAB_HWCACHE_ALIGN,
2247                                          NULL);
2248         if (!iommu_iova_cache) {
2249                 printk(KERN_ERR "Couldn't create iova cache\n");
2250                 ret = -ENOMEM;
2251         }
2252
2253         return ret;
2254 }
2255
2256 static int __init iommu_init_mempool(void)
2257 {
2258         int ret;
2259         ret = iommu_iova_cache_init();
2260         if (ret)
2261                 return ret;
2262
2263         ret = iommu_domain_cache_init();
2264         if (ret)
2265                 goto domain_error;
2266
2267         ret = iommu_devinfo_cache_init();
2268         if (!ret)
2269                 return ret;
2270
2271         kmem_cache_destroy(iommu_domain_cache);
2272 domain_error:
2273         kmem_cache_destroy(iommu_iova_cache);
2274
2275         return -ENOMEM;
2276 }
2277
2278 static void __init iommu_exit_mempool(void)
2279 {
2280         kmem_cache_destroy(iommu_devinfo_cache);
2281         kmem_cache_destroy(iommu_domain_cache);
2282         kmem_cache_destroy(iommu_iova_cache);
2283
2284 }
2285
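/*
 * Work out which DMAR units can be ignored: units that cover no PCI
 * devices at all, and (unless dmar_map_gfx is set) units that cover
 * only graphics devices, whose devices are then flagged to bypass the
 * IOMMU entirely.
 */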
2286 static void __init init_no_remapping_devices(void)
2287 {
2288         struct dmar_drhd_unit *drhd;
2289
2290         for_each_drhd_unit(drhd) {
2291                 if (!drhd->include_all) {
2292                         int i;
2293                         for (i = 0; i < drhd->devices_cnt; i++)
2294                                 if (drhd->devices[i] != NULL)
2295                                         break;
2296                         /* ignore DMAR unit if no pci devices exist */
2297                         /* ignore this DMAR unit if no PCI devices exist under it */
2298                                 drhd->ignored = 1;
2299                 }
2300         }
2301
2302         if (dmar_map_gfx)
2303                 return;
2304
2305         for_each_drhd_unit(drhd) {
2306                 int i;
2307                 if (drhd->ignored || drhd->include_all)
2308                         continue;
2309
2310                 for (i = 0; i < drhd->devices_cnt; i++)
2311                         if (drhd->devices[i] &&
2312                                 !IS_GFX_DEVICE(drhd->devices[i]))
2313                                 break;
2314
2315                 if (i < drhd->devices_cnt)
2316                         continue;
2317
2318                 /* bypass IOMMU if it is just for gfx devices */
2319                 drhd->ignored = 1;
2320                 for (i = 0; i < drhd->devices_cnt; i++) {
2321                         if (!drhd->devices[i])
2322                                 continue;
2323                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2324                 }
2325         }
2326 }
2327
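/*
 * Entry point for VT-d DMA remapping: parse the DMAR table and device
 * scopes, bail out if the IOMMU is disabled or swiotlb is in use, set
 * up the memory pools and reserved IOVA ranges, initialize every DRHD
 * unit via init_dmars(), and finally switch dma_ops over to the Intel
 * IOMMU implementation.
 */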
2328 int __init intel_iommu_init(void)
2329 {
2330         int ret = 0;
2331
2332         if (dmar_table_init())
2333                 return  -ENODEV;
2334
2335         if (dmar_dev_scope_init())
2336                 return  -ENODEV;
2337
2338         /*
2339          * Check whether DMA-remapping initialization is needed now.
2340          * The initialization above is also used by interrupt remapping.
2341          */
2342         if (no_iommu || swiotlb || dmar_disabled)
2343                 return -ENODEV;
2344
2345         iommu_init_mempool();
2346         dmar_init_reserved_ranges();
2347
2348         init_no_remapping_devices();
2349
2350         ret = init_dmars();
2351         if (ret) {
2352                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2353                 put_iova_domain(&reserved_iova_list);
2354                 iommu_exit_mempool();
2355                 return ret;
2356         }
2357         printk(KERN_INFO
2358         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2359
2360         init_timer(&unmap_timer);
2361         force_iommu = 1;
2362         dma_ops = &intel_dma_ops;
2363         return 0;
2364 }
2365
2366 void intel_iommu_domain_exit(struct dmar_domain *domain)
2367 {
2368         u64 end;
2369
2370         /* Domain 0 is reserved, so don't process it */
2371         if (!domain)
2372                 return;
2373
2374         end = DOMAIN_MAX_ADDR(domain->gaw);
2375         end = end & (~VTD_PAGE_MASK);
2376
2377         /* clear ptes */
2378         dma_pte_clear_range(domain, 0, end);
2379
2380         /* free page tables */
2381         dma_pte_free_pagetable(domain, 0, end);
2382
2383         iommu_free_domain(domain);
2384         free_domain_mem(domain);
2385 }
2386 EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2387
2388 struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2389 {
2390         struct dmar_drhd_unit *drhd;
2391         struct dmar_domain *domain;
2392         struct intel_iommu *iommu;
2393
2394         drhd = dmar_find_matched_drhd_unit(pdev);
2395         if (!drhd) {
2396                 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2397                 return NULL;
2398         }
2399
2400         iommu = drhd->iommu;
2401         if (!iommu) {
2402                 printk(KERN_ERR
2403                         "intel_iommu_domain_alloc: iommu == NULL\n");
2404                 return NULL;
2405         }
2406         domain = iommu_alloc_domain(iommu);
2407         if (!domain) {
2408                 printk(KERN_ERR
2409                         "intel_iommu_domain_alloc: domain == NULL\n");
2410                 return NULL;
2411         }
2412         if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2413                 printk(KERN_ERR
2414                         "intel_iommu_domain_alloc: domain_init() failed\n");
2415                 intel_iommu_domain_exit(domain);
2416                 return NULL;
2417         }
2418         return domain;
2419 }
2420 EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2421
2422 int intel_iommu_context_mapping(
2423         struct dmar_domain *domain, struct pci_dev *pdev)
2424 {
2425         int rc;
2426         rc = domain_context_mapping(domain, pdev);
2427         return rc;
2428 }
2429 EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2430
2431 int intel_iommu_page_mapping(
2432         struct dmar_domain *domain, dma_addr_t iova,
2433         u64 hpa, size_t size, int prot)
2434 {
2435         int rc;
2436         rc = domain_page_mapping(domain, iova, hpa, size, prot);
2437         return rc;
2438 }
2439 EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
2440
2441 void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2442 {
2443         detach_domain_for_dev(domain, bus, devfn);
2444 }
2445 EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2446
2447 struct dmar_domain *
2448 intel_iommu_find_domain(struct pci_dev *pdev)
2449 {
2450         return find_domain(pdev);
2451 }
2452 EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2453
2454 int intel_iommu_found(void)
2455 {
2456         return g_num_of_iommus;
2457 }
2458 EXPORT_SYMBOL_GPL(intel_iommu_found);
2459
2460 u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2461 {
2462         struct dma_pte *pte;
2463         u64 pfn;
2464
2465         pfn = 0;
2466         pte = addr_to_dma_pte(domain, iova);
2467
2468         if (pte)
2469                 pfn = dma_pte_addr(*pte);
2470
2471         return pfn >> VTD_PAGE_SHIFT;
2472 }
2473 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);
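/*
 * The intel_iommu_* exports above form a small domain-management
 * interface for code outside this file (device-assignment support, for
 * example).  A minimal sketch of the expected call sequence, assuming a
 * caller that already has a struct pci_dev *pdev and a guest-physical
 * range gpa/hpa/size (hypothetical names, not defined here):
 *
 *	struct dmar_domain *domain;
 *
 *	domain = intel_iommu_domain_alloc(pdev);
 *	if (!domain)
 *		return -ENODEV;
 *	if (intel_iommu_context_mapping(domain, pdev))
 *		goto fail;
 *	if (intel_iommu_page_mapping(domain, gpa, hpa, size,
 *				     DMA_PTE_READ | DMA_PTE_WRITE))
 *		goto fail;
 *	...
 *	intel_iommu_detach_dev(domain, pdev->bus->number, pdev->devfn);
 *	intel_iommu_domain_exit(domain);
 */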