arch/powerpc/platforms/pseries/iommu.c
/*
 * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
 *
 * Rewrite, cleanup:
 *
 * Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation
 * Copyright (C) 2006 Olof Johansson <olof@lixom.net>
 *
 * Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR.
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/memblock.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/crash_dump.h>
#include <linux/memory.h>
#include <linux/of.h>
#include <linux/iommu.h>
#include <linux/rculist.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/iommu.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/tce.h>
#include <asm/ppc-pci.h>
#include <asm/udbg.h>
#include <asm/mmzone.h>
#include <asm/plpar_wrappers.h>

#include "pseries.h"

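/*
 * Allocate an iommu_table_group, its first iommu_table and the group
 * link tying them together, all on the given NUMA node.  On any
 * allocation failure the partial allocations are freed and NULL is
 * returned.
 */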
static struct iommu_table_group *iommu_pseries_alloc_group(int node)
{
        struct iommu_table_group *table_group = NULL;
        struct iommu_table *tbl = NULL;
        struct iommu_table_group_link *tgl = NULL;

        table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
                           node);
        if (!table_group)
                goto fail_exit;

        tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
        if (!tbl)
                goto fail_exit;

        tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL,
                        node);
        if (!tgl)
                goto fail_exit;

        INIT_LIST_HEAD_RCU(&tbl->it_group_list);
        tgl->table_group = table_group;
        list_add_rcu(&tgl->next, &tbl->it_group_list);

        table_group->tables[0] = tbl;

        return table_group;

fail_exit:
        kfree(tgl);
        kfree(table_group);
        kfree(tbl);

        return NULL;
}

static void iommu_pseries_free_group(struct iommu_table_group *table_group,
                const char *node_name)
{
        struct iommu_table *tbl;
#ifdef CONFIG_IOMMU_API
        struct iommu_table_group_link *tgl;
#endif

        if (!table_group)
                return;

        tbl = table_group->tables[0];
#ifdef CONFIG_IOMMU_API
        tgl = list_first_entry_or_null(&tbl->it_group_list,
                        struct iommu_table_group_link, next);

        WARN_ON_ONCE(!tgl);
        if (tgl) {
                list_del_rcu(&tgl->next);
                kfree(tgl);
        }
        if (table_group->group) {
                iommu_group_put(table_group->group);
                BUG_ON(table_group->group);
        }
#endif
        iommu_free_table(tbl, node_name);

        kfree(table_group);
}

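/*
 * Software invalidation of TCEs: write the real addresses of the
 * updated TCE entries, one cacheline's worth at a time, to the MMIO
 * invalidate register stashed in tbl->it_index by
 * iommu_table_setparms().  A non-zero it_busno selects the shifted
 * format described by the "linux,tce-sw-invalidate-info" property.
 */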
static void tce_invalidate_pSeries_sw(struct iommu_table *tbl,
                                      __be64 *startp, __be64 *endp)
{
        u64 __iomem *invalidate = (u64 __iomem *)tbl->it_index;
        unsigned long start, end, inc;

        start = __pa(startp);
        end = __pa(endp);
        inc = L1_CACHE_BYTES; /* invalidate a cacheline of TCEs at a time */

        /* If this is non-zero, change the format.  We shift the
         * address and or in the magic from the device tree. */
        if (tbl->it_busno) {
                start <<= 12;
                end <<= 12;
                inc <<= 12;
                start |= tbl->it_busno;
                end |= tbl->it_busno;
        }

        end |= inc - 1; /* round up end to be different than start */

        mb(); /* Make sure TCEs in memory are written */
        while (start <= end) {
                out_be64(invalidate, start);
                start += inc;
        }
}

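/*
 * Non-LPAR case: build TCEs by writing directly into the table in
 * memory, then kick the software invalidation hook if the bridge
 * requires it.
 */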
static int tce_build_pSeries(struct iommu_table *tbl, long index,
                              long npages, unsigned long uaddr,
                              enum dma_data_direction direction,
                              struct dma_attrs *attrs)
{
        u64 proto_tce;
        __be64 *tcep, *tces;
        u64 rpn;

        proto_tce = TCE_PCI_READ; /* Read allowed */

        if (direction != DMA_TO_DEVICE)
                proto_tce |= TCE_PCI_WRITE;

        tces = tcep = ((__be64 *)tbl->it_base) + index;

        while (npages--) {
                /* can't move this out since we might cross MEMBLOCK boundary */
                rpn = __pa(uaddr) >> TCE_SHIFT;
                *tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT);

                uaddr += TCE_PAGE_SIZE;
                tcep++;
        }

        if (tbl->it_type & TCE_PCI_SWINV_CREATE)
                tce_invalidate_pSeries_sw(tbl, tces, tcep - 1);
        return 0;
}


static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)
{
        __be64 *tcep, *tces;

        tces = tcep = ((__be64 *)tbl->it_base) + index;

        while (npages--)
                *(tcep++) = 0;

        if (tbl->it_type & TCE_PCI_SWINV_FREE)
                tce_invalidate_pSeries_sw(tbl, tces, tcep - 1);
}

static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
{
        __be64 *tcep;

        tcep = ((__be64 *)tbl->it_base) + index;

        return be64_to_cpu(*tcep);
}

static void tce_free_pSeriesLP(struct iommu_table*, long, long);
static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);

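/*
 * LPAR case: map one page at a time through the H_PUT_TCE hcall.  On
 * H_NOT_ENOUGH_RESOURCES the entries built so far are torn down again
 * so the caller sees an all-or-nothing result.
 */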
static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
                                long npages, unsigned long uaddr,
                                enum dma_data_direction direction,
                                struct dma_attrs *attrs)
{
        u64 rc = 0;
        u64 proto_tce, tce;
        u64 rpn;
        int ret = 0;
        long tcenum_start = tcenum, npages_start = npages;

        rpn = __pa(uaddr) >> TCE_SHIFT;
        proto_tce = TCE_PCI_READ;
        if (direction != DMA_TO_DEVICE)
                proto_tce |= TCE_PCI_WRITE;

        while (npages--) {
                tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
                rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce);

                if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
                        ret = (int)rc;
                        tce_free_pSeriesLP(tbl, tcenum_start,
                                           (npages_start - (npages + 1)));
                        break;
                }

                if (rc && printk_ratelimit()) {
                        printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
                        printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
                        printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
                        printk("\ttce val = 0x%llx\n", tce);
                        dump_stack();
                }

                tcenum++;
                rpn++;
        }
        return ret;
}

static DEFINE_PER_CPU(__be64 *, tce_page);

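/*
 * Faster LPAR path: stage up to a page of TCEs in the per-cpu
 * tce_page buffer and hand them to firmware in batches via
 * H_PUT_TCE_INDIRECT.  Falls back to the single-TCE loop for
 * one-page mappings or when the MULTITCE firmware feature is absent.
 */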
static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
                                     long npages, unsigned long uaddr,
                                     enum dma_data_direction direction,
                                     struct dma_attrs *attrs)
{
        u64 rc = 0;
        u64 proto_tce;
        __be64 *tcep;
        u64 rpn;
        long l, limit;
        long tcenum_start = tcenum, npages_start = npages;
        int ret = 0;
        unsigned long flags;

        if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) {
                return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
                                           direction, attrs);
        }

        local_irq_save(flags);  /* to protect tcep and the page behind it */

        tcep = __this_cpu_read(tce_page);

        /* This is safe to do since interrupts are off when we're called
         * from iommu_alloc{,_sg}()
         */
        if (!tcep) {
                tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
                /* If allocation fails, fall back to the loop implementation */
                if (!tcep) {
                        local_irq_restore(flags);
                        return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
                                            direction, attrs);
                }
                __this_cpu_write(tce_page, tcep);
        }

        rpn = __pa(uaddr) >> TCE_SHIFT;
        proto_tce = TCE_PCI_READ;
        if (direction != DMA_TO_DEVICE)
                proto_tce |= TCE_PCI_WRITE;

        /* We can map max one pageful of TCEs at a time */
        do {
                /*
                 * Set up the page with TCE data, looping through and setting
                 * the values.
                 */
                limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE);

                for (l = 0; l < limit; l++) {
                        tcep[l] = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT);
                        rpn++;
                }

                rc = plpar_tce_put_indirect((u64)tbl->it_index,
                                            (u64)tcenum << 12,
                                            (u64)__pa(tcep),
                                            limit);

                npages -= limit;
                tcenum += limit;
        } while (npages > 0 && !rc);

        local_irq_restore(flags);

        if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
                ret = (int)rc;
                tce_freemulti_pSeriesLP(tbl, tcenum_start,
                                        (npages_start - (npages + limit)));
                return ret;
        }

        if (rc && printk_ratelimit()) {
                printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
                printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
                printk("\tnpages  = 0x%llx\n", (u64)npages);
                printk("\ttce[0] val = 0x%llx\n", tcep[0]);
                dump_stack();
        }
        return ret;
}

static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
{
        u64 rc;

        while (npages--) {
                rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, 0);

                if (rc && printk_ratelimit()) {
                        printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
                        printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
                        printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
                        dump_stack();
                }

                tcenum++;
        }
}


static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
{
        u64 rc;

        if (!firmware_has_feature(FW_FEATURE_MULTITCE))
                return tce_free_pSeriesLP(tbl, tcenum, npages);

        rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);

        if (rc && printk_ratelimit()) {
                printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
                printk("\trc      = %lld\n", rc);
                printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
                printk("\tnpages  = 0x%llx\n", (u64)npages);
                dump_stack();
        }
}

static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
{
        u64 rc;
        unsigned long tce_ret;

        rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret);

        if (rc && printk_ratelimit()) {
                printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc);
                printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
                printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
                dump_stack();
        }

        return tce_ret;
}

/* this is compatible with cells for the device tree property */
struct dynamic_dma_window_prop {
        __be32  liobn;          /* tce table number */
        __be64  dma_base;       /* address hi,lo */
        __be32  tce_shift;      /* ilog2(tce_page_size) */
        __be32  window_shift;   /* ilog2(tce_window_size) */
};

struct direct_window {
        struct device_node *device;
        const struct dynamic_dma_window_prop *prop;
        struct list_head list;
};

/* Dynamic DMA Window support */
struct ddw_query_response {
        u32 windows_available;
        u32 largest_available_block;
        u32 page_size;
        u32 migration_capable;
};

struct ddw_create_response {
        u32 liobn;
        u32 addr_hi;
        u32 addr_lo;
};

static LIST_HEAD(direct_window_list);
/* prevents races between memory on/offline and window creation */
static DEFINE_SPINLOCK(direct_window_list_lock);
/* protects initializing window twice for same device */
static DEFINE_MUTEX(direct_window_init_mutex);
#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"

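/*
 * Clear a pfn range in a dynamic DMA window, up to 512 TCEs per
 * H_STUFF_TCE call.  Used both when removing a window and when
 * memory goes offline.
 */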
static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
                                        unsigned long num_pfn, const void *arg)
{
        const struct dynamic_dma_window_prop *maprange = arg;
        int rc;
        u64 tce_size, num_tce, dma_offset, next;
        u32 tce_shift;
        long limit;

        tce_shift = be32_to_cpu(maprange->tce_shift);
        tce_size = 1ULL << tce_shift;
        next = start_pfn << PAGE_SHIFT;
        num_tce = num_pfn << PAGE_SHIFT;

        /* round back to the beginning of the tce page size */
        num_tce += next & (tce_size - 1);
        next &= ~(tce_size - 1);

        /* convert to number of tces */
        num_tce |= tce_size - 1;
        num_tce >>= tce_shift;

        do {
                /* Clear up to 512 TCEs at a time */
                limit = min_t(long, num_tce, 512);
                dma_offset = next + be64_to_cpu(maprange->dma_base);

                rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),
                                             dma_offset,
                                             0, limit);
                next += limit * tce_size;
                num_tce -= limit;
        } while (num_tce > 0 && !rc);

        return rc;
}

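/*
 * Map a pfn range 1:1 into a dynamic DMA window via
 * H_PUT_TCE_INDIRECT, reusing the per-cpu TCE staging page.  Errors
 * are left for the caller to clean up by clearing the whole range.
 */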
static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
                                        unsigned long num_pfn, const void *arg)
{
        const struct dynamic_dma_window_prop *maprange = arg;
        u64 tce_size, num_tce, dma_offset, next, proto_tce, liobn;
        __be64 *tcep;
        u32 tce_shift;
        u64 rc = 0;
        long l, limit;

        local_irq_disable();    /* to protect tcep and the page behind it */
        tcep = __this_cpu_read(tce_page);

        if (!tcep) {
                tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
                if (!tcep) {
                        local_irq_enable();
                        return -ENOMEM;
                }
                __this_cpu_write(tce_page, tcep);
        }

        proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;

        liobn = (u64)be32_to_cpu(maprange->liobn);
        tce_shift = be32_to_cpu(maprange->tce_shift);
        tce_size = 1ULL << tce_shift;
        next = start_pfn << PAGE_SHIFT;
        num_tce = num_pfn << PAGE_SHIFT;

        /* round back to the beginning of the tce page size */
        num_tce += next & (tce_size - 1);
        next &= ~(tce_size - 1);

        /* convert to number of tces */
        num_tce |= tce_size - 1;
        num_tce >>= tce_shift;

        /* We can map max one pageful of TCEs at a time */
        do {
                /*
                 * Set up the page with TCE data, looping through and setting
                 * the values.
                 */
                limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE);
                dma_offset = next + be64_to_cpu(maprange->dma_base);

                for (l = 0; l < limit; l++) {
                        tcep[l] = cpu_to_be64(proto_tce | next);
                        next += tce_size;
                }

                rc = plpar_tce_put_indirect(liobn,
                                            dma_offset,
                                            (u64)__pa(tcep),
                                            limit);

                num_tce -= limit;
        } while (num_tce > 0 && !rc);

        /* error cleanup: caller will clear whole range */

        local_irq_enable();
        return rc;
}

static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
                unsigned long num_pfn, void *arg)
{
        return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
}

#ifdef CONFIG_PCI
static void iommu_table_setparms(struct pci_controller *phb,
                                 struct device_node *dn,
                                 struct iommu_table *tbl)
{
        struct device_node *node;
        const unsigned long *basep, *sw_inval;
        const u32 *sizep;

        node = phb->dn;

        basep = of_get_property(node, "linux,tce-base", NULL);
        sizep = of_get_property(node, "linux,tce-size", NULL);
        if (basep == NULL || sizep == NULL) {
                printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %s has "
                                "missing tce entries !\n", dn->full_name);
                return;
        }

        tbl->it_base = (unsigned long)__va(*basep);

        if (!is_kdump_kernel())
                memset((void *)tbl->it_base, 0, *sizep);

        tbl->it_busno = phb->bus->number;
        tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;

        /* Units of tce entries */
        tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift;

        /* Test if we are going over 2GB of DMA space */
        if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
                udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
                panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
        }

        phb->dma_window_base_cur += phb->dma_window_size;

        /* Set the tce table size - measured in entries */
        tbl->it_size = phb->dma_window_size >> tbl->it_page_shift;

        tbl->it_index = 0;
        tbl->it_blocksize = 16;
        tbl->it_type = TCE_PCI;

        sw_inval = of_get_property(node, "linux,tce-sw-invalidate-info", NULL);
        if (sw_inval) {
                /*
                 * This property contains information on how to
                 * invalidate the TCE entry.  The first property is
                 * the base MMIO address used to invalidate entries.
                 * The second property tells us the format of the TCE
                 * invalidate (whether it needs to be shifted) and
                 * some magic routing info to add to our invalidate
                 * command.
                 */
                tbl->it_index = (unsigned long) ioremap(sw_inval[0], 8);
                tbl->it_busno = sw_inval[1]; /* overload this with magic */
                tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE;
        }
}

/*
 * iommu_table_setparms_lpar
 *
 * Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
 */
static void iommu_table_setparms_lpar(struct pci_controller *phb,
                                      struct device_node *dn,
                                      struct iommu_table *tbl,
                                      const __be32 *dma_window)
{
        unsigned long offset, size;

        of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size);

        tbl->it_busno = phb->bus->number;
        tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
        tbl->it_base   = 0;
        tbl->it_blocksize  = 16;
        tbl->it_type = TCE_PCI;
        tbl->it_offset = offset >> tbl->it_page_shift;
        tbl->it_size = size >> tbl->it_page_shift;
}

struct iommu_table_ops iommu_table_pseries_ops = {
        .set = tce_build_pSeries,
        .clear = tce_free_pSeries,
        .get = tce_get_pseries
};

static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
{
        struct device_node *dn;
        struct iommu_table *tbl;
        struct device_node *isa_dn, *isa_dn_orig;
        struct device_node *tmp;
        struct pci_dn *pci;
        int children;

        dn = pci_bus_to_OF_node(bus);

        pr_debug("pci_dma_bus_setup_pSeries: setting up bus %s\n", dn->full_name);

        if (bus->self) {
                /* This is not a root bus, any setup will be done for the
                 * device-side of the bridge in iommu_dev_setup_pSeries().
                 */
                return;
        }
        pci = PCI_DN(dn);

        /* Check if the ISA bus on the system is under
         * this PHB.
         */
        isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa");

        while (isa_dn && isa_dn != dn)
                isa_dn = isa_dn->parent;

        of_node_put(isa_dn_orig);

        /* Count number of direct PCI children of the PHB. */
        for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling)
                children++;

        pr_debug("Children: %d\n", children);

        /* Calculate amount of DMA window per slot. Each window must be
         * a power of two (due to pci_alloc_consistent requirements).
         *
         * Keep 256MB aside for PHBs with ISA.
         */

        if (!isa_dn) {
                /* No ISA/IDE - just set window size and return */
                pci->phb->dma_window_size = 0x80000000ul; /* To be divided */

                while (pci->phb->dma_window_size * children > 0x80000000ul)
                        pci->phb->dma_window_size >>= 1;
                pr_debug("No ISA/IDE, window size is 0x%llx\n",
                         pci->phb->dma_window_size);
                pci->phb->dma_window_base_cur = 0;

                return;
        }

        /* If we have ISA, then we probably have an IDE
         * controller too. Allocate a 128MB table but
         * skip the first 128MB to avoid stepping on ISA
         * space.
         */
        pci->phb->dma_window_size = 0x8000000ul;
        pci->phb->dma_window_base_cur = 0x8000000ul;

        pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
        tbl = pci->table_group->tables[0];

        iommu_table_setparms(pci->phb, dn, tbl);
        tbl->it_ops = &iommu_table_pseries_ops;
        iommu_init_table(tbl, pci->phb->node);
        iommu_register_group(pci->table_group, pci_domain_nr(bus), 0);

        /* Divide the rest (1.75GB) among the children */
        pci->phb->dma_window_size = 0x80000000ul;
        while (pci->phb->dma_window_size * children > 0x70000000ul)
                pci->phb->dma_window_size >>= 1;

        pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
}

struct iommu_table_ops iommu_table_lpar_multi_ops = {
        .set = tce_buildmulti_pSeriesLP,
        .clear = tce_freemulti_pSeriesLP,
        .get = tce_get_pSeriesLP
};

static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
{
        struct iommu_table *tbl;
        struct device_node *dn, *pdn;
        struct pci_dn *ppci;
        const __be32 *dma_window = NULL;

        dn = pci_bus_to_OF_node(bus);

        pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %s\n",
                 dn->full_name);

        /* Find nearest ibm,dma-window, walking up the device tree */
        for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
                dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
                if (dma_window != NULL)
                        break;
        }

        if (dma_window == NULL) {
                pr_debug("  no ibm,dma-window property !\n");
                return;
        }

        ppci = PCI_DN(pdn);

        pr_debug("  parent is %s, iommu_table: 0x%p\n",
                 pdn->full_name, ppci->table_group);

        if (!ppci->table_group) {
                ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
                tbl = ppci->table_group->tables[0];
                iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
                tbl->it_ops = &iommu_table_lpar_multi_ops;
                iommu_init_table(tbl, ppci->phb->node);
                iommu_register_group(ppci->table_group,
                                pci_domain_nr(bus), 0);
                pr_debug("  created table: %p\n", ppci->table_group);
        }
}


static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
{
        struct device_node *dn;
        struct iommu_table *tbl;

        pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev));

        dn = dev->dev.of_node;

        /* If we're the direct child of a root bus, then we need to allocate
         * an iommu table ourselves. The bus setup code should have setup
         * the window sizes already.
         */
        if (!dev->bus->self) {
                struct pci_controller *phb = PCI_DN(dn)->phb;

                pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
                PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node);
                tbl = PCI_DN(dn)->table_group->tables[0];
                iommu_table_setparms(phb, dn, tbl);
                tbl->it_ops = &iommu_table_pseries_ops;
                iommu_init_table(tbl, phb->node);
                iommu_register_group(PCI_DN(dn)->table_group,
                                pci_domain_nr(phb->bus), 0);
                set_iommu_table_base(&dev->dev, tbl);
                iommu_add_device(&dev->dev);
                return;
        }

        /* If this device is further down the bus tree, search upwards until
         * an already allocated iommu table is found and use that.
         */

        while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)
                dn = dn->parent;

        if (dn && PCI_DN(dn)) {
                set_iommu_table_base(&dev->dev,
                                PCI_DN(dn)->table_group->tables[0]);
                iommu_add_device(&dev->dev);
        } else
                printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
                       pci_name(dev));
}

static int __read_mostly disable_ddw;

static int __init disable_ddw_setup(char *str)
{
        disable_ddw = 1;
        printk(KERN_INFO "ppc iommu: disabling ddw.\n");

        return 0;
}

early_param("disable_ddw", disable_ddw_setup);

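/*
 * Tear down a dynamic DMA window: clear its TCEs, release the window
 * through the ibm,remove-pe-dma-window RTAS call and, if requested,
 * delete the DIRECT64_PROPNAME property that described it.
 */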
static void remove_ddw(struct device_node *np, bool remove_prop)
{
        struct dynamic_dma_window_prop *dwp;
        struct property *win64;
        u32 ddw_avail[3];
        u64 liobn;
        int ret = 0;

        ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
                                         &ddw_avail[0], 3);

        win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
        if (!win64)
                return;

        if (ret || win64->length < sizeof(*dwp))
                goto delprop;

        dwp = win64->value;
        liobn = (u64)be32_to_cpu(dwp->liobn);

        /* clear the whole window, note the arg is in kernel pages */
        ret = tce_clearrange_multi_pSeriesLP(0,
                1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);
        if (ret)
                pr_warning("%s failed to clear tces in window.\n",
                         np->full_name);
        else
                pr_debug("%s successfully cleared tces in window.\n",
                         np->full_name);

        ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn);
        if (ret)
                pr_warning("%s: failed to remove direct window: rtas returned "
                        "%d to ibm,remove-pe-dma-window(%x) %llx\n",
                        np->full_name, ret, ddw_avail[2], liobn);
        else
                pr_debug("%s: successfully removed direct window: rtas returned "
                        "%d to ibm,remove-pe-dma-window(%x) %llx\n",
                        np->full_name, ret, ddw_avail[2], liobn);

delprop:
        if (remove_prop)
                ret = of_remove_property(np, win64);
        if (ret)
                pr_warning("%s: failed to remove direct window property: %d\n",
                        np->full_name, ret);
}

static u64 find_existing_ddw(struct device_node *pdn)
{
        struct direct_window *window;
        const struct dynamic_dma_window_prop *direct64;
        u64 dma_addr = 0;

        spin_lock(&direct_window_list_lock);
        /* check if we already created a window and dupe that config if so */
        list_for_each_entry(window, &direct_window_list, list) {
                if (window->device == pdn) {
                        direct64 = window->prop;
                        dma_addr = be64_to_cpu(direct64->dma_base);
                        break;
                }
        }
        spin_unlock(&direct_window_list_lock);

        return dma_addr;
}

static int find_existing_ddw_windows(void)
{
        int len;
        struct device_node *pdn;
        struct direct_window *window;
        const struct dynamic_dma_window_prop *direct64;

        if (!firmware_has_feature(FW_FEATURE_LPAR))
                return 0;

        for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
                direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
                if (!direct64)
                        continue;

                window = kzalloc(sizeof(*window), GFP_KERNEL);
                if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
                        kfree(window);
                        remove_ddw(pdn, true);
                        continue;
                }

                window->device = pdn;
                window->prop = direct64;
                spin_lock(&direct_window_list_lock);
                list_add(&window->list, &direct_window_list);
                spin_unlock(&direct_window_list_lock);
        }

        return 0;
}
machine_arch_initcall(pseries, find_existing_ddw_windows);

static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
                        struct ddw_query_response *query)
{
        struct eeh_dev *edev;
        u32 cfg_addr;
        u64 buid;
        int ret;

        /*
         * Get the config address and phb buid of the PE window.
         * Rely on eeh to retrieve this for us.
         * Retrieve them from the pci device, not the node with the
         * dma-window property
         */
        edev = pci_dev_to_eeh_dev(dev);
        cfg_addr = edev->config_addr;
        if (edev->pe_config_addr)
                cfg_addr = edev->pe_config_addr;
        buid = edev->phb->buid;

        ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query,
                  cfg_addr, BUID_HI(buid), BUID_LO(buid));
        dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
                " returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid),
                BUID_LO(buid), ret);
        return ret;
}

static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
                        struct ddw_create_response *create, int page_shift,
                        int window_shift)
{
        struct eeh_dev *edev;
        u32 cfg_addr;
        u64 buid;
        int ret;

        /*
         * Get the config address and phb buid of the PE window.
         * Rely on eeh to retrieve this for us.
         * Retrieve them from the pci device, not the node with the
         * dma-window property
         */
        edev = pci_dev_to_eeh_dev(dev);
        cfg_addr = edev->config_addr;
        if (edev->pe_config_addr)
                cfg_addr = edev->pe_config_addr;
        buid = edev->phb->buid;

        do {
                /* extra outputs are LIOBN and dma-addr (hi, lo) */
                ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create,
                                cfg_addr, BUID_HI(buid), BUID_LO(buid),
                                page_shift, window_shift);
        } while (rtas_busy_delay(ret));
        dev_info(&dev->dev,
                "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
                "(liobn = 0x%x starting addr = %x %x)\n", ddw_avail[1],
                 cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift,
                 window_shift, ret, create->liobn, create->addr_hi, create->addr_lo);

        return ret;
}

struct failed_ddw_pdn {
        struct device_node *pdn;
        struct list_head list;
};

static LIST_HEAD(failed_ddw_pdn_list);

/*
 * If the PE supports dynamic dma windows, and there is space for a table
 * that can map all pages in a linear offset, then setup such a table,
 * and record the dma-offset in the struct device.
 *
 * dev: the pci device we are checking
 * pdn: the parent pe node with the ibm,dma-window property
 * Future: also check if we can remap the base window for our base page size
 *
 * returns the dma offset for use by dma_set_mask
 */
static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
{
        int len, ret;
        struct ddw_query_response query;
        struct ddw_create_response create;
        int page_shift;
        u64 dma_addr, max_addr;
        struct device_node *dn;
        u32 ddw_avail[3];
        struct direct_window *window;
        struct property *win64;
        struct dynamic_dma_window_prop *ddwprop;
        struct failed_ddw_pdn *fpdn;

        mutex_lock(&direct_window_init_mutex);

        dma_addr = find_existing_ddw(pdn);
        if (dma_addr != 0)
                goto out_unlock;

        /*
         * If we already went through this for a previous function of
         * the same device and failed, we don't want to muck with the
         * DMA window again, as it will race with in-flight operations
         * and can lead to EEHs. The above mutex protects access to the
         * list.
         */
        list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {
                if (!strcmp(fpdn->pdn->full_name, pdn->full_name))
                        goto out_unlock;
        }

        /*
         * the ibm,ddw-applicable property holds the tokens for:
         * ibm,query-pe-dma-window
         * ibm,create-pe-dma-window
         * ibm,remove-pe-dma-window
         * for the given node in that order.
         * the property is actually in the parent, not the PE
         */
        ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
                                         &ddw_avail[0], 3);
        if (ret)
                goto out_failed;

        /*
         * Query if there is a second window of size to map the
         * whole partition.  Query returns number of windows, largest
         * block assigned to PE (partition endpoint), and two bitmasks
         * of page sizes: supported and supported for migrate-dma.
         */
        dn = pci_device_to_OF_node(dev);
        ret = query_ddw(dev, ddw_avail, &query);
        if (ret != 0)
                goto out_failed;

        if (query.windows_available == 0) {
                /*
                 * no additional windows are available for this device.
                 * We might be able to reallocate the existing window,
                 * trading in for a larger page size.
                 */
                dev_dbg(&dev->dev, "no free dynamic windows");
                goto out_failed;
        }
        if (query.page_size & 4) {
                page_shift = 24; /* 16MB */
        } else if (query.page_size & 2) {
                page_shift = 16; /* 64kB */
        } else if (query.page_size & 1) {
                page_shift = 12; /* 4kB */
        } else {
                dev_dbg(&dev->dev, "no supported direct page size in mask %x",
                          query.page_size);
                goto out_failed;
        }
        /* verify the window * number of ptes will map the partition */
        /* check largest block * page size > max memory hotplug addr */
        max_addr = memory_hotplug_max();
        if (query.largest_available_block < (max_addr >> page_shift)) {
                dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u "
                          "%llu-sized pages\n", max_addr, query.largest_available_block,
                          1ULL << page_shift);
                goto out_failed;
        }
        len = order_base_2(max_addr);
        win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
        if (!win64) {
                dev_info(&dev->dev,
                        "couldn't allocate property for 64bit dma window\n");
                goto out_failed;
        }
        win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
        win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
        win64->length = sizeof(*ddwprop);
        if (!win64->name || !win64->value) {
                dev_info(&dev->dev,
                        "couldn't allocate property name and value\n");
                goto out_free_prop;
        }

        ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
        if (ret != 0)
                goto out_free_prop;

        ddwprop->liobn = cpu_to_be32(create.liobn);
        ddwprop->dma_base = cpu_to_be64(((u64)create.addr_hi << 32) |
                        create.addr_lo);
        ddwprop->tce_shift = cpu_to_be32(page_shift);
        ddwprop->window_shift = cpu_to_be32(len);

        dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %s\n",
                  create.liobn, dn->full_name);

        window = kzalloc(sizeof(*window), GFP_KERNEL);
        if (!window)
                goto out_clear_window;

        ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
                        win64->value, tce_setrange_multi_pSeriesLP_walk);
        if (ret) {
                dev_info(&dev->dev, "failed to map direct window for %s: %d\n",
                         dn->full_name, ret);
                goto out_free_window;
        }

        ret = of_add_property(pdn, win64);
        if (ret) {
                dev_err(&dev->dev, "unable to add dma window property for %s: %d",
                         pdn->full_name, ret);
                goto out_free_window;
        }

        window->device = pdn;
        window->prop = ddwprop;
        spin_lock(&direct_window_list_lock);
        list_add(&window->list, &direct_window_list);
        spin_unlock(&direct_window_list_lock);

        dma_addr = be64_to_cpu(ddwprop->dma_base);
        goto out_unlock;

out_free_window:
        kfree(window);

out_clear_window:
        remove_ddw(pdn, true);

out_free_prop:
        kfree(win64->name);
        kfree(win64->value);
        kfree(win64);

out_failed:

        fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
        if (!fpdn)
                goto out_unlock;
        fpdn->pdn = pdn;
        list_add(&fpdn->list, &failed_ddw_pdn_list);

out_unlock:
        mutex_unlock(&direct_window_init_mutex);
        return dma_addr;
}

static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
{
        struct device_node *pdn, *dn;
        struct iommu_table *tbl;
        const __be32 *dma_window = NULL;
        struct pci_dn *pci;

        pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));

        /* dev setup for LPAR is a little tricky, since the device tree might
         * contain the dma-window properties per-device and not necessarily
         * for the bus. So we need to search upwards in the tree until we
         * either hit a dma-window property, OR find a parent with a table
         * already allocated.
         */
        dn = pci_device_to_OF_node(dev);
        pr_debug("  node is %s\n", dn->full_name);

        for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
             pdn = pdn->parent) {
                dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
                if (dma_window)
                        break;
        }

        if (!pdn || !PCI_DN(pdn)) {
                printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
                       "no DMA window found for pci dev=%s dn=%s\n",
                                 pci_name(dev), of_node_full_name(dn));
                return;
        }
        pr_debug("  parent is %s\n", pdn->full_name);

        pci = PCI_DN(pdn);
        if (!pci->table_group) {
                pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
                tbl = pci->table_group->tables[0];
                iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
                tbl->it_ops = &iommu_table_lpar_multi_ops;
                iommu_init_table(tbl, pci->phb->node);
                iommu_register_group(pci->table_group,
                                pci_domain_nr(pci->phb->bus), 0);
                pr_debug("  created table: %p\n", pci->table_group);
        } else {
                pr_debug("  found DMA window, table: %p\n", pci->table_group);
        }

        set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
        iommu_add_device(&dev->dev);
}

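/*
 * If a 64-bit DMA mask is requested, try to switch the device to a
 * dynamic direct window and the direct DMA ops; otherwise fall back
 * to 32-bit DMA through the IOMMU.
 */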
static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
{
        bool ddw_enabled = false;
        struct device_node *pdn, *dn;
        struct pci_dev *pdev;
        const __be32 *dma_window = NULL;
        u64 dma_offset;

        if (!dev->dma_mask)
                return -EIO;

        if (!dev_is_pci(dev))
                goto check_mask;

        pdev = to_pci_dev(dev);

        /* only attempt to use a new window if 64-bit DMA is requested */
        if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) {
                dn = pci_device_to_OF_node(pdev);
                dev_dbg(dev, "node is %s\n", dn->full_name);

                /*
                 * the device tree might contain the dma-window properties
                 * per-device and not necessarily for the bus. So we need to
                 * search upwards in the tree until we either hit a dma-window
                 * property, OR find a parent with a table already allocated.
                 */
                for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
                                pdn = pdn->parent) {
                        dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
                        if (dma_window)
                                break;
                }
                if (pdn && PCI_DN(pdn)) {
                        dma_offset = enable_ddw(pdev, pdn);
                        if (dma_offset != 0) {
                                dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset);
                                set_dma_offset(dev, dma_offset);
                                set_dma_ops(dev, &dma_direct_ops);
                                ddw_enabled = true;
                        }
                }
        }

        /* fall back on iommu ops */
        if (!ddw_enabled && get_dma_ops(dev) != &dma_iommu_ops) {
                dev_info(dev, "Restoring 32-bit DMA via iommu\n");
                set_dma_ops(dev, &dma_iommu_ops);
        }

check_mask:
        if (!dma_supported(dev, dma_mask))
                return -EIO;

        *dev->dma_mask = dma_mask;
        return 0;
}

static u64 dma_get_required_mask_pSeriesLP(struct device *dev)
{
        if (!dev->dma_mask)
                return 0;

        if (!disable_ddw && dev_is_pci(dev)) {
                struct pci_dev *pdev = to_pci_dev(dev);
                struct device_node *dn;

                dn = pci_device_to_OF_node(pdev);

                /* search upwards for ibm,dma-window */
                for (; dn && PCI_DN(dn) && !PCI_DN(dn)->table_group;
                                dn = dn->parent)
                        if (of_get_property(dn, "ibm,dma-window", NULL))
                                break;
                /* if there is an ibm,ddw-applicable property require 64 bits */
                if (dn && PCI_DN(dn) &&
                                of_get_property(dn, "ibm,ddw-applicable", NULL))
                        return DMA_BIT_MASK(64);
        }

        return dma_iommu_ops.get_required_mask(dev);
}

#else  /* CONFIG_PCI */
#define pci_dma_bus_setup_pSeries       NULL
#define pci_dma_dev_setup_pSeries       NULL
#define pci_dma_bus_setup_pSeriesLP     NULL
#define pci_dma_dev_setup_pSeriesLP     NULL
#define dma_set_mask_pSeriesLP          NULL
#define dma_get_required_mask_pSeriesLP NULL
#endif /* CONFIG_PCI */

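/*
 * Memory hotplug notifier: keep every registered direct window in
 * sync by mapping newly onlined ranges and clearing ranges that go
 * offline.
 */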
static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
                void *data)
{
        struct direct_window *window;
        struct memory_notify *arg = data;
        int ret = 0;

        switch (action) {
        case MEM_GOING_ONLINE:
                spin_lock(&direct_window_list_lock);
                list_for_each_entry(window, &direct_window_list, list) {
                        ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
                                        arg->nr_pages, window->prop);
                        /* XXX log error */
                }
                spin_unlock(&direct_window_list_lock);
                break;
        case MEM_CANCEL_ONLINE:
        case MEM_OFFLINE:
                spin_lock(&direct_window_list_lock);
                list_for_each_entry(window, &direct_window_list, list) {
                        ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
                                        arg->nr_pages, window->prop);
                        /* XXX log error */
                }
                spin_unlock(&direct_window_list_lock);
                break;
        default:
                break;
        }
        if (ret && action != MEM_CANCEL_ONLINE)
                return NOTIFY_BAD;

        return NOTIFY_OK;
}

static struct notifier_block iommu_mem_nb = {
        .notifier_call = iommu_mem_notifier,
};

static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *data)
{
        int err = NOTIFY_OK;
        struct of_reconfig_data *rd = data;
        struct device_node *np = rd->dn;
        struct pci_dn *pci = PCI_DN(np);
        struct direct_window *window;

        switch (action) {
        case OF_RECONFIG_DETACH_NODE:
                /*
                 * Removing the property will invoke the reconfig
                 * notifier again, which causes dead-lock on the
                 * read-write semaphore of the notifier chain. So
                 * we have to remove the property when releasing
                 * the device node.
                 */
                remove_ddw(np, false);
                if (pci && pci->table_group)
                        iommu_pseries_free_group(pci->table_group,
                                        np->full_name);

                spin_lock(&direct_window_list_lock);
                list_for_each_entry(window, &direct_window_list, list) {
                        if (window->device == np) {
                                list_del(&window->list);
                                kfree(window);
                                break;
                        }
                }
                spin_unlock(&direct_window_list_lock);
                break;
        default:
                err = NOTIFY_DONE;
                break;
        }
        return err;
}

static struct notifier_block iommu_reconfig_nb = {
        .notifier_call = iommu_reconfig_notifier,
};

/* These are called very early. */
void iommu_init_early_pSeries(void)
{
        if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL))
                return;

        if (firmware_has_feature(FW_FEATURE_LPAR)) {
                pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
                pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
                ppc_md.dma_set_mask = dma_set_mask_pSeriesLP;
                ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP;
        } else {
                pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries;
                pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries;
        }

        of_reconfig_notifier_register(&iommu_reconfig_nb);
        register_memory_notifier(&iommu_mem_nb);

        set_pci_dma_ops(&dma_iommu_ops);
}

static int __init disable_multitce(char *str)
{
        if (strcmp(str, "off") == 0 &&
            firmware_has_feature(FW_FEATURE_LPAR) &&
            firmware_has_feature(FW_FEATURE_MULTITCE)) {
                printk(KERN_INFO "Disabling MULTITCE firmware feature\n");
                powerpc_firmware_features &= ~FW_FEATURE_MULTITCE;
        }
        return 1;
}

__setup("multitce=", disable_multitce);

machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init);