From: David Woodhouse Date: Wed, 9 Sep 2015 10:40:47 +0000 (+0100) Subject: iommu/vt-d: Add basic SVM PASID support X-Git-Tag: firefly_0821_release~176^2~782^2~17 X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=2f26e0a9c9860db290d63e9d85c2c8c09813677f;p=firefly-linux-kernel-4.4.55.git iommu/vt-d: Add basic SVM PASID support This provides basic PASID support for endpoint devices, tested with a version of the i915 driver. Signed-off-by: David Woodhouse --- diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 8d23f5ed8ae2..be18b214c31e 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -139,6 +139,7 @@ config INTEL_IOMMU_SVM bool "Support for Shared Virtual Memory with Intel IOMMU" depends on INTEL_IOMMU && X86 select PCI_PASID + select MMU_NOTIFIER help Shared Virtual Memory (SVM) provides a facility for devices to access DMA resources through process address space by diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 9995ea84e23a..60b66d27e655 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4929,6 +4929,110 @@ static void intel_iommu_remove_device(struct device *dev) iommu_device_unlink(iommu->iommu_dev, dev); } +#ifdef CONFIG_INTEL_IOMMU_SVM +int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev) +{ + struct device_domain_info *info; + struct context_entry *context; + struct dmar_domain *domain; + unsigned long flags; + u64 ctx_lo; + int ret; + + domain = get_valid_domain_for_dev(sdev->dev); + if (!domain) + return -EINVAL; + + spin_lock_irqsave(&device_domain_lock, flags); + spin_lock(&iommu->lock); + + ret = -EINVAL; + info = sdev->dev->archdata.iommu; + if (!info || !info->pasid_supported) + goto out; + + context = iommu_context_addr(iommu, info->bus, info->devfn, 0); + if (WARN_ON(!context)) + goto out; + + ctx_lo = context[0].lo; + + sdev->did = domain->iommu_did[iommu->seq_id]; + sdev->sid = PCI_DEVID(info->bus, info->devfn); + + if (!(ctx_lo & CONTEXT_PASIDE)) { + context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table); + context[1].lo = (u64)virt_to_phys(iommu->pasid_table) | ecap_pss(iommu->ecap); + wmb(); + /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both + * extended to permit requests-with-PASID if the PASIDE bit + * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH, + * however, the PASIDE bit is ignored and requests-with-PASID + * are unconditionally blocked. Which makes less sense. + * So convert from CONTEXT_TT_PASS_THROUGH to one of the new + * "guest mode" translation types depending on whether ATS + * is available or not. Annoyingly, we can't use the new + * modes *unless* PASIDE is set. */ + if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) { + ctx_lo &= ~CONTEXT_TT_MASK; + if (info->ats_supported) + ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2; + else + ctx_lo |= CONTEXT_TT_PT_PASID << 2; + } + ctx_lo |= CONTEXT_PASIDE; + context[0].lo = ctx_lo; + wmb(); + iommu->flush.flush_context(iommu, sdev->did, sdev->sid, + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + } + + /* Enable PASID support in the device, if it wasn't already */ + if (!info->pasid_enabled) + iommu_enable_dev_iotlb(info); + + if (info->ats_enabled) { + sdev->dev_iotlb = 1; + sdev->qdep = info->ats_qdep; + if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS) + sdev->qdep = 0; + } + ret = 0; + + out: + spin_unlock(&iommu->lock); + spin_unlock_irqrestore(&device_domain_lock, flags); + + return ret; +} + +struct intel_iommu *intel_svm_device_to_iommu(struct device *dev) +{ + struct intel_iommu *iommu; + u8 bus, devfn; + + if (iommu_dummy(dev)) { + dev_warn(dev, + "No IOMMU translation for device; cannot enable SVM\n"); + return NULL; + } + + iommu = device_to_iommu(dev, &bus, &devfn); + if ((!iommu)) { + dev_dbg(dev, "No IOMMU for device; cannot enable SVM\n"); + return NULL; + } + + if (!iommu->pasid_table) { + dev_dbg(dev, "PASID not enabled on IOMMU; cannot enable SVM\n"); + return NULL; + } + + return iommu; +} +#endif /* CONFIG_INTEL_IOMMU_SVM */ + static const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .domain_alloc = intel_iommu_domain_alloc, diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 5b42a95b3f80..82d53e15b865 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -14,6 +14,17 @@ */ #include +#include +#include +#include +#include +#include +#include +#include + +struct pasid_entry { + u64 val; +}; int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu) { @@ -42,6 +53,8 @@ int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu) iommu->name); } + idr_init(&iommu->pasid_idr); + return 0; } @@ -61,5 +74,283 @@ int intel_svm_free_pasid_tables(struct intel_iommu *iommu) free_pages((unsigned long)iommu->pasid_state_table, order); iommu->pasid_state_table = NULL; } + idr_destroy(&iommu->pasid_idr); return 0; } + +static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_dev *sdev, + unsigned long address, int pages, int ih) +{ + struct qi_desc desc; + int mask = ilog2(__roundup_pow_of_two(pages)); + + if (pages == -1 || !cap_pgsel_inv(svm->iommu->cap) || + mask > cap_max_amask_val(svm->iommu->cap)) { + desc.low = QI_EIOTLB_PASID(svm->pasid) | QI_EIOTLB_DID(sdev->did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | QI_EIOTLB_TYPE; + desc.high = 0; + } else { + desc.low = QI_EIOTLB_PASID(svm->pasid) | QI_EIOTLB_DID(sdev->did) | + QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | QI_EIOTLB_TYPE; + desc.high = QI_EIOTLB_ADDR(address) | QI_EIOTLB_GL(1) | + QI_EIOTLB_IH(ih) | QI_EIOTLB_AM(mask); + } + + qi_submit_sync(&desc, svm->iommu); + + if (sdev->dev_iotlb) { + desc.low = QI_DEV_EIOTLB_PASID(svm->pasid) | QI_DEV_EIOTLB_SID(sdev->sid) | + QI_DEV_EIOTLB_QDEP(sdev->qdep) | QI_DEIOTLB_TYPE; + if (mask) { + unsigned long adr, delta; + + /* Least significant zero bits in the address indicate the + * range of the request. So mask them out according to the + * size. */ + adr = address & ((1<<(VTD_PAGE_SHIFT + mask)) - 1); + + /* Now ensure that we round down further if the original + * request was not aligned w.r.t. its size */ + delta = address - adr; + if (delta + (pages << VTD_PAGE_SHIFT) >= (1 << (VTD_PAGE_SHIFT + mask))) + adr &= ~(1 << (VTD_PAGE_SHIFT + mask)); + desc.high = QI_DEV_EIOTLB_ADDR(adr) | QI_DEV_EIOTLB_SIZE; + } else { + desc.high = QI_DEV_EIOTLB_ADDR(address); + } + qi_submit_sync(&desc, svm->iommu); + } +} + +static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address, + int pages, int ih) +{ + struct intel_svm_dev *sdev; + + rcu_read_lock(); + list_for_each_entry_rcu(sdev, &svm->devs, list) + intel_flush_svm_range_dev(svm, sdev, address, pages, ih); + rcu_read_unlock(); +} + +static void intel_change_pte(struct mmu_notifier *mn, struct mm_struct *mm, + unsigned long address, pte_t pte) +{ + struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); + + intel_flush_svm_range(svm, address, 1, 1); +} + +static void intel_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm, + unsigned long address) +{ + struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); + + intel_flush_svm_range(svm, address, 1, 1); +} + +/* Pages have been freed at this point */ +static void intel_invalidate_range(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); + + intel_flush_svm_range(svm, start, + (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT , 0); +} + + +static void intel_flush_pasid_dev(struct intel_svm *svm, struct intel_svm_dev *sdev) +{ + struct qi_desc desc; + + desc.high = 0; + desc.low = QI_PC_TYPE | QI_PC_DID(sdev->did) | QI_PC_PASID_SEL | QI_PC_PASID(svm->pasid); + + qi_submit_sync(&desc, svm->iommu); +} + +static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) +{ + struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); + + svm->iommu->pasid_table[svm->pasid].val = 0; + + /* There's no need to do any flush because we can't get here if there + * are any devices left anyway. */ + WARN_ON(!list_empty(&svm->devs)); +} + +static const struct mmu_notifier_ops intel_mmuops = { + .release = intel_mm_release, + .change_pte = intel_change_pte, + .invalidate_page = intel_invalidate_page, + .invalidate_range = intel_invalidate_range, +}; + +static DEFINE_MUTEX(pasid_mutex); + +int intel_svm_bind_mm(struct device *dev, int *pasid) +{ + struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); + struct intel_svm_dev *sdev; + struct intel_svm *svm = NULL; + int pasid_max; + int ret; + + BUG_ON(pasid && !current->mm); + + if (WARN_ON(!iommu)) + return -EINVAL; + + if (dev_is_pci(dev)) { + pasid_max = pci_max_pasids(to_pci_dev(dev)); + if (pasid_max < 0) + return -EINVAL; + } else + pasid_max = 1 << 20; + + mutex_lock(&pasid_mutex); + if (pasid) { + int i; + + idr_for_each_entry(&iommu->pasid_idr, svm, i) { + if (svm->mm != current->mm) + continue; + + if (svm->pasid >= pasid_max) { + dev_warn(dev, + "Limited PASID width. Cannot use existing PASID %d\n", + svm->pasid); + ret = -ENOSPC; + goto out; + } + + list_for_each_entry(sdev, &svm->devs, list) { + if (dev == sdev->dev) { + sdev->users++; + goto success; + } + } + + break; + } + } + + sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); + if (!sdev) { + ret = -ENOMEM; + goto out; + } + sdev->dev = dev; + + ret = intel_iommu_enable_pasid(iommu, sdev); + if (ret || !pasid) { + /* If they don't actually want to assign a PASID, this is + * just an enabling check/preparation. */ + kfree(sdev); + goto out; + } + /* Finish the setup now we know we're keeping it */ + sdev->users = 1; + init_rcu_head(&sdev->rcu); + + if (!svm) { + svm = kzalloc(sizeof(*svm), GFP_KERNEL); + if (!svm) { + ret = -ENOMEM; + kfree(sdev); + goto out; + } + svm->iommu = iommu; + + if (pasid_max > 2 << ecap_pss(iommu->ecap)) + pasid_max = 2 << ecap_pss(iommu->ecap); + + ret = idr_alloc(&iommu->pasid_idr, svm, 0, pasid_max - 1, + GFP_KERNEL); + if (ret < 0) { + kfree(svm); + goto out; + } + svm->pasid = ret; + svm->notifier.ops = &intel_mmuops; + svm->mm = get_task_mm(current); + INIT_LIST_HEAD_RCU(&svm->devs); + ret = -ENOMEM; + if (!svm->mm || (ret = mmu_notifier_register(&svm->notifier, svm->mm))) { + idr_remove(&svm->iommu->pasid_idr, svm->pasid); + kfree(svm); + kfree(sdev); + goto out; + } + iommu->pasid_table[svm->pasid].val = (u64)__pa(svm->mm->pgd) | 1; + wmb(); + } + list_add_rcu(&sdev->list, &svm->devs); + + success: + *pasid = svm->pasid; + ret = 0; + out: + mutex_unlock(&pasid_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(intel_svm_bind_mm); + +int intel_svm_unbind_mm(struct device *dev, int pasid) +{ + struct intel_svm_dev *sdev; + struct intel_iommu *iommu; + struct intel_svm *svm; + int ret = -EINVAL; + + mutex_lock(&pasid_mutex); + iommu = intel_svm_device_to_iommu(dev); + if (!iommu || !iommu->pasid_table) + goto out; + + svm = idr_find(&iommu->pasid_idr, pasid); + if (!svm) + goto out; + + list_for_each_entry(sdev, &svm->devs, list) { + if (dev == sdev->dev) { + ret = 0; + sdev->users--; + if (!sdev->users) { + list_del_rcu(&sdev->list); + /* Flush the PASID cache and IOTLB for this device. + * Note that we do depend on the hardware *not* using + * the PASID any more. Just as we depend on other + * devices never using PASIDs that they have no right + * to use. We have a *shared* PASID table, because it's + * large and has to be physically contiguous. So it's + * hard to be as defensive as we might like. */ + intel_flush_pasid_dev(svm, sdev); + intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); + kfree_rcu(sdev, rcu); + + if (list_empty(&svm->devs)) { + mmu_notifier_unregister(&svm->notifier, svm->mm); + + idr_remove(&svm->iommu->pasid_idr, svm->pasid); + mmput(svm->mm); + /* We mandate that no page faults may be outstanding + * for the PASID when intel_svm_unbind_mm() is called. + * If that is not obeyed, subtle errors will happen. + * Let's make them less subtle... */ + memset(svm, 0x6b, sizeof(*svm)); + kfree(svm); + } + } + break; + } + } + out: + mutex_unlock(&pasid_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(intel_svm_unbind_mm); diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index 7ac17f57250e..0e114bfabeed 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -20,6 +20,13 @@ #define CONTEXT_TT_MULTI_LEVEL 0 #define CONTEXT_TT_DEV_IOTLB 1 #define CONTEXT_TT_PASS_THROUGH 2 +/* Extended context entry types */ +#define CONTEXT_TT_PT_PASID 4 +#define CONTEXT_TT_PT_PASID_DEV_IOTLB 5 +#define CONTEXT_TT_MASK (7ULL << 2) + +#define CONTEXT_PRS (1ULL << 9) +#define CONTEXT_PASIDE (1ULL << 11) struct intel_iommu; struct dmar_domain; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 0f38e60d40ad..46add607567b 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -1,5 +1,9 @@ /* - * Copyright (c) 2006, Intel Corporation. + * Copyright © 2006-2015, Intel Corporation. + * + * Authors: Ashok Raj + * Anil S Keshavamurthy + * David Woodhouse * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -13,10 +17,6 @@ * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. - * - * Copyright (C) 2006-2008 Intel Corporation - * Author: Ashok Raj - * Author: Anil S Keshavamurthy */ #ifndef _INTEL_IOMMU_H_ @@ -25,7 +25,10 @@ #include #include #include +#include #include +#include +#include #include #include @@ -251,6 +254,9 @@ enum { #define QI_DIOTLB_TYPE 0x3 #define QI_IEC_TYPE 0x4 #define QI_IWD_TYPE 0x5 +#define QI_EIOTLB_TYPE 0x6 +#define QI_PC_TYPE 0x7 +#define QI_DEIOTLB_TYPE 0x8 #define QI_IEC_SELECTIVE (((u64)1) << 4) #define QI_IEC_IIDEX(idx) (((u64)(idx & 0xffff) << 32)) @@ -278,6 +284,34 @@ enum { #define QI_DEV_IOTLB_SIZE 1 #define QI_DEV_IOTLB_MAX_INVS 32 +#define QI_PC_PASID(pasid) (((u64)pasid) << 32) +#define QI_PC_DID(did) (((u64)did) << 16) +#define QI_PC_GRAN(gran) (((u64)gran) << 4) + +#define QI_PC_ALL_PASIDS (QI_PC_TYPE | QI_PC_GRAN(0)) +#define QI_PC_PASID_SEL (QI_PC_TYPE | QI_PC_GRAN(1)) + +#define QI_EIOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) +#define QI_EIOTLB_GL(gl) (((u64)gl) << 7) +#define QI_EIOTLB_IH(ih) (((u64)ih) << 6) +#define QI_EIOTLB_AM(am) (((u64)am)) +#define QI_EIOTLB_PASID(pasid) (((u64)pasid) << 32) +#define QI_EIOTLB_DID(did) (((u64)did) << 16) +#define QI_EIOTLB_GRAN(gran) (((u64)gran) << 4) + +#define QI_DEV_EIOTLB_ADDR(a) ((u64)(a) & VTD_PAGE_MASK) +#define QI_DEV_EIOTLB_SIZE (((u64)1) << 11) +#define QI_DEV_EIOTLB_GLOB(g) ((u64)g) +#define QI_DEV_EIOTLB_PASID(p) (((u64)p) << 32) +#define QI_DEV_EIOTLB_SID(sid) ((u64)((sid) & 0xffff) << 32) +#define QI_DEV_EIOTLB_QDEP(qd) (((qd) & 0x1f) << 16) +#define QI_DEV_EIOTLB_MAX_INVS 32 + +#define QI_GRAN_ALL_ALL 0 +#define QI_GRAN_NONG_ALL 1 +#define QI_GRAN_NONG_PASID 2 +#define QI_GRAN_PSI_PASID 3 + struct qi_desc { u64 low, high; }; @@ -359,6 +393,7 @@ struct intel_iommu { * told to. But while it's all driver-arbitrated, we're fine. */ struct pasid_entry *pasid_table; struct pasid_state_entry *pasid_state_table; + struct idr pasid_idr; #endif struct q_inval *qi; /* Queued invalidation info */ u32 *iommu_state; /* Store iommu states between suspend and resume.*/ @@ -399,9 +434,32 @@ extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); extern int dmar_ir_support(void); +#ifdef CONFIG_INTEL_IOMMU_SVM extern int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu); extern int intel_svm_free_pasid_tables(struct intel_iommu *iommu); +struct intel_svm_dev { + struct list_head list; + struct rcu_head rcu; + struct device *dev; + int users; + u16 did; + u16 dev_iotlb:1; + u16 sid, qdep; +}; + +struct intel_svm { + struct mmu_notifier notifier; + struct mm_struct *mm; + struct intel_iommu *iommu; + int pasid; + struct list_head devs; +}; + +extern int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev); +extern struct intel_iommu *intel_svm_device_to_iommu(struct device *dev); +#endif + extern const struct attribute_group *intel_iommu_groups[]; #endif diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h new file mode 100644 index 000000000000..42f80ad7a7a0 --- /dev/null +++ b/include/linux/intel-svm.h @@ -0,0 +1,82 @@ +/* + * Copyright © 2015 Intel Corporation. + * + * Authors: David Woodhouse + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef __INTEL_SVM_H__ +#define __INTEL_SVM_H__ + +#ifdef CONFIG_INTEL_IOMMU_SVM + +struct device; + +/** + * intel_svm_bind_mm() - Bind the current process to a PASID + * @dev: Device to be granted acccess + * @pasid: Address for allocated PASID + * + * This function attempts to enable PASID support for the given device. + * If the @pasid argument is non-%NULL, a PASID is allocated for access + * to the MM of the current process. + * + * By using a %NULL value for the @pasid argument, this function can + * be used to simply validate that PASID support is available for the + * given device — i.e. that it is behind an IOMMU which has the + * requisite support, and is enabled. + * + * Page faults are handled transparently by the IOMMU code, and there + * should be no need for the device driver to be involved. If a page + * fault cannot be handled (i.e. is an invalid address rather than + * just needs paging in), then the page request will be completed by + * the core IOMMU code with appropriate status, and the device itself + * can then report the resulting fault to its driver via whatever + * mechanism is appropriate. + * + * Multiple calls from the same process may result in the same PASID + * being re-used. A reference count is kept. + */ +extern int intel_svm_bind_mm(struct device *dev, int *pasid); + +/** + * intel_svm_unbind_mm() - Unbind a specified PASID + * @dev: Device for which PASID was allocated + * @pasid: PASID value to be unbound + * + * This function allows a PASID to be retired when the device no + * longer requires access to the address space of a given process. + * + * If the use count for the PASID in question reaches zero, the + * PASID is revoked and may no longer be used by hardware. + * + * Device drivers are required to ensure that no access (including + * page requests) is currently outstanding for the PASID in question, + * before calling this function. + */ +extern int intel_svm_unbind_mm(struct device *dev, int pasid); + +#else /* CONFIG_INTEL_IOMMU_SVM */ + +static inline int intel_svm_bind_mm(struct device *dev, int *pasid) +{ + return -ENOSYS; +} + +static inline int intel_svm_unbind_mm(struct device *dev, int pasid) +{ + BUG(); +} +#endif /* CONFIG_INTEL_IOMMU_SVM */ + +#define intel_svm_available(dev) (!intel_svm_bind_mm((dev), NULL)) + +#endif /* __INTEL_SVM_H__ */