misc: mic: SCIF remote memory map/unmap interface
authorSudeep Dutt <sudeep.dutt@intel.com>
Wed, 30 Sep 2015 01:15:55 +0000 (18:15 -0700)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 4 Oct 2015 11:54:54 +0000 (12:54 +0100)
This patch implements the SCIF mmap/munmap interface. A similar
capability is provided to kernel clients via the
scif_get_pages()/scif_put_pages() APIs. The SCIF mmap interface
queries to check if a window is valid and then remaps the local
virtual address to the remote physical pages. These mappings are
subsequently destroyed upon receipt of the VMA close operation or
scif_get_pages().  This functionality allows SCIF users to directly
access remote memory without any driver interaction once the mappings
are created thereby providing bare-metal PCIe latency. These mappings
are zapped to avoid RMA accesses from user space, if a Coprocessor is
reset.

Reviewed-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Reviewed-by: Nikhil Rao <nikhil.rao@intel.com>
Signed-off-by: Sudeep Dutt <sudeep.dutt@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/misc/mic/scif/scif_mmap.c [new file with mode: 0644]

diff --git a/drivers/misc/mic/scif/scif_mmap.c b/drivers/misc/mic/scif/scif_mmap.c
new file mode 100644 (file)
index 0000000..49cb8f7
--- /dev/null
@@ -0,0 +1,699 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Intel SCIF driver.
+ *
+ */
+#include "scif_main.h"
+
+/*
+ * struct scif_vma_info - Information about a remote memory mapping
+ *                       created via scif_mmap(..)
+ * @vma: VM area struct
+ * @list: link to list of active vmas
+ */
+struct scif_vma_info {
+       struct vm_area_struct *vma;
+       struct list_head list;
+};
+
+void scif_recv_munmap(struct scif_dev *scifdev, struct scifmsg *msg)
+{
+       struct scif_rma_req req;
+       struct scif_window *window = NULL;
+       struct scif_window *recv_window =
+               (struct scif_window *)msg->payload[0];
+       struct scif_endpt *ep;
+
+       ep = (struct scif_endpt *)recv_window->ep;
+       req.out_window = &window;
+       req.offset = recv_window->offset;
+       req.prot = recv_window->prot;
+       req.nr_bytes = recv_window->nr_pages << PAGE_SHIFT;
+       req.type = SCIF_WINDOW_FULL;
+       req.head = &ep->rma_info.reg_list;
+       msg->payload[0] = ep->remote_ep;
+
+       mutex_lock(&ep->rma_info.rma_lock);
+       /* Does a valid window exist? */
+       if (scif_query_window(&req)) {
+               dev_err(&scifdev->sdev->dev,
+                       "%s %d -ENXIO\n", __func__, __LINE__);
+               msg->uop = SCIF_UNREGISTER_ACK;
+               goto error;
+       }
+
+       scif_put_window(window, window->nr_pages);
+
+       if (!window->ref_count) {
+               atomic_inc(&ep->rma_info.tw_refcount);
+               ep->rma_info.async_list_del = 1;
+               list_del_init(&window->list);
+               scif_free_window_offset(ep, window, window->offset);
+       }
+error:
+       mutex_unlock(&ep->rma_info.rma_lock);
+       if (window && !window->ref_count)
+               scif_queue_for_cleanup(window, &scif_info.rma);
+}
+
+/*
+ * Remove valid remote memory mappings created via scif_mmap(..) from the
+ * process address space since the remote node is lost
+ */
+static void __scif_zap_mmaps(struct scif_endpt *ep)
+{
+       struct list_head *item;
+       struct scif_vma_info *info;
+       struct vm_area_struct *vma;
+       unsigned long size;
+
+       spin_lock(&ep->lock);
+       list_for_each(item, &ep->rma_info.vma_list) {
+               info = list_entry(item, struct scif_vma_info, list);
+               vma = info->vma;
+               size = vma->vm_end - vma->vm_start;
+               zap_vma_ptes(vma, vma->vm_start, size);
+               dev_dbg(scif_info.mdev.this_device,
+                       "%s ep %p zap vma %p size 0x%lx\n",
+                       __func__, ep, info->vma, size);
+       }
+       spin_unlock(&ep->lock);
+}
+
+/*
+ * Traverse the list of endpoints for a particular remote node and
+ * zap valid remote memory mappings since the remote node is lost
+ */
+static void _scif_zap_mmaps(int node, struct list_head *head)
+{
+       struct scif_endpt *ep;
+       struct list_head *item;
+
+       mutex_lock(&scif_info.connlock);
+       list_for_each(item, head) {
+               ep = list_entry(item, struct scif_endpt, list);
+               if (ep->remote_dev->node == node)
+                       __scif_zap_mmaps(ep);
+       }
+       mutex_unlock(&scif_info.connlock);
+}
+
+/*
+ * Wrapper for removing remote memory mappings for a particular node. This API
+ * is called by peer nodes as part of handling a lost node.
+ */
+void scif_zap_mmaps(int node)
+{
+       _scif_zap_mmaps(node, &scif_info.connected);
+       _scif_zap_mmaps(node, &scif_info.disconnected);
+}
+
+/*
+ * This API is only called while handling a lost node:
+ * a) Remote node is dead.
+ * b) Remote memory mappings have been zapped
+ * So we can traverse the remote_reg_list without any locks. Since
+ * the window has not yet been unregistered we can drop the ref count
+ * and queue it to the cleanup thread.
+ */
+static void __scif_cleanup_rma_for_zombies(struct scif_endpt *ep)
+{
+       struct list_head *pos, *tmp;
+       struct scif_window *window;
+
+       list_for_each_safe(pos, tmp, &ep->rma_info.remote_reg_list) {
+               window = list_entry(pos, struct scif_window, list);
+               if (window->ref_count)
+                       scif_put_window(window, window->nr_pages);
+               else
+                       dev_err(scif_info.mdev.this_device,
+                               "%s %d unexpected\n",
+                               __func__, __LINE__);
+               if (!window->ref_count) {
+                       atomic_inc(&ep->rma_info.tw_refcount);
+                       list_del_init(&window->list);
+                       scif_queue_for_cleanup(window, &scif_info.rma);
+               }
+       }
+}
+
+/* Cleanup remote registration lists for zombie endpoints */
+void scif_cleanup_rma_for_zombies(int node)
+{
+       struct scif_endpt *ep;
+       struct list_head *item;
+
+       mutex_lock(&scif_info.eplock);
+       list_for_each(item, &scif_info.zombie) {
+               ep = list_entry(item, struct scif_endpt, list);
+               if (ep->remote_dev && ep->remote_dev->node == node)
+                       __scif_cleanup_rma_for_zombies(ep);
+       }
+       mutex_unlock(&scif_info.eplock);
+       flush_work(&scif_info.misc_work);
+}
+
+/* Insert the VMA into the per endpoint VMA list */
+static int scif_insert_vma(struct scif_endpt *ep, struct vm_area_struct *vma)
+{
+       struct scif_vma_info *info;
+       int err = 0;
+
+       info = kzalloc(sizeof(*info), GFP_KERNEL);
+       if (!info) {
+               err = -ENOMEM;
+               goto done;
+       }
+       info->vma = vma;
+       spin_lock(&ep->lock);
+       list_add_tail(&info->list, &ep->rma_info.vma_list);
+       spin_unlock(&ep->lock);
+done:
+       return err;
+}
+
+/* Delete the VMA from the per endpoint VMA list */
+static void scif_delete_vma(struct scif_endpt *ep, struct vm_area_struct *vma)
+{
+       struct list_head *item;
+       struct scif_vma_info *info;
+
+       spin_lock(&ep->lock);
+       list_for_each(item, &ep->rma_info.vma_list) {
+               info = list_entry(item, struct scif_vma_info, list);
+               if (info->vma == vma) {
+                       list_del(&info->list);
+                       kfree(info);
+                       break;
+               }
+       }
+       spin_unlock(&ep->lock);
+}
+
+static phys_addr_t scif_get_phys(phys_addr_t phys, struct scif_endpt *ep)
+{
+       struct scif_dev *scifdev = (struct scif_dev *)ep->remote_dev;
+       struct scif_hw_dev *sdev = scifdev->sdev;
+       phys_addr_t out_phys, apt_base = 0;
+
+       /*
+        * If the DMA address is card relative then we need to add the
+        * aperture base for mmap to work correctly
+        */
+       if (!scifdev_self(scifdev) && sdev->aper && sdev->card_rel_da)
+               apt_base = sdev->aper->pa;
+       out_phys = apt_base + phys;
+       return out_phys;
+}
+
+int scif_get_pages(scif_epd_t epd, off_t offset, size_t len,
+                  struct scif_range **pages)
+{
+       struct scif_endpt *ep = (struct scif_endpt *)epd;
+       struct scif_rma_req req;
+       struct scif_window *window = NULL;
+       int nr_pages, err, i;
+
+       dev_dbg(scif_info.mdev.this_device,
+               "SCIFAPI get_pinned_pages: ep %p offset 0x%lx len 0x%lx\n",
+               ep, offset, len);
+       err = scif_verify_epd(ep);
+       if (err)
+               return err;
+
+       if (!len || (offset < 0) ||
+           (offset + len < offset) ||
+           (ALIGN(offset, PAGE_SIZE) != offset) ||
+           (ALIGN(len, PAGE_SIZE) != len))
+               return -EINVAL;
+
+       nr_pages = len >> PAGE_SHIFT;
+
+       req.out_window = &window;
+       req.offset = offset;
+       req.prot = 0;
+       req.nr_bytes = len;
+       req.type = SCIF_WINDOW_SINGLE;
+       req.head = &ep->rma_info.remote_reg_list;
+
+       mutex_lock(&ep->rma_info.rma_lock);
+       /* Does a valid window exist? */
+       err = scif_query_window(&req);
+       if (err) {
+               dev_err(&ep->remote_dev->sdev->dev,
+                       "%s %d err %d\n", __func__, __LINE__, err);
+               goto error;
+       }
+
+       /* Allocate scif_range */
+       *pages = kzalloc(sizeof(**pages), GFP_KERNEL);
+       if (!*pages) {
+               err = -ENOMEM;
+               goto error;
+       }
+
+       /* Allocate phys addr array */
+       (*pages)->phys_addr = scif_zalloc(nr_pages * sizeof(dma_addr_t));
+       if (!((*pages)->phys_addr)) {
+               err = -ENOMEM;
+               goto error;
+       }
+
+       if (scif_is_mgmt_node() && !scifdev_self(ep->remote_dev)) {
+               /* Allocate virtual address array */
+               ((*pages)->va = scif_zalloc(nr_pages * sizeof(void *)));
+               if (!(*pages)->va) {
+                       err = -ENOMEM;
+                       goto error;
+               }
+       }
+       /* Populate the values */
+       (*pages)->cookie = window;
+       (*pages)->nr_pages = nr_pages;
+       (*pages)->prot_flags = window->prot;
+
+       for (i = 0; i < nr_pages; i++) {
+               (*pages)->phys_addr[i] =
+                       __scif_off_to_dma_addr(window, offset +
+                                              (i * PAGE_SIZE));
+               (*pages)->phys_addr[i] = scif_get_phys((*pages)->phys_addr[i],
+                                                       ep);
+               if (scif_is_mgmt_node() && !scifdev_self(ep->remote_dev))
+                       (*pages)->va[i] =
+                               ep->remote_dev->sdev->aper->va +
+                               (*pages)->phys_addr[i] -
+                               ep->remote_dev->sdev->aper->pa;
+       }
+
+       scif_get_window(window, nr_pages);
+error:
+       mutex_unlock(&ep->rma_info.rma_lock);
+       if (err) {
+               if (*pages) {
+                       scif_free((*pages)->phys_addr,
+                                 nr_pages * sizeof(dma_addr_t));
+                       scif_free((*pages)->va,
+                                 nr_pages * sizeof(void *));
+                       kfree(*pages);
+                       *pages = NULL;
+               }
+               dev_err(&ep->remote_dev->sdev->dev,
+                       "%s %d err %d\n", __func__, __LINE__, err);
+       }
+       return err;
+}
+EXPORT_SYMBOL_GPL(scif_get_pages);
+
+int scif_put_pages(struct scif_range *pages)
+{
+       struct scif_endpt *ep;
+       struct scif_window *window;
+       struct scifmsg msg;
+
+       if (!pages || !pages->cookie)
+               return -EINVAL;
+
+       window = pages->cookie;
+
+       if (!window || window->magic != SCIFEP_MAGIC)
+               return -EINVAL;
+
+       ep = (struct scif_endpt *)window->ep;
+       /*
+        * If the state is SCIFEP_CONNECTED or SCIFEP_DISCONNECTED then the
+        * callee should be allowed to release references to the pages,
+        * else the endpoint was not connected in the first place,
+        * hence the ENOTCONN.
+        */
+       if (ep->state != SCIFEP_CONNECTED && ep->state != SCIFEP_DISCONNECTED)
+               return -ENOTCONN;
+
+       mutex_lock(&ep->rma_info.rma_lock);
+
+       scif_put_window(window, pages->nr_pages);
+
+       /* Initiate window destruction if ref count is zero */
+       if (!window->ref_count) {
+               list_del(&window->list);
+               mutex_unlock(&ep->rma_info.rma_lock);
+               scif_drain_dma_intr(ep->remote_dev->sdev,
+                                   ep->rma_info.dma_chan);
+               /* Inform the peer about this window being destroyed. */
+               msg.uop = SCIF_MUNMAP;
+               msg.src = ep->port;
+               msg.payload[0] = window->peer_window;
+               /* No error handling for notification messages */
+               scif_nodeqp_send(ep->remote_dev, &msg);
+               /* Destroy this window from the peer's registered AS */
+               scif_destroy_remote_window(window);
+       } else {
+               mutex_unlock(&ep->rma_info.rma_lock);
+       }
+
+       scif_free(pages->phys_addr, pages->nr_pages * sizeof(dma_addr_t));
+       scif_free(pages->va, pages->nr_pages * sizeof(void *));
+       kfree(pages);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(scif_put_pages);
+
+/*
+ * scif_rma_list_mmap:
+ *
+ * Traverse the remote registration list starting from start_window:
+ * 1) Create VtoP mappings via remap_pfn_range(..)
+ * 2) Once step 1) and 2) complete successfully then traverse the range of
+ *    windows again and bump the reference count.
+ * RMA lock must be held.
+ */
+static int scif_rma_list_mmap(struct scif_window *start_window, s64 offset,
+                             int nr_pages, struct vm_area_struct *vma)
+{
+       s64 end_offset, loop_offset = offset;
+       struct scif_window *window = start_window;
+       int loop_nr_pages, nr_pages_left = nr_pages;
+       struct scif_endpt *ep = (struct scif_endpt *)start_window->ep;
+       struct list_head *head = &ep->rma_info.remote_reg_list;
+       int i, err = 0;
+       dma_addr_t phys_addr;
+       struct scif_window_iter src_win_iter;
+       size_t contig_bytes = 0;
+
+       might_sleep();
+       list_for_each_entry_from(window, head, list) {
+               end_offset = window->offset +
+                       (window->nr_pages << PAGE_SHIFT);
+               loop_nr_pages = min_t(int,
+                                     (end_offset - loop_offset) >> PAGE_SHIFT,
+                                     nr_pages_left);
+               scif_init_window_iter(window, &src_win_iter);
+               for (i = 0; i < loop_nr_pages; i++) {
+                       phys_addr = scif_off_to_dma_addr(window, loop_offset,
+                                                        &contig_bytes,
+                                                        &src_win_iter);
+                       phys_addr = scif_get_phys(phys_addr, ep);
+                       err = remap_pfn_range(vma,
+                                             vma->vm_start +
+                                             loop_offset - offset,
+                                             phys_addr >> PAGE_SHIFT,
+                                             PAGE_SIZE,
+                                             vma->vm_page_prot);
+                       if (err)
+                               goto error;
+                       loop_offset += PAGE_SIZE;
+               }
+               nr_pages_left -= loop_nr_pages;
+               if (!nr_pages_left)
+                       break;
+       }
+       /*
+        * No more failures expected. Bump up the ref count for all
+        * the windows. Another traversal from start_window required
+        * for handling errors encountered across windows during
+        * remap_pfn_range(..).
+        */
+       loop_offset = offset;
+       nr_pages_left = nr_pages;
+       window = start_window;
+       head = &ep->rma_info.remote_reg_list;
+       list_for_each_entry_from(window, head, list) {
+               end_offset = window->offset +
+                       (window->nr_pages << PAGE_SHIFT);
+               loop_nr_pages = min_t(int,
+                                     (end_offset - loop_offset) >> PAGE_SHIFT,
+                                     nr_pages_left);
+               scif_get_window(window, loop_nr_pages);
+               nr_pages_left -= loop_nr_pages;
+               loop_offset += (loop_nr_pages << PAGE_SHIFT);
+               if (!nr_pages_left)
+                       break;
+       }
+error:
+       if (err)
+               dev_err(scif_info.mdev.this_device,
+                       "%s %d err %d\n", __func__, __LINE__, err);
+       return err;
+}
+
+/*
+ * scif_rma_list_munmap:
+ *
+ * Traverse the remote registration list starting from window:
+ * 1) Decrement ref count.
+ * 2) If the ref count drops to zero then send a SCIF_MUNMAP message to peer.
+ * RMA lock must be held.
+ */
+static void scif_rma_list_munmap(struct scif_window *start_window,
+                                s64 offset, int nr_pages)
+{
+       struct scifmsg msg;
+       s64 loop_offset = offset, end_offset;
+       int loop_nr_pages, nr_pages_left = nr_pages;
+       struct scif_endpt *ep = (struct scif_endpt *)start_window->ep;
+       struct list_head *head = &ep->rma_info.remote_reg_list;
+       struct scif_window *window = start_window, *_window;
+
+       msg.uop = SCIF_MUNMAP;
+       msg.src = ep->port;
+       loop_offset = offset;
+       nr_pages_left = nr_pages;
+       list_for_each_entry_safe_from(window, _window, head, list) {
+               end_offset = window->offset +
+                       (window->nr_pages << PAGE_SHIFT);
+               loop_nr_pages = min_t(int,
+                                     (end_offset - loop_offset) >> PAGE_SHIFT,
+                                     nr_pages_left);
+               scif_put_window(window, loop_nr_pages);
+               if (!window->ref_count) {
+                       struct scif_dev *rdev = ep->remote_dev;
+
+                       scif_drain_dma_intr(rdev->sdev,
+                                           ep->rma_info.dma_chan);
+                       /* Inform the peer about this munmap */
+                       msg.payload[0] = window->peer_window;
+                       /* No error handling for Notification messages. */
+                       scif_nodeqp_send(ep->remote_dev, &msg);
+                       list_del(&window->list);
+                       /* Destroy this window from the peer's registered AS */
+                       scif_destroy_remote_window(window);
+               }
+               nr_pages_left -= loop_nr_pages;
+               loop_offset += (loop_nr_pages << PAGE_SHIFT);
+               if (!nr_pages_left)
+                       break;
+       }
+}
+
+/*
+ * The private data field of each VMA used to mmap a remote window
+ * points to an instance of struct vma_pvt
+ */
+struct vma_pvt {
+       struct scif_endpt *ep;  /* End point for remote window */
+       s64 offset;             /* offset within remote window */
+       bool valid_offset;      /* offset is valid only if the original
+                                * mmap request was for a single page
+                                * else the offset within the vma is
+                                * the correct offset
+                                */
+       struct kref ref;
+};
+
+static void vma_pvt_release(struct kref *ref)
+{
+       struct vma_pvt *vmapvt = container_of(ref, struct vma_pvt, ref);
+
+       kfree(vmapvt);
+}
+
+/**
+ * scif_vma_open - VMA open driver callback
+ * @vma: VMM memory area.
+ * The open method is called by the kernel to allow the subsystem implementing
+ * the VMA to initialize the area. This method is invoked any time a new
+ * reference to the VMA is made (when a process forks, for example).
+ * The one exception happens when the VMA is first created by mmap;
+ * in this case, the driver's mmap method is called instead.
+ * This function is also invoked when an existing VMA is split by the kernel
+ * due to a call to munmap on a subset of the VMA resulting in two VMAs.
+ * The kernel invokes this function only on one of the two VMAs.
+ */
+static void scif_vma_open(struct vm_area_struct *vma)
+{
+       struct vma_pvt *vmapvt = vma->vm_private_data;
+
+       dev_dbg(scif_info.mdev.this_device,
+               "SCIFAPI vma open: vma_start 0x%lx vma_end 0x%lx\n",
+               vma->vm_start, vma->vm_end);
+       scif_insert_vma(vmapvt->ep, vma);
+       kref_get(&vmapvt->ref);
+}
+
+/**
+ * scif_munmap - VMA close driver callback.
+ * @vma: VMM memory area.
+ * When an area is destroyed, the kernel calls its close operation.
+ * Note that there's no usage count associated with VMA's; the area
+ * is opened and closed exactly once by each process that uses it.
+ */
+static void scif_munmap(struct vm_area_struct *vma)
+{
+       struct scif_endpt *ep;
+       struct vma_pvt *vmapvt = vma->vm_private_data;
+       int nr_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+       s64 offset;
+       struct scif_rma_req req;
+       struct scif_window *window = NULL;
+       int err;
+
+       might_sleep();
+       dev_dbg(scif_info.mdev.this_device,
+               "SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n",
+               vma->vm_start, vma->vm_end);
+       ep = vmapvt->ep;
+       offset = vmapvt->valid_offset ? vmapvt->offset :
+               (vma->vm_pgoff) << PAGE_SHIFT;
+       dev_dbg(scif_info.mdev.this_device,
+               "SCIFAPI munmap: ep %p nr_pages 0x%x offset 0x%llx\n",
+               ep, nr_pages, offset);
+       req.out_window = &window;
+       req.offset = offset;
+       req.nr_bytes = vma->vm_end - vma->vm_start;
+       req.prot = vma->vm_flags & (VM_READ | VM_WRITE);
+       req.type = SCIF_WINDOW_PARTIAL;
+       req.head = &ep->rma_info.remote_reg_list;
+
+       mutex_lock(&ep->rma_info.rma_lock);
+
+       err = scif_query_window(&req);
+       if (err)
+               dev_err(scif_info.mdev.this_device,
+                       "%s %d err %d\n", __func__, __LINE__, err);
+       else
+               scif_rma_list_munmap(window, offset, nr_pages);
+
+       mutex_unlock(&ep->rma_info.rma_lock);
+       /*
+        * The kernel probably zeroes these out but we still want
+        * to clean up our own mess just in case.
+        */
+       vma->vm_ops = NULL;
+       vma->vm_private_data = NULL;
+       kref_put(&vmapvt->ref, vma_pvt_release);
+       scif_delete_vma(ep, vma);
+}
+
+static const struct vm_operations_struct scif_vm_ops = {
+       .open = scif_vma_open,
+       .close = scif_munmap,
+};
+
+/**
+ * scif_mmap - Map pages in virtual address space to a remote window.
+ * @vma: VMM memory area.
+ * @epd: endpoint descriptor
+ *
+ * Return: Upon successful completion, scif_mmap() returns zero
+ * else an apt error is returned as documented in scif.h
+ */
+int scif_mmap(struct vm_area_struct *vma, scif_epd_t epd)
+{
+       struct scif_rma_req req;
+       struct scif_window *window = NULL;
+       struct scif_endpt *ep = (struct scif_endpt *)epd;
+       s64 start_offset = vma->vm_pgoff << PAGE_SHIFT;
+       int nr_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+       int err;
+       struct vma_pvt *vmapvt;
+
+       dev_dbg(scif_info.mdev.this_device,
+               "SCIFAPI mmap: ep %p start_offset 0x%llx nr_pages 0x%x\n",
+               ep, start_offset, nr_pages);
+       err = scif_verify_epd(ep);
+       if (err)
+               return err;
+
+       might_sleep();
+
+       err = scif_insert_vma(ep, vma);
+       if (err)
+               return err;
+
+       vmapvt = kzalloc(sizeof(*vmapvt), GFP_KERNEL);
+       if (!vmapvt) {
+               scif_delete_vma(ep, vma);
+               return -ENOMEM;
+       }
+
+       vmapvt->ep = ep;
+       kref_init(&vmapvt->ref);
+
+       req.out_window = &window;
+       req.offset = start_offset;
+       req.nr_bytes = vma->vm_end - vma->vm_start;
+       req.prot = vma->vm_flags & (VM_READ | VM_WRITE);
+       req.type = SCIF_WINDOW_PARTIAL;
+       req.head = &ep->rma_info.remote_reg_list;
+
+       mutex_lock(&ep->rma_info.rma_lock);
+       /* Does a valid window exist? */
+       err = scif_query_window(&req);
+       if (err) {
+               dev_err(&ep->remote_dev->sdev->dev,
+                       "%s %d err %d\n", __func__, __LINE__, err);
+               goto error_unlock;
+       }
+
+       /* Default prot for loopback */
+       if (!scifdev_self(ep->remote_dev))
+               vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+
+       /*
+        * VM_DONTCOPY - Do not copy this vma on fork
+        * VM_DONTEXPAND - Cannot expand with mremap()
+        * VM_RESERVED - Count as reserved_vm like IO
+        * VM_PFNMAP - Page-ranges managed without "struct page"
+        * VM_IO - Memory mapped I/O or similar
+        *
+        * We do not want to copy this VMA automatically on a fork(),
+        * expand this VMA due to mremap() or swap out these pages since
+        * the VMA is actually backed by physical pages in the remote
+        * node's physical memory and not via a struct page.
+        */
+       vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
+
+       if (!scifdev_self(ep->remote_dev))
+               vma->vm_flags |= VM_IO | VM_PFNMAP;
+
+       /* Map this range of windows */
+       err = scif_rma_list_mmap(window, start_offset, nr_pages, vma);
+       if (err) {
+               dev_err(&ep->remote_dev->sdev->dev,
+                       "%s %d err %d\n", __func__, __LINE__, err);
+               goto error_unlock;
+       }
+       /* Set up the driver call back */
+       vma->vm_ops = &scif_vm_ops;
+       vma->vm_private_data = vmapvt;
+error_unlock:
+       mutex_unlock(&ep->rma_info.rma_lock);
+       if (err) {
+               kfree(vmapvt);
+               dev_err(&ep->remote_dev->sdev->dev,
+                       "%s %d err %d\n", __func__, __LINE__, err);
+               scif_delete_vma(ep, vma);
+       }
+       return err;
+}