[firefly-linux-kernel-4.4.55.git] net/sunrpc/xprtrdma/verbs.c
1 /*
2  * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the BSD-type
8  * license below:
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  *      Redistributions of source code must retain the above copyright
15  *      notice, this list of conditions and the following disclaimer.
16  *
17  *      Redistributions in binary form must reproduce the above
18  *      copyright notice, this list of conditions and the following
19  *      disclaimer in the documentation and/or other materials provided
20  *      with the distribution.
21  *
22  *      Neither the name of the Network Appliance, Inc. nor the names of
23  *      its contributors may be used to endorse or promote products
24  *      derived from this software without specific prior written
25  *      permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38  */
39
40 /*
41  * verbs.c
42  *
43  * Encapsulates the major functions managing:
44  *  o adapters
45  *  o endpoints
46  *  o connections
47  *  o buffer memory
48  */
49
50 #include <linux/interrupt.h>
51 #include <linux/pci.h>  /* for Tavor hack below */
52 #include <linux/slab.h>
53
54 #include "xprt_rdma.h"
55
56 /*
57  * Globals/Macros
58  */
59
60 #ifdef RPC_DEBUG
61 # define RPCDBG_FACILITY        RPCDBG_TRANS
62 #endif
63
64 /*
65  * internal functions
66  */
67
68 /*
69  * Handle replies in tasklet context, using a single, global list.
70  * The tasklet function simply walks the list and invokes each reply's
71  * handler; replies without a handler are returned to the buffer pool.
72  */
73
74 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
75 static LIST_HEAD(rpcrdma_tasklets_g);
76
77 static void
78 rpcrdma_run_tasklet(unsigned long data)
79 {
80         struct rpcrdma_rep *rep;
81         void (*func)(struct rpcrdma_rep *);
82         unsigned long flags;
83
84         data = data;    /* tasklet argument is unused */
85         spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
86         while (!list_empty(&rpcrdma_tasklets_g)) {
87                 rep = list_entry(rpcrdma_tasklets_g.next,
88                                  struct rpcrdma_rep, rr_list);
89                 list_del(&rep->rr_list);
90                 func = rep->rr_func;
91                 rep->rr_func = NULL;
92                 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
93
94                 if (func)
95                         func(rep);
96                 else
97                         rpcrdma_recv_buffer_put(rep);
98
99                 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
100         }
101         spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
102 }
103
104 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
105
106 static inline void
107 rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
108 {
109         unsigned long flags;
110
111         spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
112         list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
113         spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
114         tasklet_schedule(&rpcrdma_tasklet_g);
115 }
116
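/*
 * Asynchronous QP error upcall from the RDMA provider. If the endpoint
 * is currently connected, mark it failed (-EIO), notify the transport
 * through rep_func, and wake any waiters on rep_connect_wait.
 */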
117 static void
118 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
119 {
120         struct rpcrdma_ep *ep = context;
121
122         dprintk("RPC:       %s: QP error %X on device %s ep %p\n",
123                 __func__, event->event, event->device->name, context);
124         if (ep->rep_connected == 1) {
125                 ep->rep_connected = -EIO;
126                 ep->rep_func(ep);
127                 wake_up_all(&ep->rep_connect_wait);
128         }
129 }
130
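/* Asynchronous CQ error upcall: handled the same way as a QP error. */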
131 static void
132 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
133 {
134         struct rpcrdma_ep *ep = context;
135
136         dprintk("RPC:       %s: CQ error %X on device %s ep %p\n",
137                 __func__, event->event, event->device->name, context);
138         if (ep->rep_connected == 1) {
139                 ep->rep_connected = -EIO;
140                 ep->rep_func(ep);
141                 wake_up_all(&ep->rep_connect_wait);
142         }
143 }
144
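/*
 * Handle a single work completion. FAST_REG_MR and LOCAL_INV completions
 * only update FRMR state. A successful receive records the reply length,
 * syncs the buffer for CPU access, refreshes the credit count advertised
 * in the inline RPC/RDMA header, and then (like MW bind completions)
 * hands the reply to the tasklet for processing.
 */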
145 static inline
146 void rpcrdma_event_process(struct ib_wc *wc)
147 {
148         struct rpcrdma_mw *frmr;
149         struct rpcrdma_rep *rep =
150                         (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
151
152         dprintk("RPC:       %s: event rep %p status %X opcode %X length %u\n",
153                 __func__, rep, wc->status, wc->opcode, wc->byte_len);
154
155         if (!rep) /* send or bind completion that we don't care about */
156                 return;
157
158         if (wc->status != IB_WC_SUCCESS) {
159                 dprintk("RPC:       %s: WC opcode %d status %X, connection lost\n",
160                         __func__, wc->opcode, wc->status);
161                 rep->rr_len = ~0U;
162                 if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
163                         rpcrdma_schedule_tasklet(rep);
164                 return;
165         }
166
167         switch (wc->opcode) {
168         case IB_WC_FAST_REG_MR:
169                 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
170                 frmr->r.frmr.state = FRMR_IS_VALID;
171                 break;
172         case IB_WC_LOCAL_INV:
173                 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
174                 frmr->r.frmr.state = FRMR_IS_INVALID;
175                 break;
176         case IB_WC_RECV:
177                 rep->rr_len = wc->byte_len;
178                 ib_dma_sync_single_for_cpu(
179                         rdmab_to_ia(rep->rr_buffer)->ri_id->device,
180                         rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
181                 /* Keep (only) the most recent credits, after checking validity */
182                 if (rep->rr_len >= 16) {
183                         struct rpcrdma_msg *p =
184                                         (struct rpcrdma_msg *) rep->rr_base;
185                         unsigned int credits = ntohl(p->rm_credit);
186                         if (credits == 0) {
187                                 dprintk("RPC:       %s: server"
188                                         " dropped credits to 0!\n", __func__);
189                                 /* don't deadlock */
190                                 credits = 1;
191                         } else if (credits > rep->rr_buffer->rb_max_requests) {
192                                 dprintk("RPC:       %s: server"
193                                         " over-crediting: %d (%d)\n",
194                                         __func__, credits,
195                                         rep->rr_buffer->rb_max_requests);
196                                 credits = rep->rr_buffer->rb_max_requests;
197                         }
198                         atomic_set(&rep->rr_buffer->rb_credits, credits);
199                 }
200                 /* fall through */
201         case IB_WC_BIND_MW:
202                 rpcrdma_schedule_tasklet(rep);
203                 break;
204         default:
205                 dprintk("RPC:       %s: unexpected WC event %X\n",
206                         __func__, wc->opcode);
207                 break;
208         }
209 }
210
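/*
 * Poll the CQ until it is empty, processing each work completion in
 * turn. Returns 0 on success or the negative value from ib_poll_cq().
 */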
211 static inline int
212 rpcrdma_cq_poll(struct ib_cq *cq)
213 {
214         struct ib_wc wc;
215         int rc;
216
217         for (;;) {
218                 rc = ib_poll_cq(cq, 1, &wc);
219                 if (rc < 0) {
220                         dprintk("RPC:       %s: ib_poll_cq failed %i\n",
221                                 __func__, rc);
222                         return rc;
223                 }
224                 if (rc == 0)
225                         break;
226
227                 rpcrdma_event_process(&wc);
228         }
229
230         return 0;
231 }
232
233 /*
234  * rpcrdma_cq_event_upcall
235  *
236  * This upcall handles recv, send, bind and unbind events.
237  * It is reentrant but processes events one at a time in order to
238  * preserve the ordering of receives, which carries the server's credits.
239  *
240  * It is the responsibility of the scheduled tasklet to return
241  * recv buffers to the pool. NOTE: this affects synchronization of
242  * connection shutdown. That is, the structures required for
243  * the completion of the reply handler must remain intact until
244  * all memory has been reclaimed.
245  *
246  * Note that send events are suppressed and do not result in an upcall.
247  */
248 static void
249 rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
250 {
251         int rc;
252
253         rc = rpcrdma_cq_poll(cq);
254         if (rc)
255                 return;
256
257         rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
258         if (rc) {
259                 dprintk("RPC:       %s: ib_req_notify_cq failed %i\n",
260                         __func__, rc);
261                 return;
262         }
263
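        /*
         * Poll once more after re-arming the CQ: a completion that
         * arrived between the first poll and ib_req_notify_cq() would
         * otherwise wait for the next interrupt.
         */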
264         rpcrdma_cq_poll(cq);
265 }
266
267 #ifdef RPC_DEBUG
268 static const char * const conn[] = {
269         "address resolved",
270         "address error",
271         "route resolved",
272         "route error",
273         "connect request",
274         "connect response",
275         "connect error",
276         "unreachable",
277         "rejected",
278         "established",
279         "disconnected",
280         "device removal"
281 };
282 #endif
283
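/*
 * Connection Manager event handler. Address/route resolution results
 * are recorded in ri_async_rc and signalled via ri_done; connection
 * state changes update rep_connected, reset the credit count, call
 * rep_func, and wake rep_connect_wait.
 */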
284 static int
285 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
286 {
287         struct rpcrdma_xprt *xprt = id->context;
288         struct rpcrdma_ia *ia = &xprt->rx_ia;
289         struct rpcrdma_ep *ep = &xprt->rx_ep;
290 #ifdef RPC_DEBUG
291         struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
292 #endif
293         struct ib_qp_attr attr;
294         struct ib_qp_init_attr iattr;
295         int connstate = 0;
296
297         switch (event->event) {
298         case RDMA_CM_EVENT_ADDR_RESOLVED:
299         case RDMA_CM_EVENT_ROUTE_RESOLVED:
300                 ia->ri_async_rc = 0;
301                 complete(&ia->ri_done);
302                 break;
303         case RDMA_CM_EVENT_ADDR_ERROR:
304                 ia->ri_async_rc = -EHOSTUNREACH;
305                 dprintk("RPC:       %s: CM address resolution error, ep 0x%p\n",
306                         __func__, ep);
307                 complete(&ia->ri_done);
308                 break;
309         case RDMA_CM_EVENT_ROUTE_ERROR:
310                 ia->ri_async_rc = -ENETUNREACH;
311                 dprintk("RPC:       %s: CM route resolution error, ep 0x%p\n",
312                         __func__, ep);
313                 complete(&ia->ri_done);
314                 break;
315         case RDMA_CM_EVENT_ESTABLISHED:
316                 connstate = 1;
317                 ib_query_qp(ia->ri_id->qp, &attr,
318                         IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
319                         &iattr);
320                 dprintk("RPC:       %s: %d responder resources"
321                         " (%d initiator)\n",
322                         __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
323                 goto connected;
324         case RDMA_CM_EVENT_CONNECT_ERROR:
325                 connstate = -ENOTCONN;
326                 goto connected;
327         case RDMA_CM_EVENT_UNREACHABLE:
328                 connstate = -ENETDOWN;
329                 goto connected;
330         case RDMA_CM_EVENT_REJECTED:
331                 connstate = -ECONNREFUSED;
332                 goto connected;
333         case RDMA_CM_EVENT_DISCONNECTED:
334                 connstate = -ECONNABORTED;
335                 goto connected;
336         case RDMA_CM_EVENT_DEVICE_REMOVAL:
337                 connstate = -ENODEV;
338 connected:
339                 dprintk("RPC:       %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
340                         __func__,
341                         (event->event < ARRAY_SIZE(conn)) ? conn[event->event] :
342                                                 "unknown connection error",
343                         &addr->sin_addr.s_addr,
344                         ntohs(addr->sin_port),
345                         ep, event->event);
346                 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
347                 dprintk("RPC:       %s: %sconnected\n",
348                                         __func__, connstate > 0 ? "" : "dis");
349                 ep->rep_connected = connstate;
350                 ep->rep_func(ep);
351                 wake_up_all(&ep->rep_connect_wait);
352                 break;
353         default:
354                 dprintk("RPC:       %s: unexpected CM event %d\n",
355                         __func__, event->event);
356                 break;
357         }
358
359 #ifdef RPC_DEBUG
360         if (connstate == 1) {
361                 int ird = attr.max_dest_rd_atomic;
362                 int tird = ep->rep_remote_cma.responder_resources;
363                 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
364                         "on %s, memreg %d slots %d ird %d%s\n",
365                         &addr->sin_addr.s_addr,
366                         ntohs(addr->sin_port),
367                         ia->ri_id->device->name,
368                         ia->ri_memreg_strategy,
369                         xprt->rx_buf.rb_max_requests,
370                         ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
371         } else if (connstate < 0) {
372                 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
373                         &addr->sin_addr.s_addr,
374                         ntohs(addr->sin_port),
375                         connstate);
376         }
377 #endif
378
379         return 0;
380 }
381
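/*
 * Create an rdma_cm_id for this transport and synchronously resolve
 * the server's address and route, waiting (interruptibly, with a
 * timeout) for the CM callback to signal completion of each step.
 * Returns the new id, or an ERR_PTR on failure.
 */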
382 static struct rdma_cm_id *
383 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
384                         struct rpcrdma_ia *ia, struct sockaddr *addr)
385 {
386         struct rdma_cm_id *id;
387         int rc;
388
389         init_completion(&ia->ri_done);
390
391         id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
392         if (IS_ERR(id)) {
393                 rc = PTR_ERR(id);
394                 dprintk("RPC:       %s: rdma_create_id() failed %i\n",
395                         __func__, rc);
396                 return id;
397         }
398
399         ia->ri_async_rc = -ETIMEDOUT;
400         rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
401         if (rc) {
402                 dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
403                         __func__, rc);
404                 goto out;
405         }
406         wait_for_completion_interruptible_timeout(&ia->ri_done,
407                                 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
408         rc = ia->ri_async_rc;
409         if (rc)
410                 goto out;
411
412         ia->ri_async_rc = -ETIMEDOUT;
413         rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
414         if (rc) {
415                 dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
416                         __func__, rc);
417                 goto out;
418         }
419         wait_for_completion_interruptible_timeout(&ia->ri_done,
420                                 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
421         rc = ia->ri_async_rc;
422         if (rc)
423                 goto out;
424
425         return id;
426
427 out:
428         rdma_destroy_id(id);
429         return ERR_PTR(rc);
430 }
431
432 /*
433  * Drain any cq, prior to teardown.
434  */
435 static void
436 rpcrdma_clean_cq(struct ib_cq *cq)
437 {
438         struct ib_wc wc;
439         int count = 0;
440
441         while (1 == ib_poll_cq(cq, 1, &wc))
442                 ++count;
443
444         if (count)
445                 dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
446                         __func__, count, wc.opcode);
447 }
448
449 /*
450  * Exported functions.
451  */
452
453 /*
454  * Open and initialize an Interface Adapter.
455  *  o initializes fields of struct rpcrdma_ia, including
456  *    interface and provider attributes and protection zone.
457  */
458 int
459 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
460 {
461         int rc, mem_priv;
462         struct ib_device_attr devattr;
463         struct rpcrdma_ia *ia = &xprt->rx_ia;
464
465         ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
466         if (IS_ERR(ia->ri_id)) {
467                 rc = PTR_ERR(ia->ri_id);
468                 goto out1;
469         }
470
471         ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
472         if (IS_ERR(ia->ri_pd)) {
473                 rc = PTR_ERR(ia->ri_pd);
474                 dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
475                         __func__, rc);
476                 goto out2;
477         }
478
479         /*
480          * Query the device to determine if the requested memory
481          * registration strategy is supported. If it isn't, set the
482          * strategy to a globally supported model.
483          */
484         rc = ib_query_device(ia->ri_id->device, &devattr);
485         if (rc) {
486                 dprintk("RPC:       %s: ib_query_device failed %d\n",
487                         __func__, rc);
488                 goto out2;
489         }
490
491         if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
492                 ia->ri_have_dma_lkey = 1;
493                 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
494         }
495
496         switch (memreg) {
497         case RPCRDMA_MEMWINDOWS:
498         case RPCRDMA_MEMWINDOWS_ASYNC:
499                 if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
500                         dprintk("RPC:       %s: MEMWINDOWS registration "
501                                 "specified but not supported by adapter, "
502                                 "using slower RPCRDMA_REGISTER\n",
503                                 __func__);
504                         memreg = RPCRDMA_REGISTER;
505                 }
506                 break;
507         case RPCRDMA_MTHCAFMR:
508                 if (!ia->ri_id->device->alloc_fmr) {
509 #if RPCRDMA_PERSISTENT_REGISTRATION
510                         dprintk("RPC:       %s: MTHCAFMR registration "
511                                 "specified but not supported by adapter, "
512                                 "using riskier RPCRDMA_ALLPHYSICAL\n",
513                                 __func__);
514                         memreg = RPCRDMA_ALLPHYSICAL;
515 #else
516                         dprintk("RPC:       %s: MTHCAFMR registration "
517                                 "specified but not supported by adapter, "
518                                 "using slower RPCRDMA_REGISTER\n",
519                                 __func__);
520                         memreg = RPCRDMA_REGISTER;
521 #endif
522                 }
523                 break;
524         case RPCRDMA_FRMR:
525                 /* Requires both frmr reg and local dma lkey */
526                 if ((devattr.device_cap_flags &
527                      (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
528                     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
529 #if RPCRDMA_PERSISTENT_REGISTRATION
530                         dprintk("RPC:       %s: FRMR registration "
531                                 "specified but not supported by adapter, "
532                                 "using riskier RPCRDMA_ALLPHYSICAL\n",
533                                 __func__);
534                         memreg = RPCRDMA_ALLPHYSICAL;
535 #else
536                         dprintk("RPC:       %s: FRMR registration "
537                                 "specified but not supported by adapter, "
538                                 "using slower RPCRDMA_REGISTER\n",
539                                 __func__);
540                         memreg = RPCRDMA_REGISTER;
541 #endif
542                 } else {
543                         /* Mind the ia limit on FRMR page list depth */
544                         ia->ri_max_frmr_depth = min_t(unsigned int,
545                                 RPCRDMA_MAX_DATA_SEGS,
546                                 devattr.max_fast_reg_page_list_len);
547                 }
548                 break;
549         }
550
551         /*
552          * Optionally obtain an underlying physical identity mapping in
553          * order to do a memory window-based bind. This base registration
554          * is protected from remote access - that is enabled only by binding
555          * for the specific bytes targeted during each RPC operation, and
556          * revoked after the corresponding completion similar to a storage
557          * adapter.
558          */
559         switch (memreg) {
560         case RPCRDMA_BOUNCEBUFFERS:
561         case RPCRDMA_REGISTER:
562         case RPCRDMA_FRMR:
563                 break;
564 #if RPCRDMA_PERSISTENT_REGISTRATION
565         case RPCRDMA_ALLPHYSICAL:
566                 mem_priv = IB_ACCESS_LOCAL_WRITE |
567                                 IB_ACCESS_REMOTE_WRITE |
568                                 IB_ACCESS_REMOTE_READ;
569                 goto register_setup;
570 #endif
571         case RPCRDMA_MEMWINDOWS_ASYNC:
572         case RPCRDMA_MEMWINDOWS:
573                 mem_priv = IB_ACCESS_LOCAL_WRITE |
574                                 IB_ACCESS_MW_BIND;
575                 goto register_setup;
576         case RPCRDMA_MTHCAFMR:
577                 if (ia->ri_have_dma_lkey)
578                         break;
579                 mem_priv = IB_ACCESS_LOCAL_WRITE;
580         register_setup:
581                 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
582                 if (IS_ERR(ia->ri_bind_mem)) {
583                         printk(KERN_ALERT "%s: ib_get_dma_mr for "
584                                 "phys register failed with %lX\n\t"
585                                 "Will continue with degraded performance\n",
586                                 __func__, PTR_ERR(ia->ri_bind_mem));
587                         memreg = RPCRDMA_REGISTER;
588                         ia->ri_bind_mem = NULL;
589                 }
590                 break;
591         default:
592                 printk(KERN_ERR "%s: invalid memory registration mode %d\n",
593                                 __func__, memreg);
594                 rc = -EINVAL;
595                 goto out2;
596         }
597         dprintk("RPC:       %s: memory registration strategy is %d\n",
598                 __func__, memreg);
599
600         /* Else will do memory reg/dereg for each chunk */
601         ia->ri_memreg_strategy = memreg;
602
603         return 0;
604 out2:
605         rdma_destroy_id(ia->ri_id);
606         ia->ri_id = NULL;
607 out1:
608         return rc;
609 }
610
611 /*
612  * Clean up/close an IA.
613  *   o if event handles and PD have been initialized, free them.
614  *   o close the IA
615  */
616 void
617 rpcrdma_ia_close(struct rpcrdma_ia *ia)
618 {
619         int rc;
620
621         dprintk("RPC:       %s: entering\n", __func__);
622         if (ia->ri_bind_mem != NULL) {
623                 rc = ib_dereg_mr(ia->ri_bind_mem);
624                 dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
625                         __func__, rc);
626         }
627         if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
628                 if (ia->ri_id->qp)
629                         rdma_destroy_qp(ia->ri_id);
630                 rdma_destroy_id(ia->ri_id);
631                 ia->ri_id = NULL;
632         }
633         if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
634                 rc = ib_dealloc_pd(ia->ri_pd);
635                 dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
636                         __func__, rc);
637         }
638 }
639
640 /*
641  * Create unconnected endpoint.
642  */
643 int
644 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
645                                 struct rpcrdma_create_data_internal *cdata)
646 {
647         struct ib_device_attr devattr;
648         int rc, err;
649
650         rc = ib_query_device(ia->ri_id->device, &devattr);
651         if (rc) {
652                 dprintk("RPC:       %s: ib_query_device failed %d\n",
653                         __func__, rc);
654                 return rc;
655         }
656
657         /* check provider's send/recv wr limits */
658         if (cdata->max_requests > devattr.max_qp_wr)
659                 cdata->max_requests = devattr.max_qp_wr;
660
661         ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
662         ep->rep_attr.qp_context = ep;
663         /* send_cq and recv_cq initialized below */
664         ep->rep_attr.srq = NULL;
665         ep->rep_attr.cap.max_send_wr = cdata->max_requests;
666         switch (ia->ri_memreg_strategy) {
667         case RPCRDMA_FRMR: {
668                 int depth = 7;
669
670                 /* Add room for frmr register and invalidate WRs.
671                  * 1. FRMR reg WR for head
672                  * 2. FRMR invalidate WR for head
673                  * 3. N FRMR reg WRs for pagelist
674                  * 4. N FRMR invalidate WRs for pagelist
675                  * 5. FRMR reg WR for tail
676                  * 6. FRMR invalidate WR for tail
677                  * 7. The RDMA_SEND WR
678                  */
679
680                 /* Calculate N if the device max FRMR depth is smaller than
681                  * RPCRDMA_MAX_DATA_SEGS.
682                  */
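                /* Illustrative example (actual values depend on the build and
                 * the device): with RPCRDMA_MAX_DATA_SEGS == 64 and
                 * ri_max_frmr_depth == 16, delta starts at 48, the loop below
                 * runs three times, and depth ends up as 7 + 3*2 = 13.
                 */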
683                 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
684                         int delta = RPCRDMA_MAX_DATA_SEGS -
685                                     ia->ri_max_frmr_depth;
686
687                         do {
688                                 depth += 2; /* FRMR reg + invalidate */
689                                 delta -= ia->ri_max_frmr_depth;
690                         } while (delta > 0);
691
692                 }
693                 ep->rep_attr.cap.max_send_wr *= depth;
694                 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
695                         cdata->max_requests = devattr.max_qp_wr / depth;
696                         if (!cdata->max_requests)
697                                 return -EINVAL;
698                         ep->rep_attr.cap.max_send_wr = cdata->max_requests *
699                                                        depth;
700                 }
701                 break;
702         }
703         case RPCRDMA_MEMWINDOWS_ASYNC:
704         case RPCRDMA_MEMWINDOWS:
705                 /* Add room for mw_binds+unbinds - overkill! */
706                 ep->rep_attr.cap.max_send_wr++;
707                 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
708                 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
709                         return -EINVAL;
710                 break;
711         default:
712                 break;
713         }
714         ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
715         ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
716         ep->rep_attr.cap.max_recv_sge = 1;
717         ep->rep_attr.cap.max_inline_data = 0;
718         ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
719         ep->rep_attr.qp_type = IB_QPT_RC;
720         ep->rep_attr.port_num = ~0;
721
722         dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
723                 "iovs: send %d recv %d\n",
724                 __func__,
725                 ep->rep_attr.cap.max_send_wr,
726                 ep->rep_attr.cap.max_recv_wr,
727                 ep->rep_attr.cap.max_send_sge,
728                 ep->rep_attr.cap.max_recv_sge);
729
730         /* set trigger for requesting send completion */
731         ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /*  - 1*/;
732         switch (ia->ri_memreg_strategy) {
733         case RPCRDMA_MEMWINDOWS_ASYNC:
734         case RPCRDMA_MEMWINDOWS:
735                 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
736                 break;
737         default:
738                 break;
739         }
740         if (ep->rep_cqinit <= 2)
741                 ep->rep_cqinit = 0;
742         INIT_CQCOUNT(ep);
743         ep->rep_ia = ia;
744         init_waitqueue_head(&ep->rep_connect_wait);
745         INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
746
747         /*
748          * Create a single cq for receive dto and mw_bind (only ever
749          * care about unbind, really). Send completions are suppressed.
750          * Use single threaded tasklet upcalls to maintain ordering.
751          */
752         ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
753                                   rpcrdma_cq_async_error_upcall, NULL,
754                                   ep->rep_attr.cap.max_recv_wr +
755                                   ep->rep_attr.cap.max_send_wr + 1, 0);
756         if (IS_ERR(ep->rep_cq)) {
757                 rc = PTR_ERR(ep->rep_cq);
758                 dprintk("RPC:       %s: ib_create_cq failed: %i\n",
759                         __func__, rc);
760                 goto out1;
761         }
762
763         rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
764         if (rc) {
765                 dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
766                         __func__, rc);
767                 goto out2;
768         }
769
770         ep->rep_attr.send_cq = ep->rep_cq;
771         ep->rep_attr.recv_cq = ep->rep_cq;
772
773         /* Initialize cma parameters */
774
775         /* RPC/RDMA does not use private data */
776         ep->rep_remote_cma.private_data = NULL;
777         ep->rep_remote_cma.private_data_len = 0;
778
779         /* Client offers RDMA Read but does not initiate */
780         ep->rep_remote_cma.initiator_depth = 0;
781         if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
782                 ep->rep_remote_cma.responder_resources = 0;
783         else if (devattr.max_qp_rd_atom > 32)   /* arbitrary but <= 255 */
784                 ep->rep_remote_cma.responder_resources = 32;
785         else
786                 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
787
788         ep->rep_remote_cma.retry_count = 7;
789         ep->rep_remote_cma.flow_control = 0;
790         ep->rep_remote_cma.rnr_retry_count = 0;
791
792         return 0;
793
794 out2:
795         err = ib_destroy_cq(ep->rep_cq);
796         if (err)
797                 dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
798                         __func__, err);
799 out1:
800         return rc;
801 }
802
803 /*
804  * rpcrdma_ep_destroy
805  *
806  * Disconnect and destroy endpoint. After this, the only
807  * valid operations on the ep are to free it (if dynamically
808  * allocated) or re-create it.
809  *
810  * The caller's error handling must be sure to not leak the endpoint
811  * if this function fails.
812  */
813 int
814 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
815 {
816         int rc;
817
818         dprintk("RPC:       %s: entering, connected is %d\n",
819                 __func__, ep->rep_connected);
820
821         cancel_delayed_work_sync(&ep->rep_connect_worker);
822
823         if (ia->ri_id->qp) {
824                 rc = rpcrdma_ep_disconnect(ep, ia);
825                 if (rc)
826                         dprintk("RPC:       %s: rpcrdma_ep_disconnect"
827                                 " returned %i\n", __func__, rc);
828                 rdma_destroy_qp(ia->ri_id);
829                 ia->ri_id->qp = NULL;
830         }
831
832         /* padding - could be done in rpcrdma_buffer_destroy... */
833         if (ep->rep_pad_mr) {
834                 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
835                 ep->rep_pad_mr = NULL;
836         }
837
838         rpcrdma_clean_cq(ep->rep_cq);
839         rc = ib_destroy_cq(ep->rep_cq);
840         if (rc)
841                 dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
842                         __func__, rc);
843
844         return rc;
845 }
846
847 /*
848  * Connect unconnected endpoint.
849  */
850 int
851 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
852 {
853         struct rdma_cm_id *id;
854         int rc = 0;
855         int retry_count = 0;
856
857         if (ep->rep_connected != 0) {
858                 struct rpcrdma_xprt *xprt;
859 retry:
860                 rc = rpcrdma_ep_disconnect(ep, ia);
861                 if (rc && rc != -ENOTCONN)
862                         dprintk("RPC:       %s: rpcrdma_ep_disconnect"
863                                 " status %i\n", __func__, rc);
864                 rpcrdma_clean_cq(ep->rep_cq);
865
866                 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
867                 id = rpcrdma_create_id(xprt, ia,
868                                 (struct sockaddr *)&xprt->rx_data.addr);
869                 if (IS_ERR(id)) {
870                         rc = PTR_ERR(id);
871                         goto out;
872                 }
873                 /* TEMP TEMP TEMP - fail if new device:
874                  * Deregister/remarshal *all* requests!
875                  * Close and recreate adapter, pd, etc!
876                  * Re-determine all attributes still sane!
877                  * More stuff I haven't thought of!
878                  * Rrrgh!
879                  */
880                 if (ia->ri_id->device != id->device) {
881                         printk("RPC:       %s: can't reconnect on "
882                                 "different device!\n", __func__);
883                         rdma_destroy_id(id);
884                         rc = -ENETDOWN;
885                         goto out;
886                 }
887                 /* END TEMP */
888                 rdma_destroy_qp(ia->ri_id);
889                 rdma_destroy_id(ia->ri_id);
890                 ia->ri_id = id;
891         }
892
893         rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
894         if (rc) {
895                 dprintk("RPC:       %s: rdma_create_qp failed %i\n",
896                         __func__, rc);
897                 goto out;
898         }
899
900         /* XXX Tavor device performs badly with 2K MTU! */
901         if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
902                 struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
903                 if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
904                     (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
905                      pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
906                         struct ib_qp_attr attr = {
907                                 .path_mtu = IB_MTU_1024
908                         };
909                         rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
910                 }
911         }
912
913         ep->rep_connected = 0;
914
915         rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
916         if (rc) {
917                 dprintk("RPC:       %s: rdma_connect() failed with %i\n",
918                                 __func__, rc);
919                 goto out;
920         }
921
922         wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
923
924         /*
925          * Check state. A non-peer reject indicates no listener
926          * (ECONNREFUSED), which may be a transient state. All
927  * others indicate a transport condition that has already
928  * undergone a best-effort connection attempt.
929          */
930         if (ep->rep_connected == -ECONNREFUSED &&
931             ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
932                 dprintk("RPC:       %s: non-peer_reject, retry\n", __func__);
933                 goto retry;
934         }
935         if (ep->rep_connected <= 0) {
936                 /* Sometimes, the only way to reliably connect to remote
937                  * CMs is to use the same nonzero values for ORD and IRD. */
938                 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
939                     (ep->rep_remote_cma.responder_resources == 0 ||
940                      ep->rep_remote_cma.initiator_depth !=
941                                 ep->rep_remote_cma.responder_resources)) {
942                         if (ep->rep_remote_cma.responder_resources == 0)
943                                 ep->rep_remote_cma.responder_resources = 1;
944                         ep->rep_remote_cma.initiator_depth =
945                                 ep->rep_remote_cma.responder_resources;
946                         goto retry;
947                 }
948                 rc = ep->rep_connected;
949         } else {
950                 dprintk("RPC:       %s: connected\n", __func__);
951         }
952
953 out:
954         if (rc)
955                 ep->rep_connected = rc;
956         return rc;
957 }
958
959 /*
960  * rpcrdma_ep_disconnect
961  *
962  * This is separate from destroy to facilitate the ability
963  * to reconnect without recreating the endpoint.
964  *
965  * This call is not reentrant, and must not be made in parallel
966  * on the same endpoint.
967  */
968 int
969 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
970 {
971         int rc;
972
973         rpcrdma_clean_cq(ep->rep_cq);
974         rc = rdma_disconnect(ia->ri_id);
975         if (!rc) {
976                 /* returns without wait if not connected */
977                 wait_event_interruptible(ep->rep_connect_wait,
978                                                         ep->rep_connected != 1);
979                 dprintk("RPC:       %s: after wait, %sconnected\n", __func__,
980                         (ep->rep_connected == 1) ? "still " : "dis");
981         } else {
982                 dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
983                 ep->rep_connected = rc;
984         }
985         return rc;
986 }
987
988 /*
989  * Initialize buffer memory
990  */
991 int
992 rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
993         struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
994 {
995         char *p;
996         size_t len;
997         int i, rc;
998         struct rpcrdma_mw *r;
999
1000         buf->rb_max_requests = cdata->max_requests;
1001         spin_lock_init(&buf->rb_lock);
1002         atomic_set(&buf->rb_credits, 1);
1003
1004         /* Need to allocate:
1005          *   1.  arrays for send and recv pointers
1006          *   2.  arrays of struct rpcrdma_req to fill in pointers
1007          *   3.  array of struct rpcrdma_rep for replies
1008          *   4.  padding, if any
1009          *   5.  mw's, fmr's or frmr's, if any
1010          * Send/recv buffers in req/rep need to be registered
1011          */
1012
1013         len = buf->rb_max_requests *
1014                 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1015         len += cdata->padding;
1016         switch (ia->ri_memreg_strategy) {
1017         case RPCRDMA_FRMR:
1018                 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
1019                                 sizeof(struct rpcrdma_mw);
1020                 break;
1021         case RPCRDMA_MTHCAFMR:
1022                 /* TBD we are perhaps overallocating here */
1023                 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
1024                                 sizeof(struct rpcrdma_mw);
1025                 break;
1026         case RPCRDMA_MEMWINDOWS_ASYNC:
1027         case RPCRDMA_MEMWINDOWS:
1028                 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
1029                                 sizeof(struct rpcrdma_mw);
1030                 break;
1031         default:
1032                 break;
1033         }
1034
1035         /* allocate 1, 4 and 5 in one shot */
1036         p = kzalloc(len, GFP_KERNEL);
1037         if (p == NULL) {
1038                 dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1039                         __func__, len);
1040                 rc = -ENOMEM;
1041                 goto out;
1042         }
1043         buf->rb_pool = p;       /* for freeing it later */
1044
1045         buf->rb_send_bufs = (struct rpcrdma_req **) p;
1046         p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1047         buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1048         p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1049
1050         /*
1051          * Register the zeroed pad buffer, if any.
1052          */
1053         if (cdata->padding) {
1054                 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1055                                             &ep->rep_pad_mr, &ep->rep_pad);
1056                 if (rc)
1057                         goto out;
1058         }
1059         p += cdata->padding;
1060
1061         /*
1062          * Allocate the fmr's, or mw's for mw_bind chunk registration.
1063          * We "cycle" the mw's in order to minimize rkey reuse,
1064          * and also reduce unbind-to-bind collision.
1065          */
1066         INIT_LIST_HEAD(&buf->rb_mws);
1067         r = (struct rpcrdma_mw *)p;
1068         switch (ia->ri_memreg_strategy) {
1069         case RPCRDMA_FRMR:
1070                 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1071                         r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1072                                                 ia->ri_max_frmr_depth);
1073                         if (IS_ERR(r->r.frmr.fr_mr)) {
1074                                 rc = PTR_ERR(r->r.frmr.fr_mr);
1075                                 dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
1076                                         " failed %i\n", __func__, rc);
1077                                 goto out;
1078                         }
1079                         r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1080                                                 ia->ri_id->device,
1081                                                 ia->ri_max_frmr_depth);
1082                         if (IS_ERR(r->r.frmr.fr_pgl)) {
1083                                 rc = PTR_ERR(r->r.frmr.fr_pgl);
1084                                 dprintk("RPC:       %s: "
1085                                         "ib_alloc_fast_reg_page_list "
1086                                         "failed %i\n", __func__, rc);
1087
1088                                 ib_dereg_mr(r->r.frmr.fr_mr);
1089                                 goto out;
1090                         }
1091                         list_add(&r->mw_list, &buf->rb_mws);
1092                         ++r;
1093                 }
1094                 break;
1095         case RPCRDMA_MTHCAFMR:
1096                 /* TBD we are perhaps overallocating here */
1097                 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1098                         static struct ib_fmr_attr fa =
1099                                 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
1100                         r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1101                                 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1102                                 &fa);
1103                         if (IS_ERR(r->r.fmr)) {
1104                                 rc = PTR_ERR(r->r.fmr);
1105                                 dprintk("RPC:       %s: ib_alloc_fmr"
1106                                         " failed %i\n", __func__, rc);
1107                                 goto out;
1108                         }
1109                         list_add(&r->mw_list, &buf->rb_mws);
1110                         ++r;
1111                 }
1112                 break;
1113         case RPCRDMA_MEMWINDOWS_ASYNC:
1114         case RPCRDMA_MEMWINDOWS:
1115                 /* Allocate one extra request's worth, for full cycling */
1116                 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1117                         r->r.mw = ib_alloc_mw(ia->ri_pd, IB_MW_TYPE_1);
1118                         if (IS_ERR(r->r.mw)) {
1119                                 rc = PTR_ERR(r->r.mw);
1120                                 dprintk("RPC:       %s: ib_alloc_mw"
1121                                         " failed %i\n", __func__, rc);
1122                                 goto out;
1123                         }
1124                         list_add(&r->mw_list, &buf->rb_mws);
1125                         ++r;
1126                 }
1127                 break;
1128         default:
1129                 break;
1130         }
1131
1132         /*
1133          * Allocate/init the request/reply buffers. Doing this
1134          * using kmalloc for now -- one for each buf.
1135          */
1136         for (i = 0; i < buf->rb_max_requests; i++) {
1137                 struct rpcrdma_req *req;
1138                 struct rpcrdma_rep *rep;
1139
1140                 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
1141                 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1142                 /* Typical ~2400b, so rounding up saves work later */
1143                 if (len < 4096)
1144                         len = 4096;
1145                 req = kmalloc(len, GFP_KERNEL);
1146                 if (req == NULL) {
1147                         dprintk("RPC:       %s: request buffer %d alloc"
1148                                 " failed\n", __func__, i);
1149                         rc = -ENOMEM;
1150                         goto out;
1151                 }
1152                 memset(req, 0, sizeof(struct rpcrdma_req));
1153                 buf->rb_send_bufs[i] = req;
1154                 buf->rb_send_bufs[i]->rl_buffer = buf;
1155
1156                 rc = rpcrdma_register_internal(ia, req->rl_base,
1157                                 len - offsetof(struct rpcrdma_req, rl_base),
1158                                 &buf->rb_send_bufs[i]->rl_handle,
1159                                 &buf->rb_send_bufs[i]->rl_iov);
1160                 if (rc)
1161                         goto out;
1162
1163                 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1164
1165                 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1166                 rep = kmalloc(len, GFP_KERNEL);
1167                 if (rep == NULL) {
1168                         dprintk("RPC:       %s: reply buffer %d alloc failed\n",
1169                                 __func__, i);
1170                         rc = -ENOMEM;
1171                         goto out;
1172                 }
1173                 memset(rep, 0, sizeof(struct rpcrdma_rep));
1174                 buf->rb_recv_bufs[i] = rep;
1175                 buf->rb_recv_bufs[i]->rr_buffer = buf;
1176                 init_waitqueue_head(&rep->rr_unbind);
1177
1178                 rc = rpcrdma_register_internal(ia, rep->rr_base,
1179                                 len - offsetof(struct rpcrdma_rep, rr_base),
1180                                 &buf->rb_recv_bufs[i]->rr_handle,
1181                                 &buf->rb_recv_bufs[i]->rr_iov);
1182                 if (rc)
1183                         goto out;
1184
1185         }
1186         dprintk("RPC:       %s: max_requests %d\n",
1187                 __func__, buf->rb_max_requests);
1188         /* done */
1189         return 0;
1190 out:
1191         rpcrdma_buffer_destroy(buf);
1192         return rc;
1193 }
1194
1195 /*
1196  * Unregister and destroy buffer memory. Need to deal with
1197  * partial initialization, so it's callable from failed create.
1198  * Must be called before destroying endpoint, as registrations
1199  * reference it.
1200  */
1201 void
1202 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1203 {
1204         int rc, i;
1205         struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1206         struct rpcrdma_mw *r;
1207
1208         /* clean up in reverse order from create
1209          *   1.  recv mr memory (mr free, then kfree)
1210          *   1a. bind mw memory
1211          *   2.  send mr memory (mr free, then kfree)
1212          *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
1213          *   4.  arrays
1214          */
1215         dprintk("RPC:       %s: entering\n", __func__);
1216
1217         for (i = 0; i < buf->rb_max_requests; i++) {
1218                 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1219                         rpcrdma_deregister_internal(ia,
1220                                         buf->rb_recv_bufs[i]->rr_handle,
1221                                         &buf->rb_recv_bufs[i]->rr_iov);
1222                         kfree(buf->rb_recv_bufs[i]);
1223                 }
1224                 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1225                         rpcrdma_deregister_internal(ia,
1226                                         buf->rb_send_bufs[i]->rl_handle,
1227                                         &buf->rb_send_bufs[i]->rl_iov);
1228                         kfree(buf->rb_send_bufs[i]);
1229                 }
1230         }
1231
1232         while (!list_empty(&buf->rb_mws)) {
1233                 r = list_entry(buf->rb_mws.next,
1234                         struct rpcrdma_mw, mw_list);
1235                 list_del(&r->mw_list);
1236                 switch (ia->ri_memreg_strategy) {
1237                 case RPCRDMA_FRMR:
1238                         rc = ib_dereg_mr(r->r.frmr.fr_mr);
1239                         if (rc)
1240                                 dprintk("RPC:       %s:"
1241                                         " ib_dereg_mr"
1242                                         " failed %i\n",
1243                                         __func__, rc);
1244                         ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1245                         break;
1246                 case RPCRDMA_MTHCAFMR:
1247                         rc = ib_dealloc_fmr(r->r.fmr);
1248                         if (rc)
1249                                 dprintk("RPC:       %s:"
1250                                         " ib_dealloc_fmr"
1251                                         " failed %i\n",
1252                                         __func__, rc);
1253                         break;
1254                 case RPCRDMA_MEMWINDOWS_ASYNC:
1255                 case RPCRDMA_MEMWINDOWS:
1256                         rc = ib_dealloc_mw(r->r.mw);
1257                         if (rc)
1258                                 dprintk("RPC:       %s:"
1259                                         " ib_dealloc_mw"
1260                                         " failed %i\n",
1261                                         __func__, rc);
1262                         break;
1263                 default:
1264                         break;
1265                 }
1266         }
1267
1268         kfree(buf->rb_pool);
1269 }
1270
1271 /*
1272  * Get a set of request/reply buffers.
1273  *
1274  * Reply buffer (if needed) is attached to send buffer upon return.
1275  * Rule:
1276  *    rb_send_index and rb_recv_index MUST always be pointing to the
1277  *    *next* available buffer (non-NULL). They are incremented after
1278  *    removing buffers, and decremented *before* returning them.
1279  */
1280 struct rpcrdma_req *
1281 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1282 {
1283         struct rpcrdma_req *req;
1284         unsigned long flags;
1285         int i;
1286         struct rpcrdma_mw *r;
1287
1288         spin_lock_irqsave(&buffers->rb_lock, flags);
1289         if (buffers->rb_send_index == buffers->rb_max_requests) {
1290                 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1291                 dprintk("RPC:       %s: out of request buffers\n", __func__);
1292                 return NULL;
1293         }
1294
1295         req = buffers->rb_send_bufs[buffers->rb_send_index];
1296         if (buffers->rb_send_index < buffers->rb_recv_index) {
1297                 dprintk("RPC:       %s: %d extra receives outstanding (ok)\n",
1298                         __func__,
1299                         buffers->rb_recv_index - buffers->rb_send_index);
1300                 req->rl_reply = NULL;
1301         } else {
1302                 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1303                 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1304         }
1305         buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
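        /* Detach one MW from the pool for each possible segment and stash
         * it in the request; rpcrdma_buffer_put() later cycles them back
         * to the tail of rb_mws to delay and scramble rkey reuse.
         */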
1306         if (!list_empty(&buffers->rb_mws)) {
1307                 i = RPCRDMA_MAX_SEGS - 1;
1308                 do {
1309                         r = list_entry(buffers->rb_mws.next,
1310                                         struct rpcrdma_mw, mw_list);
1311                         list_del(&r->mw_list);
1312                         req->rl_segments[i].mr_chunk.rl_mw = r;
1313                 } while (--i >= 0);
1314         }
1315         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1316         return req;
1317 }
1318
1319 /*
1320  * Put request/reply buffers back into pool.
1321  * Pre-decrement counter/array index.
1322  */
1323 void
1324 rpcrdma_buffer_put(struct rpcrdma_req *req)
1325 {
1326         struct rpcrdma_buffer *buffers = req->rl_buffer;
1327         struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1328         int i;
1329         unsigned long flags;
1330
1331         BUG_ON(req->rl_nchunks != 0);
1332         spin_lock_irqsave(&buffers->rb_lock, flags);
1333         buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1334         req->rl_niovs = 0;
1335         if (req->rl_reply) {
1336                 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1337                 init_waitqueue_head(&req->rl_reply->rr_unbind);
1338                 req->rl_reply->rr_func = NULL;
1339                 req->rl_reply = NULL;
1340         }
1341         switch (ia->ri_memreg_strategy) {
1342         case RPCRDMA_FRMR:
1343         case RPCRDMA_MTHCAFMR:
1344         case RPCRDMA_MEMWINDOWS_ASYNC:
1345         case RPCRDMA_MEMWINDOWS:
1346                 /*
1347                  * Cycle mw's back in reverse order, and "spin" them.
1348                  * This delays and scrambles reuse as much as possible.
1349                  */
1350                 i = 1;
1351                 do {
1352                         struct rpcrdma_mw **mw;
1353                         mw = &req->rl_segments[i].mr_chunk.rl_mw;
1354                         list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1355                         *mw = NULL;
1356                 } while (++i < RPCRDMA_MAX_SEGS);
1357                 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1358                                         &buffers->rb_mws);
1359                 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1360                 break;
1361         default:
1362                 break;
1363         }
1364         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1365 }
1366
1367 /*
1368  * Recover reply buffers from pool.
1369  * This happens when recovering from error conditions.
1370  * Post-increment counter/array index.
1371  */
1372 void
1373 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1374 {
1375         struct rpcrdma_buffer *buffers = req->rl_buffer;
1376         unsigned long flags;
1377
1378         if (req->rl_iov.length == 0)    /* special case xprt_rdma_allocate() */
1379                 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1380         spin_lock_irqsave(&buffers->rb_lock, flags);
1381         if (buffers->rb_recv_index < buffers->rb_max_requests) {
1382                 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1383                 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1384         }
1385         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1386 }
1387
1388 /*
1389  * Put reply buffers back into pool when not attached to
1390  * request. This happens in error conditions, and when
1391  * aborting unbinds. Pre-decrement counter/array index.
1392  */
1393 void
1394 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1395 {
1396         struct rpcrdma_buffer *buffers = rep->rr_buffer;
1397         unsigned long flags;
1398
1399         rep->rr_func = NULL;
1400         spin_lock_irqsave(&buffers->rb_lock, flags);
1401         buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1402         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1403 }
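
/*
 * Editor's note: rpcrdma_recv_buffer_get()/rpcrdma_recv_buffer_put()
 * serve the error and unbind paths only.  In the normal I/O path the
 * reply buffer travels with its request (req->rl_reply) and is
 * returned to the pool by rpcrdma_buffer_put() above.
 */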
1404
1405 /*
1406  * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1407  */
1408
1409 int
1410 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1411                                 struct ib_mr **mrp, struct ib_sge *iov)
1412 {
1413         struct ib_phys_buf ipb;
1414         struct ib_mr *mr;
1415         int rc;
1416
1417         /*
1418          * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1419          */
1420         iov->addr = ib_dma_map_single(ia->ri_id->device,
1421                         va, len, DMA_BIDIRECTIONAL);
1422         iov->length = len;
1423
1424         if (ia->ri_have_dma_lkey) {
1425                 *mrp = NULL;
1426                 iov->lkey = ia->ri_dma_lkey;
1427                 return 0;
1428         } else if (ia->ri_bind_mem != NULL) {
1429                 *mrp = NULL;
1430                 iov->lkey = ia->ri_bind_mem->lkey;
1431                 return 0;
1432         }
1433
1434         ipb.addr = iov->addr;
1435         ipb.size = iov->length;
1436         mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1437                         IB_ACCESS_LOCAL_WRITE, &iov->addr);
1438
1439         dprintk("RPC:       %s: phys convert: 0x%llx "
1440                         "registered 0x%llx length %d\n",
1441                         __func__, (unsigned long long)ipb.addr,
1442                         (unsigned long long)iov->addr, len);
1443
1444         if (IS_ERR(mr)) {
1445                 *mrp = NULL;
1446                 rc = PTR_ERR(mr);
1447                 dprintk("RPC:       %s: failed with %i\n", __func__, rc);
1448         } else {
1449                 *mrp = mr;
1450                 iov->lkey = mr->lkey;
1451                 rc = 0;
1452         }
1453
1454         return rc;
1455 }
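
/*
 * Editor's sketch of the typical calling pattern (assumes a caller that
 * owns a kmalloc'ed buffer "p" of length "len"; not lifted verbatim
 * from the transport):
 *
 *	struct ib_mr *mr;
 *	struct ib_sge iov;
 *	int rc;
 *
 *	rc = rpcrdma_register_internal(ia, p, len, &mr, &iov);
 *	if (rc)
 *		(bail out -- the buffer is not usable for RDMA)
 *	(use iov as the sge for sends/receives, then)
 *	rpcrdma_deregister_internal(ia, mr, &iov);
 *
 * When the device supplies a global DMA lkey, or an all-memory
 * ri_bind_mem MR exists, *mrp comes back NULL and only the DMA mapping
 * needs undoing; rpcrdma_deregister_internal() below handles that by
 * checking mr for NULL.
 */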
1456
1457 int
1458 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1459                                 struct ib_mr *mr, struct ib_sge *iov)
1460 {
1461         int rc;
1462
1463         ib_dma_unmap_single(ia->ri_id->device,
1464                         iov->addr, iov->length, DMA_BIDIRECTIONAL);
1465
1466         if (mr == NULL)
1467                 return 0;
1468
1469         rc = ib_dereg_mr(mr);
1470         if (rc)
1471                 dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
1472         return rc;
1473 }
1474
1475 /*
1476  * Wrappers for chunk registration, shared by read/write chunk code.
1477  */
1478
1479 static void
1480 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1481 {
1482         seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1483         seg->mr_dmalen = seg->mr_len;
1484         if (seg->mr_page)
1485                 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1486                                 seg->mr_page, offset_in_page(seg->mr_offset),
1487                                 seg->mr_dmalen, seg->mr_dir);
1488         else
1489                 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1490                                 seg->mr_offset,
1491                                 seg->mr_dmalen, seg->mr_dir);
1492         if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1493                 dprintk("RPC:       %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1494                         __func__,
1495                         (unsigned long long)seg->mr_dma,
1496                         seg->mr_offset, seg->mr_dmalen);
1497         }
1498 }
1499
1500 static void
1501 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1502 {
1503         if (seg->mr_page)
1504                 ib_dma_unmap_page(ia->ri_id->device,
1505                                 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1506         else
1507                 ib_dma_unmap_single(ia->ri_id->device,
1508                                 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1509 }
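
/*
 * Editor's note: rpcrdma_map_one()/rpcrdma_unmap_one() perform only the
 * DMA mapping of one rpcrdma_mr_seg; the per-strategy helpers below add
 * the actual RDMA registration (FRMR, FMR, memory window or phys MR) on
 * top of that mapping.  The DMA direction follows the caller's
 * "writing" flag: DMA_FROM_DEVICE for data the peer will RDMA Write
 * into this memory, DMA_TO_DEVICE for data it will RDMA Read out of it.
 */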
1510
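/*
 * Register an external chunk via FRMR: DMA-map up to ri_max_frmr_depth
 * segments, load their addresses into the fr_pgl page list, bump the
 * MR's rkey, and post an IB_WR_FAST_REG_MR work request -- preceded by
 * IB_WR_LOCAL_INV when the previous registration is still valid.  On
 * return *nsegs holds the number of segments actually covered.
 */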
1511 static int
1512 rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1513                         int *nsegs, int writing, struct rpcrdma_ia *ia,
1514                         struct rpcrdma_xprt *r_xprt)
1515 {
1516         struct rpcrdma_mr_seg *seg1 = seg;
1517         struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
1518
1519         u8 key;
1520         int len, pageoff;
1521         int i, rc;
1522         int seg_len;
1523         u64 pa;
1524         int page_no;
1525
1526         pageoff = offset_in_page(seg1->mr_offset);
1527         seg1->mr_offset -= pageoff;     /* start of page */
1528         seg1->mr_len += pageoff;
1529         len = -pageoff;
1530         if (*nsegs > ia->ri_max_frmr_depth)
1531                 *nsegs = ia->ri_max_frmr_depth;
1532         for (page_no = i = 0; i < *nsegs;) {
1533                 rpcrdma_map_one(ia, seg, writing);
1534                 pa = seg->mr_dma;
1535                 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
1536                         seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->
1537                                 page_list[page_no++] = pa;
1538                         pa += PAGE_SIZE;
1539                 }
1540                 len += seg->mr_len;
1541                 ++seg;
1542                 ++i;
1543                 /* Check for holes */
1544                 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1545                     offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1546                         break;
1547         }
1548         dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
1549                 __func__, seg1->mr_chunk.rl_mw, i);
1550
1551         if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
1552                 dprintk("RPC:       %s: frmr %x left valid, posting invalidate.\n",
1553                         __func__,
1554                         seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
1555                 /* Invalidate before using. */
1556                 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1557                 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1558                 invalidate_wr.next = &frmr_wr;
1559                 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1560                 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1561                 invalidate_wr.ex.invalidate_rkey =
1562                         seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1563                 DECR_CQCOUNT(&r_xprt->rx_ep);
1564                 post_wr = &invalidate_wr;
1565         } else
1566                 post_wr = &frmr_wr;
1567
1568         /* Bump the key */
1569         key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1570         ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1571
1572         /* Prepare FRMR WR */
1573         memset(&frmr_wr, 0, sizeof frmr_wr);
1574         frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1575         frmr_wr.opcode = IB_WR_FAST_REG_MR;
1576         frmr_wr.send_flags = IB_SEND_SIGNALED;
1577         frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1578         frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1579         frmr_wr.wr.fast_reg.page_list_len = page_no;
1580         frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1581         frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
1582         BUG_ON(frmr_wr.wr.fast_reg.length < len);
1583         frmr_wr.wr.fast_reg.access_flags = (writing ?
1584                                 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1585                                 IB_ACCESS_REMOTE_READ);
1586         frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1587         DECR_CQCOUNT(&r_xprt->rx_ep);
1588
1589         rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
1590
1591         if (rc) {
1592                 dprintk("RPC:       %s: failed ib_post_send for register,"
1593                         " status %i\n", __func__, rc);
1594                 while (i--)
1595                         rpcrdma_unmap_one(ia, --seg);
1596         } else {
1597                 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1598                 seg1->mr_base = seg1->mr_dma + pageoff;
1599                 seg1->mr_nsegs = i;
1600                 seg1->mr_len = len;
1601         }
1602         *nsegs = i;
1603         return rc;
1604 }
1605
1606 static int
1607 rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1608                         struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1609 {
1610         struct rpcrdma_mr_seg *seg1 = seg;
1611         struct ib_send_wr invalidate_wr, *bad_wr;
1612         int rc;
1613
1614         while (seg1->mr_nsegs--)
1615                 rpcrdma_unmap_one(ia, seg++);
1616
1617         memset(&invalidate_wr, 0, sizeof invalidate_wr);
1618         invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1619         invalidate_wr.opcode = IB_WR_LOCAL_INV;
1620         invalidate_wr.send_flags = IB_SEND_SIGNALED;
1621         invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1622         DECR_CQCOUNT(&r_xprt->rx_ep);
1623
1624         rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1625         if (rc)
1626                 dprintk("RPC:       %s: failed ib_post_send for invalidate,"
1627                         " status %i\n", __func__, rc);
1628         return rc;
1629 }
1630
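/*
 * Register an external chunk with an FMR: DMA-map up to
 * RPCRDMA_MAX_DATA_SEGS segments, gather their DMA addresses into
 * physaddrs[] and hand them to ib_map_phys_fmr().  The gather loop
 * stops at the first non-page-aligned "hole", since the FMR can only
 * describe a run of whole pages.
 */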
1631 static int
1632 rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1633                         int *nsegs, int writing, struct rpcrdma_ia *ia)
1634 {
1635         struct rpcrdma_mr_seg *seg1 = seg;
1636         u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1637         int len, pageoff, i, rc;
1638
1639         pageoff = offset_in_page(seg1->mr_offset);
1640         seg1->mr_offset -= pageoff;     /* start of page */
1641         seg1->mr_len += pageoff;
1642         len = -pageoff;
1643         if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1644                 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1645         for (i = 0; i < *nsegs;) {
1646                 rpcrdma_map_one(ia, seg, writing);
1647                 physaddrs[i] = seg->mr_dma;
1648                 len += seg->mr_len;
1649                 ++seg;
1650                 ++i;
1651                 /* Check for holes */
1652                 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1653                     offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1654                         break;
1655         }
1656         rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1657                                 physaddrs, i, seg1->mr_dma);
1658         if (rc) {
1659                 dprintk("RPC:       %s: failed ib_map_phys_fmr "
1660                         "%u@0x%llx+%i (%d)... status %i\n", __func__,
1661                         len, (unsigned long long)seg1->mr_dma,
1662                         pageoff, i, rc);
1663                 while (i--)
1664                         rpcrdma_unmap_one(ia, --seg);
1665         } else {
1666                 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1667                 seg1->mr_base = seg1->mr_dma + pageoff;
1668                 seg1->mr_nsegs = i;
1669                 seg1->mr_len = len;
1670         }
1671         *nsegs = i;
1672         return rc;
1673 }
1674
1675 static int
1676 rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1677                         struct rpcrdma_ia *ia)
1678 {
1679         struct rpcrdma_mr_seg *seg1 = seg;
1680         LIST_HEAD(l);
1681         int rc;
1682
1683         list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1684         rc = ib_unmap_fmr(&l);
1685         while (seg1->mr_nsegs--)
1686                 rpcrdma_unmap_one(ia, seg++);
1687         if (rc)
1688                 dprintk("RPC:       %s: failed ib_unmap_fmr,"
1689                         " status %i\n", __func__, rc);
1690         return rc;
1691 }
1692
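/*
 * Register an external chunk with a memory window: bind a single
 * DMA-mapped segment beneath the transport's ri_bind_mem MR with
 * ib_bind_mw().  A window covers exactly one segment, so *nsegs is
 * forced to 1.
 */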
1693 static int
1694 rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1695                         int *nsegs, int writing, struct rpcrdma_ia *ia,
1696                         struct rpcrdma_xprt *r_xprt)
1697 {
1698         int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1699                                   IB_ACCESS_REMOTE_READ);
1700         struct ib_mw_bind param;
1701         int rc;
1702
1703         *nsegs = 1;
1704         rpcrdma_map_one(ia, seg, writing);
1705         param.bind_info.mr = ia->ri_bind_mem;
1706         param.wr_id = 0ULL;     /* no send cookie */
1707         param.bind_info.addr = seg->mr_dma;
1708         param.bind_info.length = seg->mr_len;
1709         param.send_flags = 0;
1710         param.bind_info.mw_access_flags = mem_priv;
1711
1712         DECR_CQCOUNT(&r_xprt->rx_ep);
1713         rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1714         if (rc) {
1715                 dprintk("RPC:       %s: failed ib_bind_mw "
1716                         "%u@0x%llx status %i\n",
1717                         __func__, seg->mr_len,
1718                         (unsigned long long)seg->mr_dma, rc);
1719                 rpcrdma_unmap_one(ia, seg);
1720         } else {
1721                 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1722                 seg->mr_base = param.bind_info.addr;
1723                 seg->mr_nsegs = 1;
1724         }
1725         return rc;
1726 }
1727
1728 static int
1729 rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1730                         struct rpcrdma_ia *ia,
1731                         struct rpcrdma_xprt *r_xprt, void **r)
1732 {
1733         struct ib_mw_bind param;
1734         LIST_HEAD(l);
1735         int rc;
1736
1737         BUG_ON(seg->mr_nsegs != 1);
1738         param.bind_info.mr = ia->ri_bind_mem;
1739         param.bind_info.addr = 0ULL;    /* unbind */
1740         param.bind_info.length = 0;
1741         param.bind_info.mw_access_flags = 0;
1742         if (*r) {
1743                 param.wr_id = (u64) (unsigned long) *r;
1744                 param.send_flags = IB_SEND_SIGNALED;
1745                 INIT_CQCOUNT(&r_xprt->rx_ep);
1746         } else {
1747                 param.wr_id = 0ULL;
1748                 param.send_flags = 0;
1749                 DECR_CQCOUNT(&r_xprt->rx_ep);
1750         }
1751         rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1752         rpcrdma_unmap_one(ia, seg);
1753         if (rc)
1754                 dprintk("RPC:       %s: failed ib_(un)bind_mw,"
1755                         " status %i\n", __func__, rc);
1756         else
1757                 *r = NULL;      /* will upcall on completion */
1758         return rc;
1759 }
1760
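/*
 * Default registration: DMA-map the segments and register them with
 * ib_reg_phys_mr() on every call.  This works on any device but pays
 * the cost of a full synchronous verbs registration per chunk.
 */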
1761 static int
1762 rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
1763                         int *nsegs, int writing, struct rpcrdma_ia *ia)
1764 {
1765         int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1766                                   IB_ACCESS_REMOTE_READ);
1767         struct rpcrdma_mr_seg *seg1 = seg;
1768         struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1769         int len, i, rc = 0;
1770
1771         if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1772                 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1773         for (len = 0, i = 0; i < *nsegs;) {
1774                 rpcrdma_map_one(ia, seg, writing);
1775                 ipb[i].addr = seg->mr_dma;
1776                 ipb[i].size = seg->mr_len;
1777                 len += seg->mr_len;
1778                 ++seg;
1779                 ++i;
1780                 /* Check for holes */
1781                 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1782                     offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1783                         break;
1784         }
1785         seg1->mr_base = seg1->mr_dma;
1786         seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1787                                 ipb, i, mem_priv, &seg1->mr_base);
1788         if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1789                 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1790                 dprintk("RPC:       %s: failed ib_reg_phys_mr "
1791                         "%u@0x%llx (%d)... status %i\n",
1792                         __func__, len,
1793                         (unsigned long long)seg1->mr_dma, i, rc);
1794                 while (i--)
1795                         rpcrdma_unmap_one(ia, --seg);
1796         } else {
1797                 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1798                 seg1->mr_nsegs = i;
1799                 seg1->mr_len = len;
1800         }
1801         *nsegs = i;
1802         return rc;
1803 }
1804
1805 static int
1806 rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1807                         struct rpcrdma_ia *ia)
1808 {
1809         struct rpcrdma_mr_seg *seg1 = seg;
1810         int rc;
1811
1812         rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1813         seg1->mr_chunk.rl_mr = NULL;
1814         while (seg1->mr_nsegs--)
1815                 rpcrdma_unmap_one(ia, seg++);
1816         if (rc)
1817                 dprintk("RPC:       %s: failed ib_dereg_mr,"
1818                         " status %i\n", __func__, rc);
1819         return rc;
1820 }
1821
1822 int
1823 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1824                         int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1825 {
1826         struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1827         int rc = 0;
1828
1829         switch (ia->ri_memreg_strategy) {
1830
1831 #if RPCRDMA_PERSISTENT_REGISTRATION
1832         case RPCRDMA_ALLPHYSICAL:
1833                 rpcrdma_map_one(ia, seg, writing);
1834                 seg->mr_rkey = ia->ri_bind_mem->rkey;
1835                 seg->mr_base = seg->mr_dma;
1836                 seg->mr_nsegs = 1;
1837                 nsegs = 1;
1838                 break;
1839 #endif
1840
1841         /* Registration using FRMR (fast registration memory regions) */
1842         case RPCRDMA_FRMR:
1843                 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1844                 break;
1845
1846         /* Registration using fmr memory registration */
1847         case RPCRDMA_MTHCAFMR:
1848                 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1849                 break;
1850
1851         /* Registration using memory windows */
1852         case RPCRDMA_MEMWINDOWS_ASYNC:
1853         case RPCRDMA_MEMWINDOWS:
1854                 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
1855                 break;
1856
1857         /* Default registration each time */
1858         default:
1859                 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
1860                 break;
1861         }
1862         if (rc)
1863                 return -1;
1864
1865         return nsegs;
1866 }
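
/*
 * Editor's sketch (illustrative; the real callers live in the chunk
 * building and reply handling code in rpc_rdma.c): a chunk is
 * registered before the RPC goes out and deregistered once the reply
 * has been processed:
 *
 *	n = rpcrdma_register_external(seg, nsegs, writing, r_xprt);
 *	if (n < 0)
 *		(fall back to inline or fail the RPC)
 *	(advertise seg->mr_rkey, mr_base and mr_len in the chunk list)
 *	rpcrdma_deregister_external(seg, r_xprt, NULL);
 *
 * Passing a non-NULL "r" lets the memory-window path signal the unbind
 * and defer rep->rr_func to its completion handler; for every other
 * strategy the callback runs synchronously right here.
 */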
1867
1868 int
1869 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1870                 struct rpcrdma_xprt *r_xprt, void *r)
1871 {
1872         struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1873         int nsegs = seg->mr_nsegs, rc;
1874
1875         switch (ia->ri_memreg_strategy) {
1876
1877 #if RPCRDMA_PERSISTENT_REGISTRATION
1878         case RPCRDMA_ALLPHYSICAL:
1879                 BUG_ON(nsegs != 1);
1880                 rpcrdma_unmap_one(ia, seg);
1881                 rc = 0;
1882                 break;
1883 #endif
1884
1885         case RPCRDMA_FRMR:
1886                 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1887                 break;
1888
1889         case RPCRDMA_MTHCAFMR:
1890                 rc = rpcrdma_deregister_fmr_external(seg, ia);
1891                 break;
1892
1893         case RPCRDMA_MEMWINDOWS_ASYNC:
1894         case RPCRDMA_MEMWINDOWS:
1895                 rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
1896                 break;
1897
1898         default:
1899                 rc = rpcrdma_deregister_default_external(seg, ia);
1900                 break;
1901         }
1902         if (r) {
1903                 struct rpcrdma_rep *rep = r;
1904                 void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1905                 rep->rr_func = NULL;
1906                 func(rep);      /* dereg done, callback now */
1907         }
1908         return nsegs;
1909 }
1910
1911 /*
1912  * Prepost any receive buffer, then post send.
1913  *
1914  * Receive buffer is donated to hardware, reclaimed upon recv completion.
1915  */
1916 int
1917 rpcrdma_ep_post(struct rpcrdma_ia *ia,
1918                 struct rpcrdma_ep *ep,
1919                 struct rpcrdma_req *req)
1920 {
1921         struct ib_send_wr send_wr, *send_wr_fail;
1922         struct rpcrdma_rep *rep = req->rl_reply;
1923         int rc;
1924
1925         if (rep) {
1926                 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1927                 if (rc)
1928                         goto out;
1929                 req->rl_reply = NULL;
1930         }
1931
1932         send_wr.next = NULL;
1933         send_wr.wr_id = 0ULL;   /* no send cookie */
1934         send_wr.sg_list = req->rl_send_iov;
1935         send_wr.num_sge = req->rl_niovs;
1936         send_wr.opcode = IB_WR_SEND;
1937         if (send_wr.num_sge == 4)       /* no need to sync any pad (constant) */
1938                 ib_dma_sync_single_for_device(ia->ri_id->device,
1939                         req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1940                         DMA_TO_DEVICE);
1941         ib_dma_sync_single_for_device(ia->ri_id->device,
1942                 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1943                 DMA_TO_DEVICE);
1944         ib_dma_sync_single_for_device(ia->ri_id->device,
1945                 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1946                 DMA_TO_DEVICE);
1947
1948         if (DECR_CQCOUNT(ep) > 0)
1949                 send_wr.send_flags = 0;
1950         else { /* Provider must take a send completion every now and then */
1951                 INIT_CQCOUNT(ep);
1952                 send_wr.send_flags = IB_SEND_SIGNALED;
1953         }
1954
1955         rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1956         if (rc)
1957                 dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,
1958                         rc);
1959 out:
1960         return rc;
1961 }
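
/*
 * Editor's note on the send_flags logic above: sends normally go out
 * unsignaled.  DECR_CQCOUNT() spends one credit per post; when the
 * credit runs out the send is posted IB_SEND_SIGNALED and the counter
 * is re-armed with INIT_CQCOUNT(), so the provider still sees a send
 * completion every so often and can retire its queued work requests.
 */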
1962
1963 /*
1964  * (Re)post a receive buffer.
1965  */
1966 int
1967 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1968                      struct rpcrdma_ep *ep,
1969                      struct rpcrdma_rep *rep)
1970 {
1971         struct ib_recv_wr recv_wr, *recv_wr_fail;
1972         int rc;
1973
1974         recv_wr.next = NULL;
1975         recv_wr.wr_id = (u64) (unsigned long) rep;
1976         recv_wr.sg_list = &rep->rr_iov;
1977         recv_wr.num_sge = 1;
1978
1979         ib_dma_sync_single_for_cpu(ia->ri_id->device,
1980                 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1981
1982         DECR_CQCOUNT(ep);
1983         rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1984
1985         if (rc)
1986                 dprintk("RPC:       %s: ib_post_recv returned %i\n", __func__,
1987                         rc);
1988         return rc;
1989 }
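
/*
 * Editor's note: rpcrdma_ep_post() preposts the receive paired with a
 * request before posting the send, so a reply can never arrive without
 * a buffer to land in; rpcrdma_ep_post_recv() is the shared helper for
 * that path and for any other spot that must repost a receive on its
 * own.
 */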