From: Sagi Grimberg Date: Wed, 1 Oct 2014 11:02:01 +0000 (+0300) Subject: IB/iser: Fix DEVICE REMOVAL handling in the absence of iscsi daemon X-Git-Tag: firefly_0821_release~176^2~2998^2^3~23 X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=c47a3c9ed5be167f49a6fd3f696dac03536282eb;p=firefly-linux-kernel-4.4.55.git IB/iser: Fix DEVICE REMOVAL handling in the absence of iscsi daemon iscsi daemon is in user-space, thus we can't rely on it to be invoked at connection teardown (if not running or does not receive CPU time). This patch addresses the issue by re-structuring iSER connection teardown logic and CM events handling. The CM events will dictate the RDMA resources destruction (ib_conn) and iser_conn is kept around as long as iscsi_conn is left around allowing iscsi/iser callbacks to continue after RDMA transport was destroyed. This patch introduces a separation in logic when handling CM events: - DISCONNECTED_HANDLER, ADDR_CHANGED This events indicate the start of teardown process. Actions: 1. Terminate the connection: rdma_disconnect (send DREQ/DREP) 2. Notify iSCSI of connection failure 3. Change state to TERMINATING 4. Poll for all flush errors to be consumed - TIMEWAIT_EXIT, DEVICE_REMOVAL These events indicate the final stage of termination process and we can free RDMA related resources. Actions: 1. Call disconnected handler (we are not guaranteed that DISCONNECTED event was invoked in the past) 2. Cleanup RDMA related resources 3. For DEVICE_REMOVAL return non-zero rc from cma_handler to implicitly destroy the cm_id (Can't rely on user-space, make sure we have forward progress) We replace flush_completion (indicate all flushes were consumed) with ib_completion (rdma resources were cleaned up). The iser_conn_release_work will wait for teardown completions: - conn_stop was completed (tasks were cleaned-up) - stop_completion - RDMA resources were destroyed - ib_completion And then will continue to free iser connection representation (iser_conn). Signed-off-by: Sagi Grimberg Signed-off-by: Ariel Nahum Signed-off-by: Roi Dayan Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index ec238b3bd278..95c484d0f881 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -370,9 +370,9 @@ struct iser_conn { unsigned min_posted_rx; /* qp_max_recv_dtos >> 2 */ char name[ISER_OBJECT_NAME_SIZE]; struct work_struct release_work; - struct completion stop_completion; struct mutex state_mutex; - struct completion flush_completion; + struct completion stop_completion; + struct completion ib_completion; struct completion up_completion; struct list_head conn_list; /* entry in ig conn list */ @@ -442,7 +442,7 @@ void iser_conn_init(struct iser_conn *iser_conn); void iser_conn_release(struct iser_conn *iser_conn); -void iser_conn_terminate(struct iser_conn *iser_conn); +int iser_conn_terminate(struct iser_conn *iser_conn); void iser_release_work(struct work_struct *work); diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index e4299743c459..6170d06a8acc 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -44,6 +44,7 @@ static void iser_cq_tasklet_fn(unsigned long data); static void iser_cq_callback(struct ib_cq *cq, void *cq_context); +static int iser_drain_tx_cq(struct iser_device *device, int cq_index); static void iser_cq_event_callback(struct ib_event *cause, void *context) { @@ -573,11 +574,10 @@ void iser_release_work(struct work_struct *work) rc = wait_for_completion_timeout(&iser_conn->stop_completion, 30 * HZ); WARN_ON(rc == 0); - /* wait for the qp`s post send and post receive buffers to empty */ - rc = wait_for_completion_timeout(&iser_conn->flush_completion, 30 * HZ); - WARN_ON(rc == 0); - - iser_conn->state = ISER_CONN_DOWN; + rc = wait_for_completion_timeout(&iser_conn->ib_completion, 30 * HZ); + if (rc == 0) + iser_warn("conn %p, IB cleanup didn't complete in 30 " + "seconds, continue with release\n", iser_conn); mutex_lock(&iser_conn->state_mutex); iser_conn->state = ISER_CONN_DOWN; @@ -589,12 +589,16 @@ void iser_release_work(struct work_struct *work) /** * iser_free_ib_conn_res - release IB related resources * @iser_conn: iser connection struct + * @destroy_device: indicator if we need to try to release + * the iser device (only iscsi shutdown and DEVICE_REMOVAL + * will use this. * * This routine is called with the iser state mutex held * so the cm_id removal is out of here. It is Safe to * be invoked multiple times. */ -static void iser_free_ib_conn_res(struct iser_conn *iser_conn) +static void iser_free_ib_conn_res(struct iser_conn *iser_conn, + bool destroy_device) { struct ib_conn *ib_conn = &iser_conn->ib_conn; struct iser_device *device = ib_conn->device; @@ -610,7 +614,7 @@ static void iser_free_ib_conn_res(struct iser_conn *iser_conn) ib_conn->qp = NULL; } - if (device != NULL) { + if (destroy_device && device != NULL) { iser_device_try_release(device); ib_conn->device = NULL; } @@ -629,7 +633,11 @@ void iser_conn_release(struct iser_conn *iser_conn) mutex_lock(&iser_conn->state_mutex); BUG_ON(iser_conn->state != ISER_CONN_DOWN); - iser_free_ib_conn_res(iser_conn); + /* + * In case we never got to bind stage, we still need to + * release IB resources (which is safe to call more than once). + */ + iser_free_ib_conn_res(iser_conn, true); mutex_unlock(&iser_conn->state_mutex); if (ib_conn->cma_id != NULL) { @@ -640,24 +648,69 @@ void iser_conn_release(struct iser_conn *iser_conn) kfree(iser_conn); } +/** + * iser_poll_for_flush_errors - Don't settle for less than all. + * @struct ib_conn: IB context of the connection + * + * This routine is called when the QP is in error state + * It polls the send CQ until all flush errors are consumed and + * returns when all flush errors were processed. + */ +static void iser_poll_for_flush_errors(struct ib_conn *ib_conn) +{ + struct iser_device *device = ib_conn->device; + int count = 0; + + while (ib_conn->post_recv_buf_count > 0 || + atomic_read(&ib_conn->post_send_buf_count) > 0) { + msleep(100); + if (atomic_read(&ib_conn->post_send_buf_count) > 0) + iser_drain_tx_cq(device, ib_conn->cq_index); + + count++; + /* Don't flood with prints */ + if (count % 30 == 0) + iser_dbg("post_recv %d post_send %d", + ib_conn->post_recv_buf_count, + atomic_read(&ib_conn->post_send_buf_count)); + } +} + /** * triggers start of the disconnect procedures and wait for them to be done + * Called with state mutex held */ -void iser_conn_terminate(struct iser_conn *iser_conn) +int iser_conn_terminate(struct iser_conn *iser_conn) { struct ib_conn *ib_conn = &iser_conn->ib_conn; int err = 0; - /* change the ib conn state only if the conn is UP, however always call - * rdma_disconnect since this is the only way to cause the CMA to change - * the QP state to ERROR + /* terminate the iser conn only if the conn state is UP */ + if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP, + ISER_CONN_TERMINATING)) + return 0; + + iser_info("iser_conn %p state %d\n", iser_conn, iser_conn->state); + + /* suspend queuing of new iscsi commands */ + if (iser_conn->iscsi_conn) + iscsi_suspend_queue(iser_conn->iscsi_conn); + + /* + * In case we didn't already clean up the cma_id (peer initiated + * a disconnection), we need to Cause the CMA to change the QP + * state to ERROR. */ + if (ib_conn->cma_id) { + err = rdma_disconnect(ib_conn->cma_id); + if (err) + iser_err("Failed to disconnect, conn: 0x%p err %d\n", + iser_conn, err); + + iser_poll_for_flush_errors(ib_conn); + } - iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP, ISER_CONN_TERMINATING); - err = rdma_disconnect(ib_conn->cma_id); - if (err) - iser_err("Failed to disconnect, conn: 0x%p err %d\n", - iser_conn, err); + return 1; } /** @@ -780,34 +833,36 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id) static void iser_disconnected_handler(struct rdma_cm_id *cma_id) { - struct iser_conn *iser_conn; - struct ib_conn *ib_conn = &iser_conn->ib_conn; - - iser_conn = (struct iser_conn *)cma_id->context; + struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context; - /* getting here when the state is UP means that the conn is being * - * terminated asynchronously from the iSCSI layer's perspective. */ - if (iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP, - ISER_CONN_TERMINATING)){ + if (iser_conn_terminate(iser_conn)) { if (iser_conn->iscsi_conn) - iscsi_conn_failure(iser_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED); + iscsi_conn_failure(iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); else iser_err("iscsi_iser connection isn't bound\n"); } +} + +static void iser_cleanup_handler(struct rdma_cm_id *cma_id, + bool destroy_device) +{ + struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context; - /* Complete the termination process if no posts are pending. This code - * block also exists in iser_handle_comp_error(), but it is needed here - * for cases of no flushes at all, e.g. discovery over rdma. + /* + * We are not guaranteed that we visited disconnected_handler + * by now, call it here to be safe that we handle CM drep + * and flush errors. */ - if (ib_conn->post_recv_buf_count == 0 && - (atomic_read(&ib_conn->post_send_buf_count) == 0)) { - complete(&iser_conn->flush_completion); - } -} + iser_disconnected_handler(cma_id); + iser_free_ib_conn_res(iser_conn, destroy_device); + complete(&iser_conn->ib_completion); +}; static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { struct iser_conn *iser_conn; + int ret = 0; iser_conn = (struct iser_conn *)cma_id->context; iser_info("event %d status %d conn %p id %p\n", @@ -832,17 +887,29 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve iser_connect_error(cma_id); break; case RDMA_CM_EVENT_DISCONNECTED: - case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_ADDR_CHANGE: - case RDMA_CM_EVENT_TIMEWAIT_EXIT: iser_disconnected_handler(cma_id); break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + /* + * we *must* destroy the device as we cannot rely + * on iscsid to be around to initiate error handling. + * also implicitly destroy the cma_id. + */ + iser_cleanup_handler(cma_id, true); + iser_conn->ib_conn.cma_id = NULL; + ret = 1; + break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + iser_cleanup_handler(cma_id, false); + break; default: iser_err("Unexpected RDMA CM event (%d)\n", event->event); break; } mutex_unlock(&iser_conn->state_mutex); - return 0; + + return ret; } void iser_conn_init(struct iser_conn *iser_conn) @@ -851,7 +918,7 @@ void iser_conn_init(struct iser_conn *iser_conn) iser_conn->ib_conn.post_recv_buf_count = 0; atomic_set(&iser_conn->ib_conn.post_send_buf_count, 0); init_completion(&iser_conn->stop_completion); - init_completion(&iser_conn->flush_completion); + init_completion(&iser_conn->ib_completion); init_completion(&iser_conn->up_completion); INIT_LIST_HEAD(&iser_conn->conn_list); spin_lock_init(&iser_conn->ib_conn.lock); @@ -1100,28 +1167,8 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc) static void iser_handle_comp_error(struct iser_tx_desc *desc, struct ib_conn *ib_conn) { - struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, - ib_conn); - if (desc && desc->type == ISCSI_TX_DATAOUT) kmem_cache_free(ig.desc_cache, desc); - - if (ib_conn->post_recv_buf_count == 0 && - atomic_read(&ib_conn->post_send_buf_count) == 0) { - /** - * getting here when the state is UP means that the conn is - * being terminated asynchronously from the iSCSI layer's - * perspective. It is safe to peek at the connection state - * since iscsi_conn_failure is allowed to be called twice. - **/ - if (iser_conn->state == ISER_CONN_UP) - iscsi_conn_failure(iser_conn->iscsi_conn, - ISCSI_ERR_CONN_FAILED); - - /* no more non completed posts to the QP, complete the - * termination process w.o worrying on disconnect event */ - complete(&iser_conn->flush_completion); - } } static int iser_drain_tx_cq(struct iser_device *device, int cq_index)