X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=net%2Fceph%2Fmessenger.c;h=b872db5c498941020c05c6c77a9729b70cace120;hb=4a8616920860920abaa51193146fe36b38ef09aa;hp=23073cff648111c2a279488d77940c8d48b4b782;hpb=26ce171915f348abd1f41da1ed139d93750d987f;p=firefly-linux-kernel-4.4.55.git diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 23073cff6481..b872db5c4989 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -29,7 +29,47 @@ * the sender. */ -/* State values for ceph_connection->sock_state; NEW is assumed to be 0 */ +/* + * We track the state of the socket on a given connection using + * values defined below. The transition to a new socket state is + * handled by a function which verifies we aren't coming from an + * unexpected state. + * + * -------- + * | NEW* | transient initial state + * -------- + * | con_sock_state_init() + * v + * ---------- + * | CLOSED | initialized, but no socket (and no + * ---------- TCP connection) + * ^ \ + * | \ con_sock_state_connecting() + * | ---------------------- + * | \ + * + con_sock_state_closed() \ + * |+--------------------------- \ + * | \ \ \ + * | ----------- \ \ + * | | CLOSING | socket event; \ \ + * | ----------- await close \ \ + * | ^ \ | + * | | \ | + * | + con_sock_state_closing() \ | + * | / \ | | + * | / --------------- | | + * | / \ v v + * | / -------------- + * | / -----------------| CONNECTING | socket created, TCP + * | | / -------------- connect initiated + * | | | con_sock_state_connected() + * | | v + * ------------- + * | CONNECTED | TCP connection established + * ------------- + * + * State values for ceph_connection->sock_state; NEW is assumed to be 0. + */ #define CON_SOCK_STATE_NEW 0 /* -> CLOSED */ #define CON_SOCK_STATE_CLOSED 1 /* -> CONNECTING */ @@ -37,6 +77,26 @@ #define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ #define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ +/* + * connection states + */ +#define CON_STATE_CLOSED 1 /* -> PREOPEN */ +#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */ +#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */ +#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */ +#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */ +#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */ + +/* + * ceph_connection flag bits + */ +#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop + * messages on errors */ +#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ +#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */ +#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ +#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ + /* static tag bytes (protocol control messages) */ static char tag_msg = CEPH_MSGR_TAG_MSG; static char tag_ack = CEPH_MSGR_TAG_ACK; @@ -201,7 +261,8 @@ static void con_sock_state_closed(struct ceph_connection *con) old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED && - old_state != CON_SOCK_STATE_CLOSING)) + old_state != CON_SOCK_STATE_CLOSING && + old_state != CON_SOCK_STATE_CONNECTING)) printk("%s: unexpected old state %d\n", __func__, old_state); } @@ -213,6 +274,9 @@ static void con_sock_state_closed(struct ceph_connection *con) static void ceph_sock_data_ready(struct sock *sk, int count_unused) { struct ceph_connection *con = sk->sk_user_data; + if (atomic_read(&con->msgr->stopping)) { + return; + } if (sk->sk_state != TCP_CLOSE_WAIT) { dout("%s on %p state = %lu, queueing work\n", __func__, @@ -233,7 +297,7 @@ static void ceph_sock_write_space(struct sock *sk) * buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). */ - if (test_bit(WRITE_PENDING, &con->flags)) { + if (test_bit(CON_FLAG_WRITE_PENDING, &con->flags)) { if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); @@ -252,22 +316,14 @@ static void ceph_sock_state_change(struct sock *sk) dout("%s %p state = %lu sk_state = %u\n", __func__, con, con->state, sk->sk_state); - if (test_bit(CLOSED, &con->state)) - return; - switch (sk->sk_state) { case TCP_CLOSE: dout("%s TCP_CLOSE\n", __func__); case TCP_CLOSE_WAIT: dout("%s TCP_CLOSE_WAIT\n", __func__); con_sock_state_closing(con); - if (test_and_set_bit(SOCK_CLOSED, &con->flags) == 0) { - if (test_bit(CONNECTING, &con->state)) - con->error_msg = "connection failed"; - else - con->error_msg = "socket closed"; - queue_con(con); - } + set_bit(CON_FLAG_SOCK_CLOSED, &con->flags); + queue_con(con); break; case TCP_ESTABLISHED: dout("%s TCP_ESTABLISHED\n", __func__); @@ -397,11 +453,17 @@ static int con_close_socket(struct ceph_connection *con) dout("con_close_socket on %p sock %p\n", con, con->sock); if (!con->sock) return 0; - set_bit(SOCK_CLOSED, &con->state); rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); sock_release(con->sock); con->sock = NULL; - clear_bit(SOCK_CLOSED, &con->state); + + /* + * Forcibly clear the SOCK_CLOSED flag. It gets set + * independent of the connection mutex, and we could have + * received a socket close event before we had the chance to + * shut the socket down. + */ + clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags); con_sock_state_closed(con); return rc; } @@ -414,7 +476,7 @@ static void ceph_msg_remove(struct ceph_msg *msg) { list_del_init(&msg->list_head); BUG_ON(msg->con == NULL); - ceph_con_put(msg->con); + msg->con->ops->put(msg->con); msg->con = NULL; ceph_msg_put(msg); @@ -440,7 +502,7 @@ static void reset_connection(struct ceph_connection *con) con->in_msg->con = NULL; ceph_msg_put(con->in_msg); con->in_msg = NULL; - ceph_con_put(con); + con->ops->put(con); } con->connect_seq = 0; @@ -458,36 +520,42 @@ static void reset_connection(struct ceph_connection *con) */ void ceph_con_close(struct ceph_connection *con) { + mutex_lock(&con->mutex); dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr.in_addr)); - clear_bit(NEGOTIATING, &con->state); - clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ - set_bit(CLOSED, &con->state); + con->state = CON_STATE_CLOSED; - clear_bit(LOSSYTX, &con->flags); /* so we retry next connect */ - clear_bit(KEEPALIVE_PENDING, &con->flags); - clear_bit(WRITE_PENDING, &con->flags); + clear_bit(CON_FLAG_LOSSYTX, &con->flags); /* so we retry next connect */ + clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); + clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); - mutex_lock(&con->mutex); reset_connection(con); con->peer_global_seq = 0; cancel_delayed_work(&con->work); + con_close_socket(con); mutex_unlock(&con->mutex); - queue_con(con); } EXPORT_SYMBOL(ceph_con_close); /* * Reopen a closed connection, with a new peer address. */ -void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) +void ceph_con_open(struct ceph_connection *con, + __u8 entity_type, __u64 entity_num, + struct ceph_entity_addr *addr) { + mutex_lock(&con->mutex); dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); - set_bit(OPENING, &con->state); - WARN_ON(!test_and_clear_bit(CLOSED, &con->state)); + + BUG_ON(con->state != CON_STATE_CLOSED); + con->state = CON_STATE_PREOPEN; + + con->peer_name.type = (__u8) entity_type; + con->peer_name.num = cpu_to_le64(entity_num); memcpy(&con->peer_addr, addr, sizeof(*addr)); con->delay = 0; /* reset backoff memory */ + mutex_unlock(&con->mutex); queue_con(con); } EXPORT_SYMBOL(ceph_con_open); @@ -500,55 +568,27 @@ bool ceph_con_opened(struct ceph_connection *con) return con->connect_seq > 0; } -/* - * generic get/put - */ -struct ceph_connection *ceph_con_get(struct ceph_connection *con) -{ - int nref = __atomic_add_unless(&con->nref, 1, 0); - - dout("con_get %p nref = %d -> %d\n", con, nref, nref + 1); - - return nref ? con : NULL; -} - -void ceph_con_put(struct ceph_connection *con) -{ - int nref = atomic_dec_return(&con->nref); - - BUG_ON(nref < 0); - if (nref == 0) { - BUG_ON(con->sock); - kfree(con); - } - dout("con_put %p nref = %d -> %d\n", con, nref + 1, nref); -} - /* * initialize a new connection. */ void ceph_con_init(struct ceph_connection *con, void *private, const struct ceph_connection_operations *ops, - struct ceph_messenger *msgr, __u8 entity_type, __u64 entity_num) + struct ceph_messenger *msgr) { dout("con_init %p\n", con); memset(con, 0, sizeof(*con)); con->private = private; con->ops = ops; - atomic_set(&con->nref, 1); con->msgr = msgr; con_sock_state_init(con); - con->peer_name.type = (__u8) entity_type; - con->peer_name.num = cpu_to_le64(entity_num); - mutex_init(&con->mutex); INIT_LIST_HEAD(&con->out_queue); INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, con_work); - set_bit(CLOSED, &con->state); + con->state = CON_STATE_CLOSED; } EXPORT_SYMBOL(ceph_con_init); @@ -590,6 +630,53 @@ static void con_out_kvec_add(struct ceph_connection *con, con->out_kvec_bytes += size; } +#ifdef CONFIG_BLOCK +static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) +{ + if (!bio) { + *iter = NULL; + *seg = 0; + return; + } + *iter = bio; + *seg = bio->bi_idx; +} + +static void iter_bio_next(struct bio **bio_iter, int *seg) +{ + if (*bio_iter == NULL) + return; + + BUG_ON(*seg >= (*bio_iter)->bi_vcnt); + + (*seg)++; + if (*seg == (*bio_iter)->bi_vcnt) + init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); +} +#endif + +static void prepare_write_message_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + + BUG_ON(!msg); + BUG_ON(!msg->hdr.data_len); + + /* initialize page iterator */ + con->out_msg_pos.page = 0; + if (msg->pages) + con->out_msg_pos.page_pos = msg->page_alignment; + else + con->out_msg_pos.page_pos = 0; +#ifdef CONFIG_BLOCK + if (msg->bio) + init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); +#endif + con->out_msg_pos.data_pos = 0; + con->out_msg_pos.did_page_crc = false; + con->out_more = 1; /* data + footer will follow */ +} + /* * Prepare footer for currently outgoing message, and finish things * off. Assumes out_kvec* are already valid.. we just add on to the end. @@ -599,6 +686,8 @@ static void prepare_write_message_footer(struct ceph_connection *con) struct ceph_msg *m = con->out_msg; int v = con->out_kvec_left; + m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; + dout("prepare_write_message_footer %p\n", con); con->out_kvec_is_msg = true; con->out_kvec[v].iov_base = &m->footer; @@ -648,10 +737,6 @@ static void prepare_write_message(struct ceph_connection *con) m->hdr.seq = cpu_to_le64(++con->out_seq); m->needs_out_seq = false; } -#ifdef CONFIG_BLOCK - else - m->bio_iter = NULL; -#endif dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", m, con->out_seq, le16_to_cpu(m->hdr.type), @@ -672,7 +757,7 @@ static void prepare_write_message(struct ceph_connection *con) /* fill in crc (except data pages), footer */ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); con->out_msg->hdr.crc = cpu_to_le32(crc); - con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; + con->out_msg->footer.flags = 0; crc = crc32c(0, m->front.iov_base, m->front.iov_len); con->out_msg->footer.front_crc = cpu_to_le32(crc); @@ -682,28 +767,19 @@ static void prepare_write_message(struct ceph_connection *con) con->out_msg->footer.middle_crc = cpu_to_le32(crc); } else con->out_msg->footer.middle_crc = 0; - con->out_msg->footer.data_crc = 0; - dout("prepare_write_message front_crc %u data_crc %u\n", + dout("%s front_crc %u middle_crc %u\n", __func__, le32_to_cpu(con->out_msg->footer.front_crc), le32_to_cpu(con->out_msg->footer.middle_crc)); /* is there a data payload? */ - if (le32_to_cpu(m->hdr.data_len) > 0) { - /* initialize page iterator */ - con->out_msg_pos.page = 0; - if (m->pages) - con->out_msg_pos.page_pos = m->page_alignment; - else - con->out_msg_pos.page_pos = 0; - con->out_msg_pos.data_pos = 0; - con->out_msg_pos.did_page_crc = false; - con->out_more = 1; /* data + footer will follow */ - } else { + con->out_msg->footer.data_crc = 0; + if (m->hdr.data_len) + prepare_write_message_data(con); + else /* no, queue up footer too and be done */ prepare_write_message_footer(con); - } - set_bit(WRITE_PENDING, &con->flags); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); } /* @@ -724,7 +800,7 @@ static void prepare_write_ack(struct ceph_connection *con) &con->out_temp_ack); con->out_more = 1; /* more will follow.. eventually.. */ - set_bit(WRITE_PENDING, &con->flags); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); } /* @@ -735,7 +811,7 @@ static void prepare_write_keepalive(struct ceph_connection *con) dout("prepare_write_keepalive %p\n", con); con_out_kvec_reset(con); con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); - set_bit(WRITE_PENDING, &con->flags); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); } /* @@ -750,27 +826,21 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection if (!con->ops->get_authorizer) { con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; con->out_connect.authorizer_len = 0; - return NULL; } /* Can't hold the mutex while getting authorizer */ - mutex_unlock(&con->mutex); - auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry); - mutex_lock(&con->mutex); if (IS_ERR(auth)) return auth; - if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->flags)) + if (con->state != CON_STATE_NEGOTIATING) return ERR_PTR(-EAGAIN); con->auth_reply_buf = auth->authorizer_reply_buf; con->auth_reply_buf_len = auth->authorizer_reply_buf_len; - - return auth; } @@ -784,7 +854,7 @@ static void prepare_write_banner(struct ceph_connection *con) &con->msgr->my_enc_addr); con->out_more = 0; - set_bit(WRITE_PENDING, &con->flags); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); } static int prepare_write_connect(struct ceph_connection *con) @@ -827,6 +897,7 @@ static int prepare_write_connect(struct ceph_connection *con) con->out_connect.authorizer_len = auth ? cpu_to_le32(auth->authorizer_buf_len) : 0; + con_out_kvec_reset(con); con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect); if (auth && auth->authorizer_buf_len) @@ -834,7 +905,7 @@ static int prepare_write_connect(struct ceph_connection *con) auth->authorizer_buf); con->out_more = 0; - set_bit(WRITE_PENDING, &con->flags); + set_bit(CON_FLAG_WRITE_PENDING, &con->flags); return 0; } @@ -882,30 +953,34 @@ out: return ret; /* done! */ } -#ifdef CONFIG_BLOCK -static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) +static void out_msg_pos_next(struct ceph_connection *con, struct page *page, + size_t len, size_t sent, bool in_trail) { - if (!bio) { - *iter = NULL; - *seg = 0; - return; - } - *iter = bio; - *seg = bio->bi_idx; -} + struct ceph_msg *msg = con->out_msg; -static void iter_bio_next(struct bio **bio_iter, int *seg) -{ - if (*bio_iter == NULL) - return; + BUG_ON(!msg); + BUG_ON(!sent); - BUG_ON(*seg >= (*bio_iter)->bi_vcnt); + con->out_msg_pos.data_pos += sent; + con->out_msg_pos.page_pos += sent; + if (sent < len) + return; - (*seg)++; - if (*seg == (*bio_iter)->bi_vcnt) - init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); -} + BUG_ON(sent != len); + con->out_msg_pos.page_pos = 0; + con->out_msg_pos.page++; + con->out_msg_pos.did_page_crc = false; + if (in_trail) + list_move_tail(&page->lru, + &msg->trail->head); + else if (msg->pagelist) + list_move_tail(&page->lru, + &msg->pagelist->head); +#ifdef CONFIG_BLOCK + else if (msg->bio) + iter_bio_next(&msg->bio_iter, &msg->bio_seg); #endif +} /* * Write as much message data payload as we can. If we finish, queue @@ -922,41 +997,36 @@ static int write_partial_msg_pages(struct ceph_connection *con) bool do_datacrc = !con->msgr->nocrc; int ret; int total_max_write; - int in_trail = 0; - size_t trail_len = (msg->trail ? msg->trail->length : 0); + bool in_trail = false; + const size_t trail_len = (msg->trail ? msg->trail->length : 0); + const size_t trail_off = data_len - trail_len; dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", - con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, + con, msg, con->out_msg_pos.page, msg->nr_pages, con->out_msg_pos.page_pos); -#ifdef CONFIG_BLOCK - if (msg->bio && !msg->bio_iter) - init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); -#endif - + /* + * Iterate through each page that contains data to be + * written, and send as much as possible for each. + * + * If we are calculating the data crc (the default), we will + * need to map the page. If we have no pages, they have + * been revoked, so use the zero page. + */ while (data_len > con->out_msg_pos.data_pos) { struct page *page = NULL; int max_write = PAGE_SIZE; int bio_offset = 0; - total_max_write = data_len - trail_len - - con->out_msg_pos.data_pos; - - /* - * if we are calculating the data crc (the default), we need - * to map the page. if our pages[] has been revoked, use the - * zero page. - */ - - /* have we reached the trail part of the data? */ - if (con->out_msg_pos.data_pos >= data_len - trail_len) { - in_trail = 1; + in_trail = in_trail || con->out_msg_pos.data_pos >= trail_off; + if (!in_trail) + total_max_write = trail_off - con->out_msg_pos.data_pos; + if (in_trail) { total_max_write = data_len - con->out_msg_pos.data_pos; page = list_first_entry(&msg->trail->head, struct page, lru); - max_write = PAGE_SIZE; } else if (msg->pages) { page = msg->pages[con->out_msg_pos.page]; } else if (msg->pagelist) { @@ -979,15 +1049,14 @@ static int write_partial_msg_pages(struct ceph_connection *con) if (do_datacrc && !con->out_msg_pos.did_page_crc) { void *base; - u32 crc; - u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); + u32 crc = le32_to_cpu(msg->footer.data_crc); char *kaddr; kaddr = kmap(page); BUG_ON(kaddr == NULL); base = kaddr + con->out_msg_pos.page_pos + bio_offset; - crc = crc32c(tmpcrc, base, len); - con->out_msg->footer.data_crc = cpu_to_le32(crc); + crc = crc32c(crc, base, len); + msg->footer.data_crc = cpu_to_le32(crc); con->out_msg_pos.did_page_crc = true; } ret = ceph_tcp_sendpage(con->sock, page, @@ -1000,30 +1069,14 @@ static int write_partial_msg_pages(struct ceph_connection *con) if (ret <= 0) goto out; - con->out_msg_pos.data_pos += ret; - con->out_msg_pos.page_pos += ret; - if (ret == len) { - con->out_msg_pos.page_pos = 0; - con->out_msg_pos.page++; - con->out_msg_pos.did_page_crc = false; - if (in_trail) - list_move_tail(&page->lru, - &msg->trail->head); - else if (msg->pagelist) - list_move_tail(&page->lru, - &msg->pagelist->head); -#ifdef CONFIG_BLOCK - else if (msg->bio) - iter_bio_next(&msg->bio_iter, &msg->bio_seg); -#endif - } + out_msg_pos_next(con, page, len, (size_t) ret, in_trail); } dout("write_partial_msg_pages %p msg %p done\n", con, msg); /* prepare and queue up footer, too */ if (!do_datacrc) - con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; + msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; con_out_kvec_reset(con); prepare_write_message_footer(con); ret = 1; @@ -1436,15 +1489,14 @@ static int process_banner(struct ceph_connection *con) ceph_pr_addr(&con->msgr->inst.addr.in_addr)); } - set_bit(NEGOTIATING, &con->state); - prepare_read_connect(con); return 0; } static void fail_protocol(struct ceph_connection *con) { reset_connection(con); - set_bit(CLOSED, &con->state); /* in case there's queued work */ + BUG_ON(con->state != CON_STATE_NEGOTIATING); + con->state = CON_STATE_CLOSED; } static int process_connect(struct ceph_connection *con) @@ -1487,7 +1539,6 @@ static int process_connect(struct ceph_connection *con) return -1; } con->auth_retry = 1; - con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1503,12 +1554,11 @@ static int process_connect(struct ceph_connection *con) * dropped messages. */ dout("process_connect got RESET peer seq %u\n", - le32_to_cpu(con->in_connect.connect_seq)); + le32_to_cpu(con->in_reply.connect_seq)); pr_err("%s%lld %s connection reset\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr)); reset_connection(con); - con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1520,8 +1570,7 @@ static int process_connect(struct ceph_connection *con) if (con->ops->peer_reset) con->ops->peer_reset(con); mutex_lock(&con->mutex); - if (test_bit(CLOSED, &con->state) || - test_bit(OPENING, &con->state)) + if (con->state != CON_STATE_NEGOTIATING) return -EAGAIN; break; @@ -1530,11 +1579,10 @@ static int process_connect(struct ceph_connection *con) * If we sent a smaller connect_seq than the peer has, try * again with a larger value. */ - dout("process_connect got RETRY my seq = %u, peer_seq = %u\n", + dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", le32_to_cpu(con->out_connect.connect_seq), - le32_to_cpu(con->in_connect.connect_seq)); - con->connect_seq = le32_to_cpu(con->in_connect.connect_seq); - con_out_kvec_reset(con); + le32_to_cpu(con->in_reply.connect_seq)); + con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1548,10 +1596,9 @@ static int process_connect(struct ceph_connection *con) */ dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", con->peer_global_seq, - le32_to_cpu(con->in_connect.global_seq)); + le32_to_cpu(con->in_reply.global_seq)); get_global_seq(con->msgr, - le32_to_cpu(con->in_connect.global_seq)); - con_out_kvec_reset(con); + le32_to_cpu(con->in_reply.global_seq)); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1569,7 +1616,10 @@ static int process_connect(struct ceph_connection *con) fail_protocol(con); return -1; } - clear_bit(CONNECTING, &con->state); + + BUG_ON(con->state != CON_STATE_NEGOTIATING); + con->state = CON_STATE_OPEN; + con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; con->peer_features = server_feat; @@ -1581,7 +1631,9 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_reply.connect_seq)); if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) - set_bit(LOSSYTX, &con->flags); + set_bit(CON_FLAG_LOSSYTX, &con->flags); + + con->delay = 0; /* reset backoff memory */ prepare_read_tag(con); break; @@ -1712,9 +1764,6 @@ static int read_partial_message_bio(struct ceph_connection *con, void *p; int ret, left; - if (IS_ERR(bv)) - return PTR_ERR(bv); - left = min((int)(data_len - con->in_msg_pos.data_pos), (int)(bv->bv_len - con->in_msg_pos.page_pos)); @@ -1831,6 +1880,11 @@ static int read_partial_message(struct ceph_connection *con) else con->in_msg_pos.page_pos = 0; con->in_msg_pos.data_pos = 0; + +#ifdef CONFIG_BLOCK + if (m->bio) + init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); +#endif } /* front */ @@ -1847,10 +1901,6 @@ static int read_partial_message(struct ceph_connection *con) if (ret <= 0) return ret; } -#ifdef CONFIG_BLOCK - if (m->bio && !m->bio_iter) - init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); -#endif /* (page) data */ while (con->in_msg_pos.data_pos < data_len) { @@ -1861,7 +1911,7 @@ static int read_partial_message(struct ceph_connection *con) return ret; #ifdef CONFIG_BLOCK } else if (m->bio) { - + BUG_ON(!m->bio_iter); ret = read_partial_message_bio(con, &m->bio_iter, &m->bio_seg, data_len, do_datacrc); @@ -1919,7 +1969,7 @@ static void process_message(struct ceph_connection *con) con->in_msg->con = NULL; msg = con->in_msg; con->in_msg = NULL; - ceph_con_put(con); + con->ops->put(con); /* if first message, set peer_name */ if (con->peer_name.type == 0) @@ -1951,22 +2001,18 @@ static int try_write(struct ceph_connection *con) { int ret = 1; - dout("try_write start %p state %lu nref %d\n", con, con->state, - atomic_read(&con->nref)); + dout("try_write start %p state %lu\n", con, con->state); more: dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); /* open the socket first? */ - if (con->sock == NULL) { - clear_bit(NEGOTIATING, &con->state); - set_bit(CONNECTING, &con->state); + if (con->state == CON_STATE_PREOPEN) { + BUG_ON(con->sock); + con->state = CON_STATE_CONNECTING; con_out_kvec_reset(con); prepare_write_banner(con); - ret = prepare_write_connect(con); - if (ret < 0) - goto out; prepare_read_banner(con); BUG_ON(con->in_msg); @@ -2014,7 +2060,7 @@ more_kvec: } do_next: - if (!test_bit(CONNECTING, &con->state)) { + if (con->state == CON_STATE_OPEN) { /* is anything else pending? */ if (!list_empty(&con->out_queue)) { prepare_write_message(con); @@ -2024,14 +2070,15 @@ do_next: prepare_write_ack(con); goto more; } - if (test_and_clear_bit(KEEPALIVE_PENDING, &con->flags)) { + if (test_and_clear_bit(CON_FLAG_KEEPALIVE_PENDING, + &con->flags)) { prepare_write_keepalive(con); goto more; } } /* Nothing to do! */ - clear_bit(WRITE_PENDING, &con->flags); + clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); dout("try_write nothing else to write.\n"); ret = 0; out: @@ -2048,38 +2095,42 @@ static int try_read(struct ceph_connection *con) { int ret = -1; - if (!con->sock) - return 0; - - if (test_bit(STANDBY, &con->state)) +more: + dout("try_read start on %p state %lu\n", con, con->state); + if (con->state != CON_STATE_CONNECTING && + con->state != CON_STATE_NEGOTIATING && + con->state != CON_STATE_OPEN) return 0; - dout("try_read start on %p\n", con); + BUG_ON(!con->sock); -more: dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, con->in_base_pos); - /* - * process_connect and process_message drop and re-take - * con->mutex. make sure we handle a racing close or reopen. - */ - if (test_bit(CLOSED, &con->state) || - test_bit(OPENING, &con->state)) { - ret = -EAGAIN; + if (con->state == CON_STATE_CONNECTING) { + dout("try_read connecting\n"); + ret = read_partial_banner(con); + if (ret <= 0) + goto out; + ret = process_banner(con); + if (ret < 0) + goto out; + + BUG_ON(con->state != CON_STATE_CONNECTING); + con->state = CON_STATE_NEGOTIATING; + + /* Banner is good, exchange connection info */ + ret = prepare_write_connect(con); + if (ret < 0) + goto out; + prepare_read_connect(con); + + /* Send connection info before awaiting response */ goto out; } - if (test_bit(CONNECTING, &con->state)) { - if (!test_bit(NEGOTIATING, &con->state)) { - dout("try_read connecting\n"); - ret = read_partial_banner(con); - if (ret <= 0) - goto out; - ret = process_banner(con); - if (ret < 0) - goto out; - } + if (con->state == CON_STATE_NEGOTIATING) { + dout("try_read negotiating\n"); ret = read_partial_connect(con); if (ret <= 0) goto out; @@ -2089,6 +2140,8 @@ more: goto more; } + BUG_ON(con->state != CON_STATE_OPEN); + if (con->in_base_pos < 0) { /* * skipping + discarding content. @@ -2122,7 +2175,8 @@ more: prepare_read_ack(con); break; case CEPH_MSGR_TAG_CLOSE: - set_bit(CLOSED, &con->state); /* fixme */ + con_close_socket(con); + con->state = CON_STATE_CLOSED; goto out; default: goto bad_tag; @@ -2197,7 +2251,26 @@ static void con_work(struct work_struct *work) mutex_lock(&con->mutex); restart: - if (test_and_clear_bit(BACKOFF, &con->flags)) { + if (test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) { + switch (con->state) { + case CON_STATE_CONNECTING: + con->error_msg = "connection failed"; + break; + case CON_STATE_NEGOTIATING: + con->error_msg = "negotiation failed"; + break; + case CON_STATE_OPEN: + con->error_msg = "socket closed"; + break; + default: + dout("unrecognized con state %d\n", (int)con->state); + con->error_msg = "unrecognized con state"; + BUG(); + } + goto fault; + } + + if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) { dout("con_work %p backing off\n", con); if (queue_delayed_work(ceph_msgr_wq, &con->work, round_jiffies_relative(con->delay))) { @@ -2211,35 +2284,35 @@ restart: } } - if (test_bit(STANDBY, &con->state)) { + if (con->state == CON_STATE_STANDBY) { dout("con_work %p STANDBY\n", con); goto done; } - if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ - dout("con_work CLOSED\n"); - con_close_socket(con); + if (con->state == CON_STATE_CLOSED) { + dout("con_work %p CLOSED\n", con); + BUG_ON(con->sock); goto done; } - if (test_and_clear_bit(OPENING, &con->state)) { - /* reopen w/ new peer */ + if (con->state == CON_STATE_PREOPEN) { dout("con_work OPENING\n"); - con_close_socket(con); + BUG_ON(con->sock); } - if (test_and_clear_bit(SOCK_CLOSED, &con->flags)) - goto fault; - ret = try_read(con); if (ret == -EAGAIN) goto restart; - if (ret < 0) + if (ret < 0) { + con->error_msg = "socket error on read"; goto fault; + } ret = try_write(con); if (ret == -EAGAIN) goto restart; - if (ret < 0) + if (ret < 0) { + con->error_msg = "socket error on write"; goto fault; + } done: mutex_unlock(&con->mutex); @@ -2260,28 +2333,31 @@ fault: */ static void ceph_fault(struct ceph_connection *con) { + mutex_lock(&con->mutex); + pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); dout("fault %p state %lu to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); - if (test_bit(LOSSYTX, &con->flags)) { - dout("fault on LOSSYTX channel\n"); - goto out; - } - - mutex_lock(&con->mutex); - if (test_bit(CLOSED, &con->state)) - goto out_unlock; + BUG_ON(con->state != CON_STATE_CONNECTING && + con->state != CON_STATE_NEGOTIATING && + con->state != CON_STATE_OPEN); con_close_socket(con); + if (test_bit(CON_FLAG_LOSSYTX, &con->flags)) { + dout("fault on LOSSYTX channel, marking CLOSED\n"); + con->state = CON_STATE_CLOSED; + goto out_unlock; + } + if (con->in_msg) { BUG_ON(con->in_msg->con != con); con->in_msg->con = NULL; ceph_msg_put(con->in_msg); con->in_msg = NULL; - ceph_con_put(con); + con->ops->put(con); } /* Requeue anything that hasn't been acked */ @@ -2290,12 +2366,13 @@ static void ceph_fault(struct ceph_connection *con) /* If there are no messages queued or keepalive pending, place * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && - !test_bit(KEEPALIVE_PENDING, &con->flags)) { + !test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); - clear_bit(WRITE_PENDING, &con->flags); - set_bit(STANDBY, &con->state); + clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); + con->state = CON_STATE_STANDBY; } else { /* retry after a delay. */ + con->state = CON_STATE_PREOPEN; if (con->delay == 0) con->delay = BASE_DELAY_INTERVAL; else if (con->delay < MAX_DELAY_INTERVAL) @@ -2316,13 +2393,12 @@ static void ceph_fault(struct ceph_connection *con) * that when con_work restarts we schedule the * delay then. */ - set_bit(BACKOFF, &con->flags); + set_bit(CON_FLAG_BACKOFF, &con->flags); } } out_unlock: mutex_unlock(&con->mutex); -out: /* * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. @@ -2361,6 +2437,8 @@ void ceph_messenger_init(struct ceph_messenger *msgr, encode_my_addr(msgr); msgr->nocrc = nocrc; + atomic_set(&msgr->stopping, 0); + dout("%s %p\n", __func__, msgr); } EXPORT_SYMBOL(ceph_messenger_init); @@ -2368,13 +2446,12 @@ EXPORT_SYMBOL(ceph_messenger_init); static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ - if (test_and_clear_bit(STANDBY, &con->state)) { - mutex_lock(&con->mutex); + if (con->state == CON_STATE_STANDBY) { dout("clear_standby %p and ++connect_seq\n", con); + con->state = CON_STATE_PREOPEN; con->connect_seq++; - WARN_ON(test_bit(WRITE_PENDING, &con->flags)); - WARN_ON(test_bit(KEEPALIVE_PENDING, &con->flags)); - mutex_unlock(&con->mutex); + WARN_ON(test_bit(CON_FLAG_WRITE_PENDING, &con->flags)); + WARN_ON(test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)); } } @@ -2383,24 +2460,22 @@ static void clear_standby(struct ceph_connection *con) */ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) { - if (test_bit(CLOSED, &con->state)) { - dout("con_send %p closed, dropping %p\n", con, msg); - ceph_msg_put(msg); - return; - } - /* set src+dst */ msg->hdr.src = con->msgr->inst.name; - BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); - msg->needs_out_seq = true; - /* queue */ mutex_lock(&con->mutex); + if (con->state == CON_STATE_CLOSED) { + dout("con_send %p closed, dropping %p\n", con, msg); + ceph_msg_put(msg); + mutex_unlock(&con->mutex); + return; + } + BUG_ON(msg->con != NULL); - msg->con = ceph_con_get(con); + msg->con = con->ops->get(con); BUG_ON(msg->con == NULL); BUG_ON(!list_empty(&msg->list_head)); @@ -2411,12 +2486,13 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) le32_to_cpu(msg->hdr.front_len), le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len)); + + clear_standby(con); mutex_unlock(&con->mutex); /* if there wasn't anything waiting to send before, queue * new work */ - clear_standby(con); - if (test_and_set_bit(WRITE_PENDING, &con->flags) == 0) + if (test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) queue_con(con); } EXPORT_SYMBOL(ceph_con_send); @@ -2436,7 +2512,7 @@ void ceph_msg_revoke(struct ceph_msg *msg) dout("%s %p msg %p - was on queue\n", __func__, con, msg); list_del_init(&msg->list_head); BUG_ON(msg->con == NULL); - ceph_con_put(msg->con); + msg->con->ops->put(msg->con); msg->con = NULL; msg->hdr.seq = 0; @@ -2502,9 +2578,11 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg) void ceph_con_keepalive(struct ceph_connection *con) { dout("con_keepalive %p\n", con); + mutex_lock(&con->mutex); clear_standby(con); - if (test_and_set_bit(KEEPALIVE_PENDING, &con->flags) == 0 && - test_and_set_bit(WRITE_PENDING, &con->flags) == 0) + mutex_unlock(&con->mutex); + if (test_and_set_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags) == 0 && + test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); @@ -2646,7 +2724,7 @@ static bool ceph_con_in_msg_alloc(struct ceph_connection *con, con->in_msg = con->ops->alloc_msg(con, hdr, &skip); mutex_lock(&con->mutex); if (con->in_msg) { - con->in_msg->con = ceph_con_get(con); + con->in_msg->con = con->ops->get(con); BUG_ON(con->in_msg->con == NULL); } if (skip) @@ -2662,7 +2740,7 @@ static bool ceph_con_in_msg_alloc(struct ceph_connection *con, type, front_len); return false; } - con->in_msg->con = ceph_con_get(con); + con->in_msg->con = con->ops->get(con); BUG_ON(con->in_msg->con == NULL); con->in_msg->page_alignment = le16_to_cpu(hdr->data_off); }