tcp: TCP Fast Open Server - main code path
authorJerry Chu <hkchu@google.com>
Fri, 31 Aug 2012 12:29:13 +0000 (12:29 +0000)
committerDavid S. Miller <davem@davemloft.net>
Sat, 1 Sep 2012 00:02:19 +0000 (20:02 -0400)
This patch adds the main processing path to complete the TFO server
patches.

A TFO request (i.e., SYN+data packet with a TFO cookie option) first
gets processed in tcp_v4_conn_request(). If it passes the various TFO
checks by tcp_fastopen_check(), a child socket will be created right
away to be accepted by applications, rather than waiting for the 3WHS
to finish.

In additon to the use of TFO cookie, a simple max_qlen based scheme
is put in place to fend off spoofed TFO attack.

When a valid ACK comes back to tcp_rcv_state_process(), it will cause
the state of the child socket to switch from either TCP_SYN_RECV to
TCP_ESTABLISHED, or TCP_FIN_WAIT1 to TCP_FIN_WAIT2. At this time
retransmission will resume for any unack'ed (data, FIN,...) segments.

Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
net/ipv4/tcp_input.c
net/ipv4/tcp_ipv4.c

index d47d5fe8f3f080f59d88316e110b4a873cd4e55b..8c304a4007987c9794f851acc68c056a9b41ed78 100644 (file)
@@ -3127,6 +3127,12 @@ void tcp_rearm_rto(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
 
+       /* If the retrans timer is currently being used by Fast Open
+        * for SYN-ACK retrans purpose, stay put.
+        */
+       if (tp->fastopen_rsk)
+               return;
+
        if (!tp->packets_out) {
                inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
        } else {
@@ -5895,7 +5901,9 @@ discard:
                tcp_send_synack(sk);
 #if 0
                /* Note, we could accept data and URG from this segment.
-                * There are no obstacles to make this.
+                * There are no obstacles to make this (except that we must
+                * either change tcp_recvmsg() to prevent it from returning data
+                * before 3WHS completes per RFC793, or employ TCP Fast Open).
                 *
                 * However, if we ignore data in ACKless segments sometimes,
                 * we have no reasons to accept it sometimes.
@@ -5935,6 +5943,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
+       struct request_sock *req;
        int queued = 0;
 
        tp->rx_opt.saw_tstamp = 0;
@@ -5990,7 +5999,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                return 0;
        }
 
-       if (!tcp_validate_incoming(sk, skb, th, 0))
+       req = tp->fastopen_rsk;
+       if (req != NULL) {
+               BUG_ON(sk->sk_state != TCP_SYN_RECV &&
+                   sk->sk_state != TCP_FIN_WAIT1);
+
+               if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
+                       goto discard;
+       } else if (!tcp_validate_incoming(sk, skb, th, 0))
                return 0;
 
        /* step 5: check the ACK field */
@@ -6000,7 +6016,22 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                switch (sk->sk_state) {
                case TCP_SYN_RECV:
                        if (acceptable) {
-                               tp->copied_seq = tp->rcv_nxt;
+                               /* Once we leave TCP_SYN_RECV, we no longer
+                                * need req so release it.
+                                */
+                               if (req) {
+                                       reqsk_fastopen_remove(sk, req, false);
+                               } else {
+                                       /* Make sure socket is routed, for
+                                        * correct metrics.
+                                        */
+                                       icsk->icsk_af_ops->rebuild_header(sk);
+                                       tcp_init_congestion_control(sk);
+
+                                       tcp_mtup_init(sk);
+                                       tcp_init_buffer_space(sk);
+                                       tp->copied_seq = tp->rcv_nxt;
+                               }
                                smp_mb();
                                tcp_set_state(sk, TCP_ESTABLISHED);
                                sk->sk_state_change(sk);
@@ -6022,23 +6053,27 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                                if (tp->rx_opt.tstamp_ok)
                                        tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
 
-                               /* Make sure socket is routed, for
-                                * correct metrics.
-                                */
-                               icsk->icsk_af_ops->rebuild_header(sk);
-
-                               tcp_init_metrics(sk);
-
-                               tcp_init_congestion_control(sk);
+                               if (req) {
+                                       /* Re-arm the timer because data may
+                                        * have been sent out. This is similar
+                                        * to the regular data transmission case
+                                        * when new data has just been ack'ed.
+                                        *
+                                        * (TFO) - we could try to be more
+                                        * aggressive and retranmitting any data
+                                        * sooner based on when they were sent
+                                        * out.
+                                        */
+                                       tcp_rearm_rto(sk);
+                               } else
+                                       tcp_init_metrics(sk);
 
                                /* Prevent spurious tcp_cwnd_restart() on
                                 * first data packet.
                                 */
                                tp->lsndtime = tcp_time_stamp;
 
-                               tcp_mtup_init(sk);
                                tcp_initialize_rcv_mss(sk);
-                               tcp_init_buffer_space(sk);
                                tcp_fast_path_on(tp);
                        } else {
                                return 1;
@@ -6046,6 +6081,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                        break;
 
                case TCP_FIN_WAIT1:
+                       /* If we enter the TCP_FIN_WAIT1 state and we are a
+                        * Fast Open socket and this is the first acceptable
+                        * ACK we have received, this would have acknowledged
+                        * our SYNACK so stop the SYNACK timer.
+                        */
+                       if (acceptable && req != NULL) {
+                               /* We no longer need the request sock. */
+                               reqsk_fastopen_remove(sk, req, false);
+                               tcp_rearm_rto(sk);
+                       }
                        if (tp->snd_una == tp->write_seq) {
                                struct dst_entry *dst;
 
index bb148dee1edd91035a1c6d1d6a613ff8c18fc164..e64abed249ccabd37660e4f707b9e488bade692a 100644 (file)
@@ -352,6 +352,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
        const int code = icmp_hdr(icmp_skb)->code;
        struct sock *sk;
        struct sk_buff *skb;
+       struct request_sock *req;
        __u32 seq;
        __u32 remaining;
        int err;
@@ -394,9 +395,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 
        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
+       req = tp->fastopen_rsk;
        seq = ntohl(th->seq);
        if (sk->sk_state != TCP_LISTEN &&
-           !between(seq, tp->snd_una, tp->snd_nxt)) {
+           !between(seq, tp->snd_una, tp->snd_nxt) &&
+           (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
+               /* For a Fast Open socket, allow seq to be snt_isn. */
                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }
@@ -435,6 +439,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
                    !icsk->icsk_backoff)
                        break;
 
+               /* XXX (TFO) - revisit the following logic for TFO */
+
                if (sock_owned_by_user(sk))
                        break;
 
@@ -466,6 +472,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
                goto out;
        }
 
+       /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
+        * than following the TCP_SYN_RECV case and closing the socket,
+        * we ignore the ICMP error and keep trying like a fully established
+        * socket. Is this the right thing to do?
+        */
+       if (req && req->sk == NULL)
+               goto out;
+
        switch (sk->sk_state) {
                struct request_sock *req, **prev;
        case TCP_LISTEN:
@@ -498,7 +512,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 
        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen.
-                              It can f.e. if SYNs crossed.
+                              It can f.e. if SYNs crossed,
+                              or Fast Open.
                             */
                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;
@@ -809,8 +824,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
 {
-       tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
-                       tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
+       /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
+        * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
+        */
+       tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
+                       tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
+                       tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
                        req->ts_recent,
                        0,
                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
@@ -1272,6 +1291,178 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 };
 #endif
 
+static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
+                              struct request_sock *req,
+                              struct tcp_fastopen_cookie *foc,
+                              struct tcp_fastopen_cookie *valid_foc)
+{
+       bool skip_cookie = false;
+       struct fastopen_queue *fastopenq;
+
+       if (likely(!fastopen_cookie_present(foc))) {
+               /* See include/net/tcp.h for the meaning of these knobs */
+               if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
+                   ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
+                   (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
+                       skip_cookie = true; /* no cookie to validate */
+               else
+                       return false;
+       }
+       fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
+       /* A FO option is present; bump the counter. */
+       NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
+
+       /* Make sure the listener has enabled fastopen, and we don't
+        * exceed the max # of pending TFO requests allowed before trying
+        * to validating the cookie in order to avoid burning CPU cycles
+        * unnecessarily.
+        *
+        * XXX (TFO) - The implication of checking the max_qlen before
+        * processing a cookie request is that clients can't differentiate
+        * between qlen overflow causing Fast Open to be disabled
+        * temporarily vs a server not supporting Fast Open at all.
+        */
+       if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
+           fastopenq == NULL || fastopenq->max_qlen == 0)
+               return false;
+
+       if (fastopenq->qlen >= fastopenq->max_qlen) {
+               struct request_sock *req1;
+               spin_lock(&fastopenq->lock);
+               req1 = fastopenq->rskq_rst_head;
+               if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
+                       spin_unlock(&fastopenq->lock);
+                       NET_INC_STATS_BH(sock_net(sk),
+                           LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
+                       /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
+                       foc->len = -1;
+                       return false;
+               }
+               fastopenq->rskq_rst_head = req1->dl_next;
+               fastopenq->qlen--;
+               spin_unlock(&fastopenq->lock);
+               reqsk_free(req1);
+       }
+       if (skip_cookie) {
+               tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+               return true;
+       }
+       if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
+               if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
+                       tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
+                       if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
+                           memcmp(&foc->val[0], &valid_foc->val[0],
+                           TCP_FASTOPEN_COOKIE_SIZE) != 0)
+                               return false;
+                       valid_foc->len = -1;
+               }
+               /* Acknowledge the data received from the peer. */
+               tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+               return true;
+       } else if (foc->len == 0) { /* Client requesting a cookie */
+               tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
+               NET_INC_STATS_BH(sock_net(sk),
+                   LINUX_MIB_TCPFASTOPENCOOKIEREQD);
+       } else {
+               /* Client sent a cookie with wrong size. Treat it
+                * the same as invalid and return a valid one.
+                */
+               tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
+       }
+       return false;
+}
+
+static int tcp_v4_conn_req_fastopen(struct sock *sk,
+                                   struct sk_buff *skb,
+                                   struct sk_buff *skb_synack,
+                                   struct request_sock *req,
+                                   struct request_values *rvp)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+       const struct inet_request_sock *ireq = inet_rsk(req);
+       struct sock *child;
+
+       req->retrans = 0;
+       req->sk = NULL;
+
+       child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+       if (child == NULL) {
+               NET_INC_STATS_BH(sock_net(sk),
+                                LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+               kfree_skb(skb_synack);
+               return -1;
+       }
+       ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
+                       ireq->rmt_addr, ireq->opt);
+       /* XXX (TFO) - is it ok to ignore error and continue? */
+
+       spin_lock(&queue->fastopenq->lock);
+       queue->fastopenq->qlen++;
+       spin_unlock(&queue->fastopenq->lock);
+
+       /* Initialize the child socket. Have to fix some values to take
+        * into account the child is a Fast Open socket and is created
+        * only out of the bits carried in the SYN packet.
+        */
+       tp = tcp_sk(child);
+
+       tp->fastopen_rsk = req;
+       /* Do a hold on the listner sk so that if the listener is being
+        * closed, the child that has been accepted can live on and still
+        * access listen_lock.
+        */
+       sock_hold(sk);
+       tcp_rsk(req)->listener = sk;
+
+       /* RFC1323: The window in SYN & SYN/ACK segments is never
+        * scaled. So correct it appropriately.
+        */
+       tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
+
+       /* Activate the retrans timer so that SYNACK can be retransmitted.
+        * The request socket is not added to the SYN table of the parent
+        * because it's been added to the accept queue directly.
+        */
+       inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
+           TCP_TIMEOUT_INIT, TCP_RTO_MAX);
+
+       /* Add the child socket directly into the accept queue */
+       inet_csk_reqsk_queue_add(sk, req, child);
+
+       /* Now finish processing the fastopen child socket. */
+       inet_csk(child)->icsk_af_ops->rebuild_header(child);
+       tcp_init_congestion_control(child);
+       tcp_mtup_init(child);
+       tcp_init_buffer_space(child);
+       tcp_init_metrics(child);
+
+       /* Queue the data carried in the SYN packet. We need to first
+        * bump skb's refcnt because the caller will attempt to free it.
+        *
+        * XXX (TFO) - we honor a zero-payload TFO request for now.
+        * (Any reason not to?)
+        */
+       if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
+               /* Don't queue the skb if there is no payload in SYN.
+                * XXX (TFO) - How about SYN+FIN?
+                */
+               tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+       } else {
+               skb = skb_get(skb);
+               skb_dst_drop(skb);
+               __skb_pull(skb, tcp_hdr(skb)->doff * 4);
+               skb_set_owner_r(skb, child);
+               __skb_queue_tail(&child->sk_receive_queue, skb);
+               tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+       }
+       sk->sk_data_ready(sk, 0);
+       bh_unlock_sock(child);
+       sock_put(child);
+       WARN_ON(req->sk == NULL);
+       return 0;
+}
+
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 {
        struct tcp_extend_values tmp_ext;
@@ -1285,6 +1476,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
        __be32 daddr = ip_hdr(skb)->daddr;
        __u32 isn = TCP_SKB_CB(skb)->when;
        bool want_cookie = false;
+       struct flowi4 fl4;
+       struct tcp_fastopen_cookie foc = { .len = -1 };
+       struct tcp_fastopen_cookie valid_foc = { .len = -1 };
+       struct sk_buff *skb_synack;
+       int do_fastopen;
 
        /* Never answer to SYNs send to broadcast or multicast */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1319,7 +1515,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
        tmp_opt.user_mss  = tp->rx_opt.user_mss;
-       tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
+       tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
+           want_cookie ? NULL : &foc);
 
        if (tmp_opt.cookie_plus > 0 &&
            tmp_opt.saw_tstamp &&
@@ -1377,8 +1574,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
                req->cookie_ts = tmp_opt.tstamp_ok;
        } else if (!isn) {
-               struct flowi4 fl4;
-
                /* VJ's idea. We save last timestamp seen
                 * from the destination in peer table, when entering
                 * state TIME-WAIT, and check against it before
@@ -1419,14 +1614,52 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
        tcp_rsk(req)->snt_isn = isn;
        tcp_rsk(req)->snt_synack = tcp_time_stamp;
 
-       if (tcp_v4_send_synack(sk, dst, req,
-                              (struct request_values *)&tmp_ext,
-                              skb_get_queue_mapping(skb),
-                              want_cookie) ||
-           want_cookie)
+       if (dst == NULL) {
+               dst = inet_csk_route_req(sk, &fl4, req);
+               if (dst == NULL)
+                       goto drop_and_free;
+       }
+       do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
+
+       /* We don't call tcp_v4_send_synack() directly because we need
+        * to make sure a child socket can be created successfully before
+        * sending back synack!
+        *
+        * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
+        * (or better yet, call tcp_send_synack() in the child context
+        * directly, but will have to fix bunch of other code first)
+        * after syn_recv_sock() except one will need to first fix the
+        * latter to remove its dependency on the current implementation
+        * of tcp_v4_send_synack()->tcp_select_initial_window().
+        */
+       skb_synack = tcp_make_synack(sk, dst, req,
+           (struct request_values *)&tmp_ext,
+           fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
+
+       if (skb_synack) {
+               __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
+               skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
+       } else
+               goto drop_and_free;
+
+       if (likely(!do_fastopen)) {
+               int err;
+               err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
+                    ireq->rmt_addr, ireq->opt);
+               err = net_xmit_eval(err);
+               if (err || want_cookie)
+                       goto drop_and_free;
+
+               tcp_rsk(req)->listener = NULL;
+               /* Add the request_sock to the SYN table */
+               inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+               if (fastopen_cookie_present(&foc) && foc.len != 0)
+                       NET_INC_STATS_BH(sock_net(sk),
+                           LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+       } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
+           (struct request_values *)&tmp_ext))
                goto drop_and_free;
 
-       inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        return 0;
 
 drop_and_release:
@@ -1977,6 +2210,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
                         tcp_cookie_values_release);
                tp->cookie_values = NULL;
        }
+       BUG_ON(tp->fastopen_rsk != NULL);
 
        /* If socket is aborted during connect operation */
        tcp_free_fastopen_req(tp);
@@ -2425,6 +2659,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
        const struct tcp_sock *tp = tcp_sk(sk);
        const struct inet_connection_sock *icsk = inet_csk(sk);
        const struct inet_sock *inet = inet_sk(sk);
+       struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
        __be32 dest = inet->inet_daddr;
        __be32 src = inet->inet_rcv_saddr;
        __u16 destp = ntohs(inet->inet_dport);
@@ -2469,7 +2704,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
                jiffies_to_clock_t(icsk->icsk_ack.ato),
                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
                tp->snd_cwnd,
-               tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
+               sk->sk_state == TCP_LISTEN ?
+                   (fastopenq ? fastopenq->max_qlen : 0) :
+                   (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
                len);
 }