tcp: TCP Fast Open Server - support TFO listeners

author Jerry Chu <hkchu@google.com>

Fri, 31 Aug 2012 12:29:12 +0000 (12:29 +0000)

committer David S. Miller <davem@davemloft.net>

Sat, 1 Sep 2012 00:02:19 +0000 (20:02 -0400)
author Jerry Chu <hkchu@google.com>
Fri, 31 Aug 2012 12:29:12 +0000 (12:29 +0000)
committer David S. Miller <davem@davemloft.net>
Sat, 1 Sep 2012 00:02:19 +0000 (20:02 -0400)
diff --git a/include/net/request_sock.h b/include/net/request_sock.h

index c3cdd6c9f448e63b1d8094929c119cb56df9798a..b01d8dd9ee7ce99eb1bf702f00007ae456f28c16 100644 (file)
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -226,19 +226,6 @@ static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue
         return req;
  }
  
-static inline struct sock *reqsk_queue_get_child(struct request_sock_queue *queue,
-                                                struct sock *parent)
-{
-       struct request_sock *req = reqsk_queue_remove(queue);
-       struct sock *child = req->sk;
-
-       WARN_ON(child == NULL);
-
-       sk_acceptq_removed(parent);
-       __reqsk_free(req);
-       return child;
-}
-
  static inline int reqsk_queue_removed(struct request_sock_queue *queue,
                                       struct request_sock *req)
  {
diff --git a/include/net/tcp.h b/include/net/tcp.h

index 9f8821e3293a81904456bc151fe019bcc3e8be5e..1421b02a7905ffd3913eb33a2b8609faaea6ae05 100644 (file)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -424,7 +424,8 @@ extern enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *
                                                      const struct tcphdr *th);
  extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb,
                                    struct request_sock *req,
-                                  struct request_sock **prev);
+                                  struct request_sock **prev,
+                                  bool fastopen);
  extern int tcp_child_process(struct sock *parent, struct sock *child,
                              struct sk_buff *skb);
  extern bool tcp_use_frto(struct sock *sk);
@@ -478,7 +479,8 @@ extern int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
  extern int tcp_connect(struct sock *sk);
  extern struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
                                         struct request_sock *req,
-                                       struct request_values *rvp);
+                                       struct request_values *rvp,
+                                       struct tcp_fastopen_cookie *foc);
  extern int tcp_disconnect(struct sock *sk, int flags);
  
  void tcp_connect_init(struct sock *sk);
diff --git a/net/core/request_sock.c b/net/core/request_sock.c

index 9b570a6a33c5d8c52d777e160742dc31ec350c16..c31d9e8668c30346894adbf3be55eed4beeb1258 100644 (file)
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -15,6 +15,7 @@
  #include <linux/random.h>
  #include <linux/slab.h>
  #include <linux/string.h>
+#include <linux/tcp.h>
  #include <linux/vmalloc.h>
  
  #include <net/request_sock.h>
@@ -130,3 +131,97 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
                 kfree(lopt);
  }
  
+/*
+ * This function is called to set a Fast Open socket's "fastopen_rsk" field
+ * to NULL when a TFO socket no longer needs to access the request_sock.
+ * This happens only after 3WHS has been either completed or aborted (e.g.,
+ * RST is received).
+ *
+ * Before TFO, a child socket is created only after 3WHS is completed,
+ * hence it never needs to access the request_sock. Things get a lot more
+ * complex with TFO. A child socket, accepted or not, has to access its
+ * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
+ * until 3WHS is either completed or aborted. Afterwards the req will stay
+ * until either the child socket is accepted, or in the rare case when the
+ * listener is closed before the child is accepted.
+ *
+ * In short, a request socket is only freed after BOTH 3WHS has completed
+ * (or aborted) and the child socket has been accepted (or listener closed).
+ * When a child socket is accepted, its corresponding req->sk is set to
+ * NULL since it's no longer needed. More importantly, "req->sk == NULL"
+ * will be used by the code below to determine if a child socket has been
+ * accepted or not, and the check is protected by the fastopenq->lock
+ * described below.
+ *
+ * Note that fastopen_rsk is only accessed from the child socket's context
+ * with its socket lock held. But a request_sock (req) can be accessed by
+ * both its child socket through fastopen_rsk, and a listener socket through
+ * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
+ * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
+ * Only in the rare case when both the listener and the child locks are held,
+ * e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
+ * The lock also protects other fields such as fastopenq->qlen, which is
+ * decremented by this function when fastopen_rsk is no longer needed.
+ *
+ * Note that another solution was to simply use the existing socket lock
+ * from the listener. But first socket lock is difficult to use. It is not
+ * a simple spin lock - one must consider sock_owned_by_user() and arrange
+ * to use sk_add_backlog() stuff. But what really makes it infeasible is the
+ * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
+ * acquire a child's lock while holding listener's socket lock. A corner
+ * case might also exist in tcp_v4_hnd_req() that will trigger this locking
+ * order.
+ *
+ * When a TFO req is created, it needs to sock_hold its listener to prevent
+ * the latter data structure from going away.
+ *
+ * This function also sets "treq->listener" to NULL and drops the reference
+ * on the listener socket. treq->listener is used by the listener so it is
+ * protected by the fastopenq->lock in this function.
+ */
+void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
+                          bool reset)   /* true when removal is due to a RST */
+{
+       struct sock *lsk = tcp_rsk(req)->listener;
+       struct fastopen_queue *fastopenq =
+           inet_csk(lsk)->icsk_accept_queue.fastopenq;
+
+       /* Caller is expected to hold sk's socket lock. Not asserted here:
+        * spin_is_locked() is always 0 on UP builds w/o spinlock debugging,
+        * so BUG_ON(!spin_is_locked()) would panic on a correctly locked sk.
+        */
+       tcp_sk(sk)->fastopen_rsk = NULL;
+       spin_lock_bh(&fastopenq->lock);
+       fastopenq->qlen--;
+       tcp_rsk(req)->listener = NULL;
+       if (req->sk)    /* the child socket hasn't been accepted yet */
+               goto out;
+
+       if (!reset || lsk->sk_state != TCP_LISTEN) {
+               /* If the listener has been closed don't bother with the
+                * special RST handling below.
+                */
+               spin_unlock_bh(&fastopenq->lock);
+               sock_put(lsk);
+               reqsk_free(req);
+               return;
+       }
+       /* Wait for 60secs before removing a req that has triggered RST.
+        * This is a simple defense against TFO spoofing attack - by
+        * counting the req against fastopen.max_qlen, and disabling
+        * TFO when the qlen exceeds max_qlen.
+        *
+        * For more details see CoNext'11 "TCP Fast Open" paper.
+        */
+       req->expires = jiffies + 60*HZ;
+       if (fastopenq->rskq_rst_head == NULL)
+               fastopenq->rskq_rst_head = req;
+       else
+               fastopenq->rskq_rst_tail->dl_next = req;
+
+       req->dl_next = NULL;
+       fastopenq->rskq_rst_tail = req;
+       fastopenq->qlen++;
+out:
+       spin_unlock_bh(&fastopenq->lock);
+       sock_put(lsk);
+}
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c

index 6681ccf5c3eeae5bbbca030cf2d09cf826ad335c..4f70ef0b946dba1b7efa71ad9b0d6debb2f713c9 100644 (file)
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -149,6 +149,11 @@ void inet_sock_destruct(struct sock *sk)
                 pr_err("Attempt to release alive inet socket %p\n", sk);
                 return;
         }
+       if (sk->sk_type == SOCK_STREAM) {
+               struct fastopen_queue *fastopenq =
+                       inet_csk(sk)->icsk_accept_queue.fastopenq;
+               kfree(fastopenq);
+       }
  
         WARN_ON(atomic_read(&sk->sk_rmem_alloc));
         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
@@ -212,6 +217,26 @@ int inet_listen(struct socket *sock, int backlog)
          * we can only allow the backlog to be adjusted.
          */
         if (old_state != TCP_LISTEN) {
+               /* Check special setups for testing purpose to enable TFO w/o
+                * requiring TCP_FASTOPEN sockopt.
+                * Note that only TCP sockets (SOCK_STREAM) will reach here.
+                * Also fastopenq may already have been allocated because this
+                * socket was in TCP_LISTEN state previously but was
+                * shutdown() (rather than close()).
+                */
+               if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
+                   inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
+                       if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
+                               err = fastopen_init_queue(sk, backlog);
+                       else if ((sysctl_tcp_fastopen &
+                                 TFO_SERVER_WO_SOCKOPT2) != 0)
+                               err = fastopen_init_queue(sk,
+                                   ((uint)sysctl_tcp_fastopen) >> 16);
+                       else
+                               err = 0;
+                       if (err)
+                               goto out;
+               }
                 err = inet_csk_listen_start(sk, backlog);
                 if (err)
                         goto out;
@@ -701,7 +726,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
  
         sock_rps_record_flow(sk2);
         WARN_ON(!((1 << sk2->sk_state) &
-                 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+                 (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+                 TCPF_CLOSE_WAIT | TCPF_CLOSE)));
  
         sock_graft(sk2, newsock);
  
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c

index 7f75f21d7b8346e0279364c511117582f3c1f342..8464b79c493f84f61c499d00e99c0eea0412a368 100644 (file)
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -283,7 +283,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
  struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
  {
         struct inet_connection_sock *icsk = inet_csk(sk);
+       struct request_sock_queue *queue = &icsk->icsk_accept_queue;
         struct sock *newsk;
+       struct request_sock *req;
         int error;
  
         lock_sock(sk);
@@ -296,7 +298,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
                 goto out_err;
  
         /* Find already established connection */
-       if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+       if (reqsk_queue_empty(queue)) {
                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
  
                 /* If this is a non blocking socket don't sleep */
@@ -308,14 +310,32 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
                 if (error)
                         goto out_err;
         }
-
-       newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
-       WARN_ON(newsk->sk_state == TCP_SYN_RECV);
+       req = reqsk_queue_remove(queue);
+       newsk = req->sk;
+
+       sk_acceptq_removed(sk);
+       if (sk->sk_type == SOCK_STREAM && queue->fastopenq != NULL) {
+               spin_lock_bh(&queue->fastopenq->lock);
+               if (tcp_rsk(req)->listener) {
+                       /* We are still waiting for the final ACK from 3WHS
+                        * so can't free req now. Instead, we set req->sk to
+                        * NULL to signify that the child socket is taken
+                        * so reqsk_fastopen_remove() will free the req
+                        * when 3WHS finishes (or is aborted).
+                        */
+                       req->sk = NULL;
+                       req = NULL;
+               }
+               spin_unlock_bh(&queue->fastopenq->lock);
+       }
  out:
         release_sock(sk);
+       if (req)
+               __reqsk_free(req);
         return newsk;
  out_err:
         newsk = NULL;
+       req = NULL;
         *err = error;
         goto out;
  }
@@ -720,13 +740,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
  void inet_csk_listen_stop(struct sock *sk)
  {
         struct inet_connection_sock *icsk = inet_csk(sk);
+       struct request_sock_queue *queue = &icsk->icsk_accept_queue;
         struct request_sock *acc_req;
         struct request_sock *req;
  
         inet_csk_delete_keepalive_timer(sk);
  
         /* make all the listen_opt local to us */
-       acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+       acc_req = reqsk_queue_yank_acceptq(queue);
  
         /* Following specs, it would be better either to send FIN
          * (and enter FIN-WAIT-1, it is normal close)
@@ -736,7 +757,7 @@ void inet_csk_listen_stop(struct sock *sk)
          * To be honest, we are not able to make either
          * of the variants now.                 --ANK
          */
-       reqsk_queue_destroy(&icsk->icsk_accept_queue);
+       reqsk_queue_destroy(queue);
  
         while ((req = acc_req) != NULL) {
                 struct sock *child = req->sk;
@@ -754,6 +775,19 @@ void inet_csk_listen_stop(struct sock *sk)
  
                 percpu_counter_inc(sk->sk_prot->orphan_count);
  
+               if (sk->sk_type == SOCK_STREAM && tcp_rsk(req)->listener) {
+                       BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+                       BUG_ON(sk != tcp_rsk(req)->listener);
+
+                       /* Paranoid, to prevent race condition if
+                        * an inbound pkt destined for child is
+                        * blocked by sock lock in tcp_v4_rcv().
+                        * Also to satisfy an assertion in
+                        * tcp_v4_destroy_sock().
+                        */
+                       tcp_sk(child)->fastopen_rsk = NULL;
+                       sock_put(sk);
+               }
                 inet_csk_destroy_sock(child);
  
                 bh_unlock_sock(child);
@@ -763,6 +797,17 @@ void inet_csk_listen_stop(struct sock *sk)
                 sk_acceptq_removed(sk);
                 __reqsk_free(req);
         }
+       if (queue->fastopenq != NULL) {
+               /* Free all the reqs queued in rskq_rst_head. */
+               spin_lock_bh(&queue->fastopenq->lock);
+               acc_req = queue->fastopenq->rskq_rst_head;
+               queue->fastopenq->rskq_rst_head = NULL;
+               spin_unlock_bh(&queue->fastopenq->lock);
+               while ((req = acc_req) != NULL) {
+                       acc_req = req->dl_next;
+                       __reqsk_free(req);
+               }
+       }
         WARN_ON(sk->sk_ack_backlog);
  }
  EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c

index 650e1528e1e669a7828ab94bb4a5fa4bf65c1186..ba48e799b031b3a45c902dff2f8690a8ce2627fa 100644 (file)
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -319,6 +319,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
         ireq->tstamp_ok         = tcp_opt.saw_tstamp;
         req->ts_recent          = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
         treq->snt_synack        = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
+       treq->listener          = NULL;
  
         /* We throwed the options of the initial SYN away, so we hope
          * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c

index 2109ff4a1dafd489fbe0e2240075432df4517374..df83d744e380f6d72dafe25bff7fe6e4382641e8 100644 (file)
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -486,8 +486,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
         if (sk->sk_shutdown & RCV_SHUTDOWN)
                 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
  
-       /* Connected? */
-       if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+       /* Connected or passive Fast Open socket? */
+       if (sk->sk_state != TCP_SYN_SENT &&
+           (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
                 int target = sock_rcvlowat(sk, 0, INT_MAX);
  
                 if (tp->urg_seq == tp->copied_seq &&
@@ -840,10 +841,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
         ssize_t copied;
         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
  
-       /* Wait for a connection to finish. */
-       if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+       /* Wait for a connection to finish. One exception is TCP Fast Open
+        * (passive side) where data is allowed to be sent before a connection
+        * is fully established.
+        */
+       if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+           !tcp_passive_fastopen(sk)) {
                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
                         goto out_err;
+       }
  
         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
  
@@ -1042,10 +1048,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
  
         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
  
-       /* Wait for a connection to finish. */
-       if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+       /* Wait for a connection to finish. One exception is TCP Fast Open
+        * (passive side) where data is allowed to be sent before a connection
+        * is fully established.
+        */
+       if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+           !tcp_passive_fastopen(sk)) {
                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
                         goto do_error;
+       }
  
         if (unlikely(tp->repair)) {
                 if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -2144,6 +2155,10 @@ void tcp_close(struct sock *sk, long timeout)
                  * they look as CLOSING or LAST_ACK for Linux)
                  * Probably, I missed some more holelets.
                  *                                              --ANK
+                * XXX (TFO) - To start off we don't support SYN+ACK+FIN
+                * in a single packet! (May consider it later but will
+                * probably need API support or TCP_CORK SYN-ACK until
+                * data is written and socket is closed.)
                  */
                 tcp_send_fin(sk);
         }
@@ -2215,8 +2230,16 @@ adjudge_to_death:
                 }
         }
  
-       if (sk->sk_state == TCP_CLOSE)
+       if (sk->sk_state == TCP_CLOSE) {
+               struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+               /* We could get here with a non-NULL req if the socket is
+                * aborted (e.g., closed with unread data) before 3WHS
+                * finishes.
+                */
+               if (req != NULL)
+                       reqsk_fastopen_remove(sk, req, false);
                 inet_csk_destroy_sock(sk);
+       }
         /* Otherwise, socket is reprieved until protocol close. */
  
  out:
@@ -2688,6 +2711,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                 else
                         icsk->icsk_user_timeout = msecs_to_jiffies(val);
                 break;
+
+       case TCP_FASTOPEN:
+               if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
+                   TCPF_LISTEN)))
+                       err = fastopen_init_queue(sk, val);
+               else
+                       err = -EINVAL;
+               break;
         default:
                 err = -ENOPROTOOPT;
                 break;
@@ -3501,11 +3532,15 @@ EXPORT_SYMBOL(tcp_cookie_generator);
  
  void tcp_done(struct sock *sk)
  {
+       struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+
         if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
                 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
  
         tcp_set_state(sk, TCP_CLOSE);
         tcp_clear_xmit_timers(sk);
+       if (req != NULL)
+               reqsk_fastopen_remove(sk, req, false);
  
         sk->sk_shutdown = SHUTDOWN_MASK;
  
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c

index 36f02f954ac136e0efe7c2d22aedbda0014a8c58..bb148dee1edd91035a1c6d1d6a613ff8c18fc164 100644 (file)
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -839,7 +839,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
                 return -1;
  
-       skb = tcp_make_synack(sk, dst, req, rvp);
+       skb = tcp_make_synack(sk, dst, req, rvp, NULL);
  
         if (skb) {
                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
@@ -1554,7 +1554,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
                                                        iph->saddr, iph->daddr);
         if (req)
-               return tcp_check_req(sk, skb, req, prev);
+               return tcp_check_req(sk, skb, req, prev, false);
  
         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
                         th->source, iph->daddr, th->dest, inet_iif(skb));
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c

index 6ff7f10dce9d56c2f99f0cb13dab38f69eec4619..e965319d610b20d8d20e3b3c80d8f030d6ddf237 100644 (file)
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -507,6 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
                         newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
                 newtp->rx_opt.mss_clamp = req->mss;
                 TCP_ECN_openreq_child(newtp, req);
+               newtp->fastopen_rsk = NULL;
  
                 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
         }
@@ -515,13 +516,18 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
  EXPORT_SYMBOL(tcp_create_openreq_child);
  
  /*
- *     Process an incoming packet for SYN_RECV sockets represented
- *     as a request_sock.
+ * Process an incoming packet for SYN_RECV sockets represented as a
+ * request_sock. Normally sk is the listener socket but for TFO it
+ * points to the child socket.
+ *
+ * XXX (TFO) - The current impl contains a special check for ack
+ * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
   */
  
  struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                            struct request_sock *req,
-                          struct request_sock **prev)
+                          struct request_sock **prev,
+                          bool fastopen)
  {
         struct tcp_options_received tmp_opt;
         const u8 *hash_location;
@@ -530,6 +536,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
         __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
         bool paws_reject = false;
  
+       BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
+
         tmp_opt.saw_tstamp = 0;
         if (th->doff > (sizeof(struct tcphdr)>>2)) {
                 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
@@ -565,6 +573,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                  *
                  * Enforce "SYN-ACK" according to figure 8, figure 6
                  * of RFC793, fixed by RFC1122.
+                *
+                * Note that even if there is new data in the SYN packet
+                * it will be thrown away too.
                  */
                 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
                 return NULL;
@@ -622,9 +633,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
          *                  sent (the segment carries an unacceptable ACK) ...
          *                  a reset is sent."
          *
-        * Invalid ACK: reset will be sent by listening socket
+        * Invalid ACK: reset will be sent by listening socket.
+        * Note that the ACK validity check for a Fast Open socket is done
+        * elsewhere and is checked directly against the child socket rather
+        * than req because user data may have been sent out.
          */
-       if ((flg & TCP_FLAG_ACK) &&
+       if ((flg & TCP_FLAG_ACK) && !fastopen &&
             (TCP_SKB_CB(skb)->ack_seq !=
              tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
                 return sk;
@@ -637,7 +651,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
         /* RFC793: "first check sequence number". */
  
         if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
-                                         tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
+                                         tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
                 /* Out of window: send ACK and drop. */
                 if (!(flg & TCP_FLAG_RST))
                         req->rsk_ops->send_ack(sk, skb, req);
@@ -648,7 +662,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
  
         /* In sequence, PAWS is OK. */
  
-       if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
+       if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
                 req->ts_recent = tmp_opt.rcv_tsval;
  
         if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
@@ -667,10 +681,19 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
  
         /* ACK sequence verified above, just make sure ACK is
          * set.  If ACK not set, just silently drop the packet.
+        *
+        * XXX (TFO) - if we ever allow "data after SYN", the
+        * following check needs to be removed.
          */
         if (!(flg & TCP_FLAG_ACK))
                 return NULL;
  
+       /* For Fast Open no more processing is needed (sk is the
+        * child socket).
+        */
+       if (fastopen)
+               return sk;
+
         /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
         if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
             TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
@@ -706,11 +729,21 @@ listen_overflow:
         }
  
  embryonic_reset:
-       NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
-       if (!(flg & TCP_FLAG_RST))
+       if (!(flg & TCP_FLAG_RST)) {
+               /* Received a bad SYN pkt - for TFO we try not to reset
+                * the local connection unless it's really necessary to
+                * avoid becoming vulnerable to outside attack aiming at
+                * resetting legit local connections.
+                */
                 req->rsk_ops->send_reset(sk, skb);
-
-       inet_csk_reqsk_queue_drop(sk, req, prev);
+       } else if (fastopen) { /* received a valid RST pkt */
+               reqsk_fastopen_remove(sk, req, true);
+               tcp_reset(sk);
+       }
+       if (!fastopen) {
+               inet_csk_reqsk_queue_drop(sk, req, prev);
+               NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+       }
         return NULL;
  }
  EXPORT_SYMBOL(tcp_check_req);
@@ -719,6 +752,12 @@ EXPORT_SYMBOL(tcp_check_req);
   * Queue segment on the new socket if the new socket is active,
   * otherwise we just shortcircuit this and continue with
   * the new socket.
+ *
+ * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
+ * when entering. But other states are possible due to a race condition
+ * where after __inet_lookup_established() fails but before the listener
+ * lock is obtained, other packets cause the same connection to
+ * be created.
   */
  
  int tcp_child_process(struct sock *parent, struct sock *child,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c

index d04632673a9e5f27725731e420d0997d91259aef..9383b51f3efcdcaae193ef219adc30ec53813b58 100644 (file)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -702,7 +702,8 @@ static unsigned int tcp_synack_options(struct sock *sk,
                                    unsigned int mss, struct sk_buff *skb,
                                    struct tcp_out_options *opts,
                                    struct tcp_md5sig_key **md5,
-                                  struct tcp_extend_values *xvp)
+                                  struct tcp_extend_values *xvp,
+                                  struct tcp_fastopen_cookie *foc)
  {
         struct inet_request_sock *ireq = inet_rsk(req);
         unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -747,7 +748,15 @@ static unsigned int tcp_synack_options(struct sock *sk,
                 if (unlikely(!ireq->tstamp_ok))
                         remaining -= TCPOLEN_SACKPERM_ALIGNED;
         }
-
+       if (foc != NULL) {
+               u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+               need = (need + 3) & ~3U;  /* Align to 32 bits */
+               if (remaining >= need) {
+                       opts->options |= OPTION_FAST_OPEN_COOKIE;
+                       opts->fastopen_cookie = foc;
+                       remaining -= need;
+               }
+       }
         /* Similar rationale to tcp_syn_options() applies here, too.
          * If the <SYN> options fit, the same options should fit now!
          */
@@ -2658,7 +2667,8 @@ int tcp_send_synack(struct sock *sk)
   */
  struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
                                 struct request_sock *req,
-                               struct request_values *rvp)
+                               struct request_values *rvp,
+                               struct tcp_fastopen_cookie *foc)
  {
         struct tcp_out_options opts;
         struct tcp_extend_values *xvp = tcp_xv(rvp);
@@ -2718,7 +2728,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
  #endif
         TCP_SKB_CB(skb)->when = tcp_time_stamp;
         tcp_header_size = tcp_synack_options(sk, req, mss,
-                                            skb, &opts, &md5, xvp)
+                                            skb, &opts, &md5, xvp, foc)
                         + sizeof(*th);
  
         skb_push(skb, tcp_header_size);
@@ -2772,7 +2782,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
         }
  
         th->seq = htonl(TCP_SKB_CB(skb)->seq);
-       th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
+       /* XXX data is queued and acked as is. No buffer/window check */
+       th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
  
         /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
         th->window = htons(min(req->rcv_wnd, 65535U));
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c

index b774a03bd1dcc1ccafa245a892ac0b312511a900..fc04711e80c89dd0dc92ff1027efaa7324b218c3 100644 (file)
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -304,6 +304,35 @@ static void tcp_probe_timer(struct sock *sk)
         }
  }
  
+/*
+ *     SYNACK retransmit timer for a Fast Open socket: calls tcp_write_err()
+ *     once retries run out, else resends. sk is the child, not the listener.
+ */
+static void tcp_fastopen_synack_timer(struct sock *sk)
+{
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       int max_retries = icsk->icsk_syn_retries ? :
+           sysctl_tcp_synack_retries + 1; /* sysctl default only: +1 retry for fastopen */
+       struct request_sock *req;
+
+       req = tcp_sk(sk)->fastopen_rsk;
+       req->rsk_ops->syn_ack_timeout(sk, req);
+
+       if (req->retrans >= max_retries) {
+               tcp_write_err(sk);
+               return;
+       }
+       /* XXX (TFO) - Unlike regular SYN-ACK retransmit, the error returned
+        * by rtx_syn_ack() is ignored: if the child socket has already been
+        * accepted it's not good to give up too easily. The retry is counted
+        * and the timer re-armed at TCP_TIMEOUT_INIT << retrans regardless.
+        */
+       req->rsk_ops->rtx_syn_ack(sk, req, NULL);
+       req->retrans++;
+       inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+                         TCP_TIMEOUT_INIT << req->retrans, TCP_RTO_MAX);
+}
+
  /*
   *     The TCP retransmit timer.
   */
@@ -317,7 +346,15 @@ void tcp_retransmit_timer(struct sock *sk)
                 tcp_resume_early_retransmit(sk);
                 return;
         }
-
+       if (tp->fastopen_rsk) {
+               BUG_ON(sk->sk_state != TCP_SYN_RECV &&
+                   sk->sk_state != TCP_FIN_WAIT1);
+               tcp_fastopen_synack_timer(sk);
+               /* Before we receive ACK to our SYN-ACK don't retransmit
+                * anything else (e.g., data or FIN segments).
+                */
+               return;
+       }
         if (!tp->packets_out)
                 goto out;
  
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c

index bb46061c813a45c1ef859f5c60c2c5e45773ed29..182ab9a85d6cb5c0ad88e89bab6f5c0d22c380e8 100644 (file)
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -190,6 +190,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
         ireq = inet_rsk(req);
         ireq6 = inet6_rsk(req);
         treq = tcp_rsk(req);
+       treq->listener = NULL;
  
         if (security_inet_conn_request(sk, skb, req))
                 goto out_free;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c

index f99b81d53cca78f7a34b28f98b1886c93800bf89..09078b9bc6f6ff1a7622883a5a66751ba3d25f42 100644 (file)
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -475,7 +475,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
         if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL)
                 goto done;
  
-       skb = tcp_make_synack(sk, dst, req, rvp);
+       skb = tcp_make_synack(sk, dst, req, rvp, NULL);
  
         if (skb) {
                 __tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr);
@@ -987,7 +987,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
                                    &ipv6_hdr(skb)->saddr,
                                    &ipv6_hdr(skb)->daddr, inet6_iif(skb));
         if (req)
-               return tcp_check_req(sk, skb, req, prev);
+               return tcp_check_req(sk, skb, req, prev, false);
  
         nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo,
                         &ipv6_hdr(skb)->saddr, th->source,
@@ -1179,6 +1179,7 @@ have_isn:
             want_cookie)
                 goto drop_and_free;
  
+       tcp_rsk(req)->listener = NULL;
         inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
         return 0;
author	Jerry Chu <hkchu@google.com>
	Fri, 31 Aug 2012 12:29:12 +0000 (12:29 +0000)
committer	David S. Miller <davem@davemloft.net>
	Sat, 1 Sep 2012 00:02:19 +0000 (20:02 -0400)
include/net/request_sock.h		patch \| blob \| history
include/net/tcp.h		patch \| blob \| history
net/core/request_sock.c		patch \| blob \| history
net/ipv4/af_inet.c		patch \| blob \| history
net/ipv4/inet_connection_sock.c		patch \| blob \| history
net/ipv4/syncookies.c		patch \| blob \| history
net/ipv4/tcp.c		patch \| blob \| history
net/ipv4/tcp_ipv4.c		patch \| blob \| history
net/ipv4/tcp_minisocks.c		patch \| blob \| history
net/ipv4/tcp_output.c		patch \| blob \| history
net/ipv4/tcp_timer.c		patch \| blob \| history
net/ipv6/syncookies.c		patch \| blob \| history
net/ipv6/tcp_ipv6.c		patch \| blob \| history