tcp: Revert 'process defer accept as established' changes.
authorDavid S. Miller <davem@davemloft.net>
Thu, 12 Jun 2008 23:31:35 +0000 (16:31 -0700)
committerDavid S. Miller <davem@davemloft.net>
Thu, 12 Jun 2008 23:34:35 +0000 (16:34 -0700)
This reverts two changesets, ec3c0982a2dd1e671bad8e9d26c28dcba0039d87
("[TCP]: TCP_DEFER_ACCEPT updates - process as established") and
the follow-on bug fix 9ae27e0adbf471c7a6b80102e38e1d5a346b3b38
("tcp: Fix slab corruption with ipv6 and tcp6fuzz").

This change causes several problems, first reported by Ingo Molnar
as a distcc-over-loopback regression where connections were getting
stuck.

Ilpo Järvinen first spotted the locking problems.  The new function
added by this code, tcp_defer_accept_check(), only has the
child socket locked, yet it is modifying state of the parent
listening socket.

Fixing that is non-trivial at best, because we can't simply just grab
the parent listening socket lock at this point, because it would
create an ABBA deadlock.  The normal ordering is parent listening
socket --> child socket, but this code path would require the
reverse lock ordering.

Next is a problem noticed by Vitaliy Gusev, he noted:

----------------------------------------
>--- a/net/ipv4/tcp_timer.c
>+++ b/net/ipv4/tcp_timer.c
>@@ -481,6 +481,11 @@ static void tcp_keepalive_timer (unsigned long data)
>  goto death;
>  }
>
>+ if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) {
>+ tcp_send_active_reset(sk, GFP_ATOMIC);
>+ goto death;

Here socket sk is not attached to listening socket's request queue. tcp_done()
will not call inet_csk_destroy_sock() (and tcp_v4_destroy_sock() which should
release this sk) as socket is not DEAD. Therefore socket sk will be lost for
freeing.
----------------------------------------

Finally, Alexey Kuznetsov argues that there might not even be any
real value or advantage to these new semantics even if we fix all
of the bugs:

----------------------------------------
Hiding from accept() sockets with only out-of-order data only
is the only thing which is impossible with old approach. Is this really
so valuable? My opinion: no, this is nothing but a new loophole
to consume memory without control.
----------------------------------------

So revert this thing for now.

Signed-off-by: David S. Miller <davem@davemloft.net>
include/linux/tcp.h
include/net/request_sock.h
include/net/tcp.h
net/ipv4/inet_connection_sock.c
net/ipv4/tcp.c
net/ipv4/tcp_input.c
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_minisocks.c
net/ipv4/tcp_timer.c

index 18e62e3d406fe86a83e03a8a34257b54fd069ba7..b31b6b74aa28bf4e4dd3aa45d976e34c0c5f968d 100644 (file)
@@ -239,11 +239,6 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
        return (struct tcp_request_sock *)req;
 }
 
-struct tcp_deferred_accept_info {
-       struct sock *listen_sk;
-       struct request_sock *request;
-};
-
 struct tcp_sock {
        /* inet_connection_sock has to be the first member of tcp_sock */
        struct inet_connection_sock     inet_conn;
@@ -379,8 +374,6 @@ struct tcp_sock {
        unsigned int            keepalive_intvl;  /* time interval between keep alive probes */
        int                     linger2;
 
-       struct tcp_deferred_accept_info defer_tcp_accept;
-
        unsigned long last_synq_overflow; 
 
        u32     tso_deferred;
index b220b5f624de2cb97c60ba5c9453633f8d24be8c..0c96e7bed5db39e7eff4718353697ce52e06df96 100644 (file)
@@ -115,8 +115,8 @@ struct request_sock_queue {
        struct request_sock     *rskq_accept_head;
        struct request_sock     *rskq_accept_tail;
        rwlock_t                syn_wait_lock;
-       u16                     rskq_defer_accept;
-       /* 2 bytes hole, try to pack */
+       u                     rskq_defer_accept;
+       /* 3 bytes hole, try to pack */
        struct listen_sock      *listen_opt;
 };
 
index d448310c82c1ac8b8c718f2a2140c58737511a74..cf54034019d9eafee928bf10b4180cea2c091ed0 100644 (file)
@@ -139,7 +139,6 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define MAX_TCP_KEEPINTVL      32767
 #define MAX_TCP_KEEPCNT                127
 #define MAX_TCP_SYNCNT         127
-#define MAX_TCP_ACCEPT_DEFERRED 65535
 
 #define TCP_SYNQ_INTERVAL      (HZ/5)  /* Period of SYNACK timer */
 
index 828ea211ff211b204185b52b20d1a719912d5699..045e799d3e1db4e5cc58f72c8a35fc3f05d2eb87 100644 (file)
@@ -419,7 +419,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
        struct inet_connection_sock *icsk = inet_csk(parent);
        struct request_sock_queue *queue = &icsk->icsk_accept_queue;
        struct listen_sock *lopt = queue->listen_opt;
-       int thresh = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+       int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+       int thresh = max_retries;
        unsigned long now = jiffies;
        struct request_sock **reqp, *req;
        int i, budget;
@@ -455,6 +456,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
                }
        }
 
+       if (queue->rskq_defer_accept)
+               max_retries = queue->rskq_defer_accept;
+
        budget = 2 * (lopt->nr_table_entries / (timeout / interval));
        i = lopt->clock_hand;
 
@@ -462,8 +466,9 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
                reqp=&lopt->syn_table[i];
                while ((req = *reqp) != NULL) {
                        if (time_after_eq(now, req->expires)) {
-                               if (req->retrans < thresh &&
-                                   !req->rsk_ops->rtx_syn_ack(parent, req)) {
+                               if ((req->retrans < (inet_rsk(req)->acked ? max_retries : thresh)) &&
+                                   (inet_rsk(req)->acked ||
+                                    !req->rsk_ops->rtx_syn_ack(parent, req))) {
                                        unsigned long timeo;
 
                                        if (req->retrans++ == 0)
index ab66683b804343fc852b93220d41e83d80275512..fc54a48fde1e6a6b07a157b1cc551427d226099b 100644 (file)
@@ -2112,12 +2112,15 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                break;
 
        case TCP_DEFER_ACCEPT:
-               if (val < 0) {
-                       err = -EINVAL;
-               } else {
-                       if (val > MAX_TCP_ACCEPT_DEFERRED)
-                               val = MAX_TCP_ACCEPT_DEFERRED;
-                       icsk->icsk_accept_queue.rskq_defer_accept = val;
+               icsk->icsk_accept_queue.rskq_defer_accept = 0;
+               if (val > 0) {
+                       /* Translate value in seconds to number of
+                        * retransmits */
+                       while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
+                              val > ((TCP_TIMEOUT_INIT / HZ) <<
+                                      icsk->icsk_accept_queue.rskq_defer_accept))
+                               icsk->icsk_accept_queue.rskq_defer_accept++;
+                       icsk->icsk_accept_queue.rskq_defer_accept++;
                }
                break;
 
@@ -2299,7 +2302,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
                        val = (val ? : sysctl_tcp_fin_timeout) / HZ;
                break;
        case TCP_DEFER_ACCEPT:
-               val = icsk->icsk_accept_queue.rskq_defer_accept;
+               val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
+                       ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
                break;
        case TCP_WINDOW_CLAMP:
                val = tp->window_clamp;
index eba873e9b560e0725b3fc6599c60eadc8c78eac4..cad73b7dfef07798ddc0b8debba24c9aad94c53b 100644 (file)
@@ -4541,49 +4541,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
        }
 }
 
-static int tcp_defer_accept_check(struct sock *sk)
-{
-       struct tcp_sock *tp = tcp_sk(sk);
-
-       if (tp->defer_tcp_accept.request) {
-               int queued_data =  tp->rcv_nxt - tp->copied_seq;
-               int hasfin =  !skb_queue_empty(&sk->sk_receive_queue) ?
-                       tcp_hdr((struct sk_buff *)
-                               sk->sk_receive_queue.prev)->fin : 0;
-
-               if (queued_data && hasfin)
-                       queued_data--;
-
-               if (queued_data &&
-                   tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) {
-                       if (sock_flag(sk, SOCK_KEEPOPEN)) {
-                               inet_csk_reset_keepalive_timer(sk,
-                                                              keepalive_time_when(tp));
-                       } else {
-                               inet_csk_delete_keepalive_timer(sk);
-                       }
-
-                       inet_csk_reqsk_queue_add(
-                               tp->defer_tcp_accept.listen_sk,
-                               tp->defer_tcp_accept.request,
-                               sk);
-
-                       tp->defer_tcp_accept.listen_sk->sk_data_ready(
-                               tp->defer_tcp_accept.listen_sk, 0);
-
-                       sock_put(tp->defer_tcp_accept.listen_sk);
-                       sock_put(sk);
-                       tp->defer_tcp_accept.listen_sk = NULL;
-                       tp->defer_tcp_accept.request = NULL;
-               } else if (hasfin ||
-                          tp->defer_tcp_accept.listen_sk->sk_state != TCP_LISTEN) {
-                       tcp_reset(sk);
-                       return -1;
-               }
-       }
-       return 0;
-}
-
 static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 {
        struct tcp_sock *tp = tcp_sk(sk);
@@ -4944,8 +4901,6 @@ step5:
 
        tcp_data_snd_check(sk);
        tcp_ack_snd_check(sk);
-
-       tcp_defer_accept_check(sk);
        return 0;
 
 csum_error:
index 4f8485c67d1a6e4221fbc651593f2c047da4414d..97a230026e13af4243e458a5a915c90561b8ac28 100644 (file)
@@ -1918,14 +1918,6 @@ int tcp_v4_destroy_sock(struct sock *sk)
                sk->sk_sndmsg_page = NULL;
        }
 
-       if (tp->defer_tcp_accept.request) {
-               reqsk_free(tp->defer_tcp_accept.request);
-               sock_put(tp->defer_tcp_accept.listen_sk);
-               sock_put(sk);
-               tp->defer_tcp_accept.listen_sk = NULL;
-               tp->defer_tcp_accept.request = NULL;
-       }
-
        atomic_dec(&tcp_sockets_allocated);
 
        return 0;
index 019c8c16e5ccba1ae93d50e01d8439d416eef26a..8245247a6ceb732307569a9d4ab393e0de535c90 100644 (file)
@@ -571,8 +571,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
           does sequence test, SYN is truncated, and thus we consider
           it a bare ACK.
 
-          Both ends (listening sockets) accept the new incoming
-          connection and try to talk to each other. 8-)
+          If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
+          bare ACK.  Otherwise, we create an established connection.  Both
+          ends (listening sockets) accept the new incoming connection and try
+          to talk to each other. 8-)
 
           Note: This case is both harmless, and rare.  Possibility is about the
           same as us discovering intelligent life on another plant tomorrow.
@@ -640,6 +642,13 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
                if (!(flg & TCP_FLAG_ACK))
                        return NULL;
 
+               /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
+               if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+                   TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+                       inet_rsk(req)->acked = 1;
+                       return NULL;
+               }
+
                /* OK, ACK is valid, create big socket and
                 * feed this segment to it. It will repeat all
                 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
@@ -678,24 +687,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
                inet_csk_reqsk_queue_unlink(sk, req, prev);
                inet_csk_reqsk_queue_removed(sk, req);
 
-               if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
-                   TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
-
-                       /* the accept queue handling is done is est recv slow
-                        * path so lets make sure to start there
-                        */
-                       tcp_sk(child)->pred_flags = 0;
-                       sock_hold(sk);
-                       sock_hold(child);
-                       tcp_sk(child)->defer_tcp_accept.listen_sk = sk;
-                       tcp_sk(child)->defer_tcp_accept.request = req;
-
-                       inet_csk_reset_keepalive_timer(child,
-                                                      inet_csk(sk)->icsk_accept_queue.rskq_defer_accept * HZ);
-               } else {
-                       inet_csk_reqsk_queue_add(sk, req, child);
-               }
-
+               inet_csk_reqsk_queue_add(sk, req, child);
                return child;
 
        listen_overflow:
index 4de68cf5f2aad61322bbb58dc6d6dfb328055acf..63ed9d6830e7b80f77f3619168756742f6ef3bc9 100644 (file)
@@ -489,11 +489,6 @@ static void tcp_keepalive_timer (unsigned long data)
                goto death;
        }
 
-       if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) {
-               tcp_send_active_reset(sk, GFP_ATOMIC);
-               goto death;
-       }
-
        if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
                goto out;