tcp: TLP loss detection.
authorNandita Dukkipati <nanditad@google.com>
Mon, 11 Mar 2013 10:00:44 +0000 (10:00 +0000)
committerDavid S. Miller <davem@davemloft.net>
Tue, 12 Mar 2013 12:30:34 +0000 (08:30 -0400)
This is the second patch in the TLP series; it augments the basic TLP
algorithm with a loss detection scheme.

This patch implements a mechanism for loss detection when a Tail
loss probe retransmission plugs a hole thereby masking packet loss
from the sender. The loss detection algorithm relies on counting
TLP dupacks as outlined in Sec. 3 of:
http://tools.ietf.org/html/draft-dukkipati-tcpm-tcp-loss-probe-01

The basic idea is: the sender keeps track of a TLP "episode" upon
retransmission of a TLP packet. An episode ends when the sender receives
an ACK above the SND.NXT that was recorded at the start of the episode
(tracked in tlp_high_seq). We want to make sure that before the episode ends the sender
receives a "TLP dupack", indicating that the TLP retransmission was
unnecessary, so there was no loss/hole that needed plugging. If the
sender gets no TLP dupack before the end of the episode, then it reduces
ssthresh and the congestion window, because the TLP packet arriving at
the receiver probably plugged a hole.

Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/linux/tcp.h
include/uapi/linux/snmp.h
net/ipv4/proc.c
net/ipv4/tcp_input.c
net/ipv4/tcp_minisocks.c
net/ipv4/tcp_output.c
net/ipv4/tcp_timer.c

index 01860d74555cfe747e3e00baf9e70f4e2bd1739f..763c108ee03de0654472a6333a02d23ed264e973 100644 (file)
@@ -204,6 +204,7 @@ struct tcp_sock {
                syn_data:1,     /* SYN includes data */
                syn_fastopen:1, /* SYN includes Fast Open option */
                syn_data_acked:1;/* data in SYN is acked by SYN-ACK */
+       u32     tlp_high_seq;   /* snd_nxt at the time of TLP retransmit. */
 
 /* RTT measurement */
        u32     srtt;           /* smoothed round trip time << 3        */
index 290bed6b085f149c2dea0af92f8f991f7cd32b1a..e00013a1debcb521751de4b47141eb0fe64ca48d 100644 (file)
@@ -203,6 +203,7 @@ enum
        LINUX_MIB_TCPSLOWSTARTRETRANS,          /* TCPSlowStartRetrans */
        LINUX_MIB_TCPTIMEOUTS,                  /* TCPTimeouts */
        LINUX_MIB_TCPLOSSPROBES,                /* TCPLossProbes */
+       LINUX_MIB_TCPLOSSPROBERECOVERY,         /* TCPLossProbeRecovery */
        LINUX_MIB_TCPRENORECOVERYFAIL,          /* TCPRenoRecoveryFail */
        LINUX_MIB_TCPSACKRECOVERYFAIL,          /* TCPSackRecoveryFail */
        LINUX_MIB_TCPSCHEDULERFAILED,           /* TCPSchedulerFailed */
index 4c35911d935fc6c3aee485a4033f3895acd3580e..b6f2ea1748988ee4d0a418144d8cb34f055b3378 100644 (file)
@@ -225,6 +225,7 @@ static const struct snmp_mib snmp4_net_list[] = {
        SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
        SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
        SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
+       SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
        SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
        SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
        SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
index b794f89ac1f2ec5824fcee4dc5d0f3fb474e354e..836d74dd01878ceb7a25bbaeda6fe171fadc8106 100644 (file)
@@ -2682,6 +2682,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
        struct tcp_sock *tp = tcp_sk(sk);
 
        tp->high_seq = tp->snd_nxt;
+       tp->tlp_high_seq = 0;
        tp->snd_cwnd_cnt = 0;
        tp->prior_cwnd = tp->snd_cwnd;
        tp->prr_delivered = 0;
@@ -3569,6 +3570,38 @@ static void tcp_send_challenge_ack(struct sock *sk)
        }
 }
 
+/* This routine deals with acks during a TLP episode.
+ * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
+ */
+static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       bool is_tlp_dupack = (ack == tp->tlp_high_seq) &&
+                            !(flag & (FLAG_SND_UNA_ADVANCED |
+                                      FLAG_NOT_DUP | FLAG_DATA_SACKED));
+
+       /* Mark the end of TLP episode on receiving TLP dupack or when
+        * ack is after tlp_high_seq.
+        */
+       if (is_tlp_dupack) {
+               tp->tlp_high_seq = 0;
+               return;
+       }
+
+       if (after(ack, tp->tlp_high_seq)) {
+               tp->tlp_high_seq = 0;
+               /* Don't reduce cwnd if DSACK arrives for TLP retrans. */
+               if (!(flag & FLAG_DSACKING_ACK)) {
+                       tcp_init_cwnd_reduction(sk, true);
+                       tcp_set_ca_state(sk, TCP_CA_CWR);
+                       tcp_end_cwnd_reduction(sk);
+                       tcp_set_ca_state(sk, TCP_CA_Open);
+                       NET_INC_STATS_BH(sock_net(sk),
+                                        LINUX_MIB_TCPLOSSPROBERECOVERY);
+               }
+       }
+}
+
 /* This routine deals with incoming acks, but not outgoing ones. */
 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 {
@@ -3676,6 +3709,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
                        tcp_cong_avoid(sk, ack, prior_in_flight);
        }
 
+       if (tp->tlp_high_seq)
+               tcp_process_tlp_ack(sk, ack, flag);
+
        if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
                struct dst_entry *dst = __sk_dst_get(sk);
                if (dst)
@@ -3697,6 +3733,9 @@ no_queue:
         */
        if (tcp_send_head(sk))
                tcp_ack_probe(sk);
+
+       if (tp->tlp_high_seq)
+               tcp_process_tlp_ack(sk, ack, flag);
        return 1;
 
 invalid_ack:
index b83a49cc38169a654a14b94581f72eae09466c52..4bdb09fca401e2251be3c6d221a5024a1633adb6 100644 (file)
@@ -440,6 +440,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
                newtp->fackets_out = 0;
                newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
                tcp_enable_early_retrans(newtp);
+               newtp->tlp_high_seq = 0;
 
                /* So many TCP implementations out there (incorrectly) count the
                 * initial SYN frame in their delayed-ACK and congestion control
index beb63dbc85f53284ed3e66141ed9675ac3551922..8e7742f0b5d27800f5be445108e3da29178ba2c2 100644 (file)
@@ -2132,6 +2132,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
  */
 void tcp_send_loss_probe(struct sock *sk)
 {
+       struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
        int pcount;
        int mss = tcp_current_mss(sk);
@@ -2142,6 +2143,10 @@ void tcp_send_loss_probe(struct sock *sk)
                goto rearm_timer;
        }
 
+       /* At most one outstanding TLP retransmission. */
+       if (tp->tlp_high_seq)
+               goto rearm_timer;
+
        /* Retransmit last segment. */
        skb = tcp_write_queue_tail(sk);
        if (WARN_ON(!skb))
@@ -2164,6 +2169,10 @@ void tcp_send_loss_probe(struct sock *sk)
        if (skb->len > 0)
                err = __tcp_retransmit_skb(sk, skb);
 
+       /* Record snd_nxt for loss detection. */
+       if (likely(!err))
+               tp->tlp_high_seq = tp->snd_nxt;
+
 rearm_timer:
        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                  inet_csk(sk)->icsk_rto,
index ecd61d54147f1cf1f665b5f8ded32691e40de60f..eeccf795e917903e888007ccf44611a1abd95103 100644 (file)
@@ -356,6 +356,8 @@ void tcp_retransmit_timer(struct sock *sk)
 
        WARN_ON(tcp_write_queue_empty(sk));
 
+       tp->tlp_high_seq = 0;
+
        if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
            !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
                /* Receiver dastardly shrinks window. Our retransmits