tcp: TSO packets automatic sizing

author Eric Dumazet <edumazet@google.com>

Tue, 27 Aug 2013 12:46:32 +0000 (05:46 -0700)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Mon, 4 Nov 2013 12:30:59 +0000 (04:30 -0800)
author Eric Dumazet <edumazet@google.com>
Tue, 27 Aug 2013 12:46:32 +0000 (05:46 -0700)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 4 Nov 2013 12:30:59 +0000 (04:30 -0800)
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt

index 3458d6343e01de0e66f0d3f2e09efc61bab3b886..3994f0bbeeb60ff1352d878cd2ce6c2b02f05f5e 100644 (file)
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -478,6 +478,15 @@ tcp_syn_retries - INTEGER
  tcp_timestamps - BOOLEAN
         Enable timestamps as defined in RFC1323.
  
+tcp_min_tso_segs - INTEGER
+       Minimal number of segments per TSO frame.
+       Since linux-3.12, TCP does an automatic sizing of TSO frames,
+       depending on flow rate, instead of filling 64Kbytes packets.
+       For specific usages, it's possible to force TCP to build big
+       TSO frames. Note that TCP stack might split too big TSO packets
+       if available window is too small.
+       Default: 2
+
  tcp_tso_win_divisor - INTEGER
         This allows control over what percentage of the congestion window
         can be consumed by a single TSO frame.
diff --git a/include/net/sock.h b/include/net/sock.h

index 66772cf8c3c528c86104684ca3cf1ac8c5ccfd85..cec4c723db9a8b632e6f837235137edc42aa50e6 100644 (file)
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -230,6 +230,7 @@ struct cg_proto;
    *    @sk_wmem_queued: persistent queue size
    *    @sk_forward_alloc: space allocated forward
    *    @sk_allocation: allocation mode
+  *    @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
    *    @sk_sndbuf: size of send buffer in bytes
    *    @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
    *               %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -355,6 +356,7 @@ struct sock {
         kmemcheck_bitfield_end(flags);
         int                     sk_wmem_queued;
         gfp_t                   sk_allocation;
+       u32                     sk_pacing_rate; /* bytes per second */
         netdev_features_t       sk_route_caps;
         netdev_features_t       sk_route_nocaps;
         int                     sk_gso_type;
diff --git a/include/net/tcp.h b/include/net/tcp.h

index 5bba80fbd1d9d92738d115cbc870bf0612cb0305..3fc77e90624a59a29172832417efc393b9816b0d 100644 (file)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -287,6 +287,7 @@ extern int sysctl_tcp_thin_dupack;
  extern int sysctl_tcp_early_retrans;
  extern int sysctl_tcp_limit_output_bytes;
  extern int sysctl_tcp_challenge_ack_limit;
+extern int sysctl_tcp_min_tso_segs;
  
  extern atomic_long_t tcp_memory_allocated;
  extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/core/sock.c b/net/core/sock.c

index d6d024cfaaafd0575723d41004dd8d18816b55b0..6565431b0e6dacef754d5066e28d0502900dc840 100644 (file)
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2271,6 +2271,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
  
         sk->sk_stamp = ktime_set(-1L, 0);
  
+       sk->sk_pacing_rate = ~0U;
         /*
          * Before updating sk_refcnt, we must commit prior changes to memory
          * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c

index 3f25e75ae692e909592bdfaf0ce0a60cd0ee5697..90b26beb84d41c614767d77188d0f3426e9a9658 100644 (file)
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
  static int zero;
  static int one = 1;
  static int four = 4;
+static int gso_max_segs = GSO_MAX_SEGS;
  static int tcp_retr1_max = 255;
  static int ip_local_port_range_min[] = { 1, 1 };
  static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -752,6 +753,15 @@ static struct ctl_table ipv4_table[] = {
                 .extra1         = &zero,
                 .extra2         = &four,
         },
+       {
+               .procname       = "tcp_min_tso_segs",
+               .data           = &sysctl_tcp_min_tso_segs,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &gso_max_segs,
+       },
         {
                 .procname       = "udp_mem",
                 .data           = &sysctl_udp_mem,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c

index 2b1b57f213b2c47adb4b132ceb2c926438e25e5c..c888abf5a728d2a6252be22aebe3abb52f33b5dd 100644 (file)
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -282,6 +282,8 @@
  
  int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
  
+int sysctl_tcp_min_tso_segs __read_mostly = 2;
+
  struct percpu_counter tcp_orphan_count;
  EXPORT_SYMBOL_GPL(tcp_orphan_count);
  
@@ -786,12 +788,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
         xmit_size_goal = mss_now;
  
         if (large_allowed && sk_can_gso(sk)) {
-               xmit_size_goal = ((sk->sk_gso_max_size - 1) -
-                                 inet_csk(sk)->icsk_af_ops->net_header_len -
-                                 inet_csk(sk)->icsk_ext_hdr_len -
-                                 tp->tcp_header_len);
+               u32 gso_size, hlen;
+
+               /* Maybe we should/could use sk->sk_prot->max_header here ? */
+               hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+                      inet_csk(sk)->icsk_ext_hdr_len +
+                      tp->tcp_header_len;
+
+               /* Goal is to send at least one packet per ms,
+                * not one big TSO packet every 100 ms.
+                * This preserves ACK clocking and is consistent
+                * with tcp_tso_should_defer() heuristic.
+                */
+               gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
+               gso_size = max_t(u32, gso_size,
+                                sysctl_tcp_min_tso_segs * mss_now);
+
+               xmit_size_goal = min_t(u32, gso_size,
+                                      sk->sk_gso_max_size - 1 - hlen);
  
-               /* TSQ : try to have two TSO segments in flight */
+               /* TSQ : try to have at least two segments in flight
+                * (one in NIC TX ring, another in Qdisc)
+                */
                 xmit_size_goal = min_t(u32, xmit_size_goal,
                                        sysctl_tcp_limit_output_bytes >> 1);
  
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c

index 4b75aad14b04a0c2dbf899d9d7993cf125002240..70883b87bc5d91caf166a82e34d8410fa7a38b2a 100644 (file)
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -699,6 +699,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
         }
  }
  
+/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
+ * Note: TCP stack does not yet implement pacing.
+ * FQ packet scheduler can be used to implement cheap but effective
+ * TCP pacing, to smooth the burst on large writes when packets
+ * in flight is significantly lower than cwnd (or rwin)
+ */
+static void tcp_update_pacing_rate(struct sock *sk)
+{
+       const struct tcp_sock *tp = tcp_sk(sk);
+       u64 rate;
+
+       /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
+       rate = (u64)tp->mss_cache * 2 * (HZ << 3);
+
+       rate *= max(tp->snd_cwnd, tp->packets_out);
+
+       /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3),
+        * be conservative and assume srtt = 1 (125 us instead of 1.25 ms)
+        * We probably need usec resolution in the future.
+        * Note: This also takes care of possible srtt=0 case,
+        * when tcp_rtt_estimator() was not yet called.
+        */
+       if (tp->srtt > 8 + 2)
+               do_div(rate, tp->srtt);
+
+       sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+}
+
  /* Calculate rto without backoff.  This is the second half of Van Jacobson's
   * routine referred to above.
   */
@@ -3330,7 +3358,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
         u32 ack_seq = TCP_SKB_CB(skb)->seq;
         u32 ack = TCP_SKB_CB(skb)->ack_seq;
         bool is_dupack = false;
-       u32 prior_in_flight;
+       u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
         u32 prior_fackets;
         int prior_packets = tp->packets_out;
         int prior_sacked = tp->sacked_out;
@@ -3438,6 +3466,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
  
         if (icsk->icsk_pending == ICSK_TIME_RETRANS)
                 tcp_schedule_loss_probe(sk);
+       if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
+               tcp_update_pacing_rate(sk);
         return 1;
  
  no_queue:
@@ -5736,6 +5766,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                                 } else
                                         tcp_init_metrics(sk);
  
+                               tcp_update_pacing_rate(sk);
+
                                 /* Prevent spurious tcp_cwnd_restart() on
                                  * first data packet.
                                  */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c

index 0145ce7e609881704e78718c1383870856fce326..400b811f5c06baac04bdf76092f9027a9563a24f 100644 (file)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1623,7 +1623,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
  
         /* If a full-sized TSO skb can be sent, do it. */
         if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
-                          sk->sk_gso_max_segs * tp->mss_cache))
+                          tp->xmit_size_goal_segs * tp->mss_cache))
                 goto send_now;
  
         /* Middle in queue won't get any more data, full sendable already? */
author	Eric Dumazet <edumazet@google.com>
	Tue, 27 Aug 2013 12:46:32 +0000 (05:46 -0700)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Mon, 4 Nov 2013 12:30:59 +0000 (04:30 -0800)
Documentation/networking/ip-sysctl.txt		patch | blob | history
include/net/sock.h		patch | blob | history
include/net/tcp.h		patch | blob | history
net/core/sock.c		patch | blob | history
net/ipv4/sysctl_net_ipv4.c		patch | blob | history
net/ipv4/tcp.c		patch | blob | history
net/ipv4/tcp_input.c		patch | blob | history
net/ipv4/tcp_output.c		patch | blob | history