tcp: Move dynamic metrics handling into separate file.
[firefly-linux-kernel-4.4.55.git] / net / ipv4 / tcp_metrics.c
1 #include <linux/cache.h>
2 #include <linux/tcp.h>
3
4 #include <net/inet_connection_sock.h>
5 #include <net/sock.h>
6 #include <net/dst.h>
7 #include <net/tcp.h>
8
/* When non-zero, tcp_update_metrics() returns without saving anything
 * into the route's cached metrics.
 */
9 int sysctl_tcp_nometrics_save __read_mostly;
10
11 /* Save metrics learned by this TCP session.  This function is called
12  * only, when TCP finishes successfully i.e. when it enters TIME-WAIT
13  * or goes from LAST-ACK to CLOSE.
14  */
15 void tcp_update_metrics(struct sock *sk)
16 {
17         struct tcp_sock *tp = tcp_sk(sk);
18         struct dst_entry *dst = __sk_dst_get(sk);
19
20         if (sysctl_tcp_nometrics_save)
21                 return;
22
23         if (dst && (dst->flags & DST_HOST)) {
24                 const struct inet_connection_sock *icsk = inet_csk(sk);
25                 int m;
26                 unsigned long rtt;
27
28                 dst_confirm(dst);
29
30                 if (icsk->icsk_backoff || !tp->srtt) {
31                         /* This session failed to estimate rtt. Why?
32                          * Probably, no packets returned in time.
33                          * Reset our results.
34                          */
35                         if (!(dst_metric_locked(dst, RTAX_RTT)))
36                                 dst_metric_set(dst, RTAX_RTT, 0);
37                         return;
38                 }
39
40                 rtt = dst_metric_rtt(dst, RTAX_RTT);
41                 m = rtt - tp->srtt;
42
43                 /* If newly calculated rtt larger than stored one,
44                  * store new one. Otherwise, use EWMA. Remember,
45                  * rtt overestimation is always better than underestimation.
46                  */
47                 if (!(dst_metric_locked(dst, RTAX_RTT))) {
48                         if (m <= 0)
49                                 set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
50                         else
51                                 set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
52                 }
53
54                 if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
55                         unsigned long var;
56                         if (m < 0)
57                                 m = -m;
58
59                         /* Scale deviation to rttvar fixed point */
60                         m >>= 1;
61                         if (m < tp->mdev)
62                                 m = tp->mdev;
63
64                         var = dst_metric_rtt(dst, RTAX_RTTVAR);
65                         if (m >= var)
66                                 var = m;
67                         else
68                                 var -= (var - m) >> 2;
69
70                         set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
71                 }
72
73                 if (tcp_in_initial_slowstart(tp)) {
74                         /* Slow start still did not finish. */
75                         if (dst_metric(dst, RTAX_SSTHRESH) &&
76                             !dst_metric_locked(dst, RTAX_SSTHRESH) &&
77                             (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
78                                 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
79                         if (!dst_metric_locked(dst, RTAX_CWND) &&
80                             tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
81                                 dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
82                 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
83                            icsk->icsk_ca_state == TCP_CA_Open) {
84                         /* Cong. avoidance phase, cwnd is reliable. */
85                         if (!dst_metric_locked(dst, RTAX_SSTHRESH))
86                                 dst_metric_set(dst, RTAX_SSTHRESH,
87                                                max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
88                         if (!dst_metric_locked(dst, RTAX_CWND))
89                                 dst_metric_set(dst, RTAX_CWND,
90                                                (dst_metric(dst, RTAX_CWND) +
91                                                 tp->snd_cwnd) >> 1);
92                 } else {
93                         /* Else slow start did not finish, cwnd is non-sense,
94                            ssthresh may be also invalid.
95                          */
96                         if (!dst_metric_locked(dst, RTAX_CWND))
97                                 dst_metric_set(dst, RTAX_CWND,
98                                                (dst_metric(dst, RTAX_CWND) +
99                                                 tp->snd_ssthresh) >> 1);
100                         if (dst_metric(dst, RTAX_SSTHRESH) &&
101                             !dst_metric_locked(dst, RTAX_SSTHRESH) &&
102                             tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
103                                 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
104                 }
105
106                 if (!dst_metric_locked(dst, RTAX_REORDERING)) {
107                         if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
108                             tp->reordering != sysctl_tcp_reordering)
109                                 dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
110                 }
111         }
112 }
113
114 /* Initialize metrics on socket. */
115
116 void tcp_init_metrics(struct sock *sk)
117 {
118         struct tcp_sock *tp = tcp_sk(sk);
119         struct dst_entry *dst = __sk_dst_get(sk);
120
121         if (dst == NULL)
122                 goto reset;
123
124         dst_confirm(dst);
125
126         if (dst_metric_locked(dst, RTAX_CWND))
127                 tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
128         if (dst_metric(dst, RTAX_SSTHRESH)) {
129                 tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
130                 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
131                         tp->snd_ssthresh = tp->snd_cwnd_clamp;
132         } else {
133                 /* ssthresh may have been reduced unnecessarily during.
134                  * 3WHS. Restore it back to its initial default.
135                  */
136                 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
137         }
138         if (dst_metric(dst, RTAX_REORDERING) &&
139             tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
140                 tcp_disable_fack(tp);
141                 tcp_disable_early_retrans(tp);
142                 tp->reordering = dst_metric(dst, RTAX_REORDERING);
143         }
144
145         if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
146                 goto reset;
147
148         /* Initial rtt is determined from SYN,SYN-ACK.
149          * The segment is small and rtt may appear much
150          * less than real one. Use per-dst memory
151          * to make it more realistic.
152          *
153          * A bit of theory. RTT is time passed after "normal" sized packet
154          * is sent until it is ACKed. In normal circumstances sending small
155          * packets force peer to delay ACKs and calculation is correct too.
156          * The algorithm is adaptive and, provided we follow specs, it
157          * NEVER underestimate RTT. BUT! If peer tries to make some clever
158          * tricks sort of "quick acks" for time long enough to decrease RTT
159          * to low value, and then abruptly stops to do it and starts to delay
160          * ACKs, wait for troubles.
161          */
162         if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
163                 tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
164                 tp->rtt_seq = tp->snd_nxt;
165         }
166         if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
167                 tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
168                 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
169         }
170         tcp_set_rto(sk);
171 reset:
172         if (tp->srtt == 0) {
173                 /* RFC6298: 5.7 We've failed to get a valid RTT sample from
174                  * 3WHS. This is most likely due to retransmission,
175                  * including spurious one. Reset the RTO back to 3secs
176                  * from the more aggressive 1sec to avoid more spurious
177                  * retransmission.
178                  */
179                 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
180                 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
181         }
182         /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
183          * retransmitted. In light of RFC6298 more aggressive 1sec
184          * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
185          * retransmission has occurred.
186          */
187         if (tp->total_retrans > 1)
188                 tp->snd_cwnd = 1;
189         else
190                 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
191         tp->snd_cwnd_stamp = tcp_time_stamp;
192 }