net/ipv4/tcp_ipv4.c (firefly-linux-kernel-4.4.55.git)
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/tcp_memcontrol.h>
77 #include <net/busy_poll.h>
78
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91
92 #ifdef CONFIG_TCP_MD5SIG
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
94                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
95 #endif
96
97 struct inet_hashinfo tcp_hashinfo;
98 EXPORT_SYMBOL(tcp_hashinfo);
99
100 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
101 {
102         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
103                                           ip_hdr(skb)->saddr,
104                                           tcp_hdr(skb)->dest,
105                                           tcp_hdr(skb)->source);
106 }
107
108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 {
110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111         struct tcp_sock *tp = tcp_sk(sk);
112
113         /* With PAWS, it is safe from the viewpoint
114            of data integrity. Even without PAWS it is safe provided sequence
115            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
116
117            Actually, the idea is close to VJ's, only the timestamp cache is
118            held not per host but per port pair, and the TW bucket is used as the
119            state holder.
120
121            If the TW bucket has already been destroyed we fall back to VJ's scheme
122            and use the initial timestamp retrieved from the peer table.
123          */
124         if (tcptw->tw_ts_recent_stamp &&
125             (!twp || (sysctl_tcp_tw_reuse &&
126                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
127                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
128                 if (tp->write_seq == 0)
129                         tp->write_seq = 1;
130                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
131                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
132                 sock_hold(sktw);
133                 return 1;
134         }
135
136         return 0;
137 }
138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
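/* Illustration (not part of the original file): the reuse decision above is
 * gated by the net.ipv4.tcp_tw_reuse sysctl declared near the top of this
 * file.  A minimal userspace sketch that would enable it:
 *
 *	int fd = open("/proc/sys/net/ipv4/tcp_tw_reuse", O_WRONLY);
 *	if (fd >= 0) {
 *		(void)write(fd, "1", 1);
 *		close(fd);
 *	}
 *
 * Reuse of a TIME-WAIT port pair is then allowed once the cached timestamp is
 * more than one second old, so PAWS can still reject stray old segments.
 */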
139
140 /* This will initiate an outgoing connection. */
141 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
142 {
143         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
144         struct inet_sock *inet = inet_sk(sk);
145         struct tcp_sock *tp = tcp_sk(sk);
146         __be16 orig_sport, orig_dport;
147         __be32 daddr, nexthop;
148         struct flowi4 *fl4;
149         struct rtable *rt;
150         int err;
151         struct ip_options_rcu *inet_opt;
152
153         if (addr_len < sizeof(struct sockaddr_in))
154                 return -EINVAL;
155
156         if (usin->sin_family != AF_INET)
157                 return -EAFNOSUPPORT;
158
159         nexthop = daddr = usin->sin_addr.s_addr;
160         inet_opt = rcu_dereference_protected(inet->inet_opt,
161                                              sock_owned_by_user(sk));
162         if (inet_opt && inet_opt->opt.srr) {
163                 if (!daddr)
164                         return -EINVAL;
165                 nexthop = inet_opt->opt.faddr;
166         }
167
168         orig_sport = inet->inet_sport;
169         orig_dport = usin->sin_port;
170         fl4 = &inet->cork.fl.u.ip4;
171         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
172                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173                               IPPROTO_TCP,
174                               orig_sport, orig_dport, sk);
175         if (IS_ERR(rt)) {
176                 err = PTR_ERR(rt);
177                 if (err == -ENETUNREACH)
178                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
179                 return err;
180         }
181
182         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
183                 ip_rt_put(rt);
184                 return -ENETUNREACH;
185         }
186
187         if (!inet_opt || !inet_opt->opt.srr)
188                 daddr = fl4->daddr;
189
190         if (!inet->inet_saddr)
191                 inet->inet_saddr = fl4->saddr;
192         sk_rcv_saddr_set(sk, inet->inet_saddr);
193
194         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
195                 /* Reset inherited state */
196                 tp->rx_opt.ts_recent       = 0;
197                 tp->rx_opt.ts_recent_stamp = 0;
198                 if (likely(!tp->repair))
199                         tp->write_seq      = 0;
200         }
201
202         if (tcp_death_row.sysctl_tw_recycle &&
203             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
204                 tcp_fetch_timewait_stamp(sk, &rt->dst);
205
206         inet->inet_dport = usin->sin_port;
207         sk_daddr_set(sk, daddr);
208
209         inet_csk(sk)->icsk_ext_hdr_len = 0;
210         if (inet_opt)
211                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212
213         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214
215         /* Socket identity is still unknown (sport may be zero).
216          * However we set state to SYN-SENT and, without releasing the socket
217          * lock, select a source port, enter ourselves into the hash tables and
218          * complete initialization after this.
219          */
220         tcp_set_state(sk, TCP_SYN_SENT);
221         err = inet_hash_connect(&tcp_death_row, sk);
222         if (err)
223                 goto failure;
224
225         sk_set_txhash(sk);
226
227         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228                                inet->inet_sport, inet->inet_dport, sk);
229         if (IS_ERR(rt)) {
230                 err = PTR_ERR(rt);
231                 rt = NULL;
232                 goto failure;
233         }
234         /* OK, now commit destination to socket.  */
235         sk->sk_gso_type = SKB_GSO_TCPV4;
236         sk_setup_caps(sk, &rt->dst);
237
238         if (!tp->write_seq && likely(!tp->repair))
239                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
240                                                            inet->inet_daddr,
241                                                            inet->inet_sport,
242                                                            usin->sin_port);
243
244         inet->inet_id = tp->write_seq ^ jiffies;
245
246         err = tcp_connect(sk);
247
248         rt = NULL;
249         if (err)
250                 goto failure;
251
252         return 0;
253
254 failure:
255         /*
256          * This unhashes the socket and releases the local port,
257          * if necessary.
258          */
259         tcp_set_state(sk, TCP_CLOSE);
260         ip_rt_put(rt);
261         sk->sk_route_caps = 0;
262         inet->inet_dport = 0;
263         return err;
264 }
265 EXPORT_SYMBOL(tcp_v4_connect);
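/* Illustration (not part of the original file): this is the path a plain
 * userspace connect() on an IPv4 TCP socket ends up in.  A minimal sketch,
 * with error handling and includes omitted and an example address:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * addr_len and sin_family are validated first, the route is resolved via
 * ip_route_connect(), and only then does tcp_connect() emit the SYN.
 */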
266
267 /*
268  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
269  * It can be called through tcp_release_cb() if socket was owned by user
270  * at the time tcp_v4_err() was called to handle ICMP message.
271  */
272 void tcp_v4_mtu_reduced(struct sock *sk)
273 {
274         struct dst_entry *dst;
275         struct inet_sock *inet = inet_sk(sk);
276         u32 mtu = tcp_sk(sk)->mtu_info;
277
278         dst = inet_csk_update_pmtu(sk, mtu);
279         if (!dst)
280                 return;
281
282         /* Something is about to go wrong... Remember the soft error
283          * in case this connection is not able to recover.
284          */
285         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
286                 sk->sk_err_soft = EMSGSIZE;
287
288         mtu = dst_mtu(dst);
289
290         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
291             ip_sk_accept_pmtu(sk) &&
292             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
293                 tcp_sync_mss(sk, mtu);
294
295                 /* Resend the TCP packet because it's
296                  * clear that the old packet has been
297                  * dropped. This is the new "fast" path mtu
298                  * discovery.
299                  */
300                 tcp_simple_retransmit(sk);
301         } /* else let the usual retransmit timer handle it */
302 }
303 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
304
305 static void do_redirect(struct sk_buff *skb, struct sock *sk)
306 {
307         struct dst_entry *dst = __sk_dst_check(sk, 0);
308
309         if (dst)
310                 dst->ops->redirect(dst, sk, skb);
311 }
312
313
314 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
315 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
316 {
317         struct request_sock *req = inet_reqsk(sk);
318         struct net *net = sock_net(sk);
319
320         /* ICMPs are not backlogged, hence we cannot get
321          * an established socket here.
322          */
323         if (seq != tcp_rsk(req)->snt_isn) {
324                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
325         } else if (abort) {
326                 /*
327                  * Still in SYN_RECV, just remove it silently.
328                  * There is no good way to pass the error to the newly
329                  * created socket, and POSIX does not want network
330                  * errors returned from accept().
331                  */
332                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
333                 NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
334         }
335         reqsk_put(req);
336 }
337 EXPORT_SYMBOL(tcp_req_err);
338
339 /*
340  * This routine is called by the ICMP module when it gets some
341  * sort of error condition.  If err < 0 then the socket should
342  * be closed and the error returned to the user.  If err > 0
343  * it's just the icmp type << 8 | icmp code.  After adjustment,
344  * the header points to the first 8 bytes of the tcp header.  We need
345  * to find the appropriate port.
346  *
347  * The locking strategy used here is very "optimistic". When
348  * someone else accesses the socket the ICMP is just dropped
349  * and for some paths there is no check at all.
350  * A more general error queue to queue errors for later handling
351  * is probably better.
352  *
353  */
354
355 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
356 {
357         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
358         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
359         struct inet_connection_sock *icsk;
360         struct tcp_sock *tp;
361         struct inet_sock *inet;
362         const int type = icmp_hdr(icmp_skb)->type;
363         const int code = icmp_hdr(icmp_skb)->code;
364         struct sock *sk;
365         struct sk_buff *skb;
366         struct request_sock *fastopen;
367         __u32 seq, snd_una;
368         __u32 remaining;
369         int err;
370         struct net *net = dev_net(icmp_skb->dev);
371
372         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
373                                        th->dest, iph->saddr, ntohs(th->source),
374                                        inet_iif(icmp_skb));
375         if (!sk) {
376                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
377                 return;
378         }
379         if (sk->sk_state == TCP_TIME_WAIT) {
380                 inet_twsk_put(inet_twsk(sk));
381                 return;
382         }
383         seq = ntohl(th->seq);
384         if (sk->sk_state == TCP_NEW_SYN_RECV)
385                 return tcp_req_err(sk, seq,
386                                   type == ICMP_PARAMETERPROB ||
387                                   type == ICMP_TIME_EXCEEDED ||
388                                   (type == ICMP_DEST_UNREACH &&
389                                    (code == ICMP_NET_UNREACH ||
390                                     code == ICMP_HOST_UNREACH)));
391
392         bh_lock_sock(sk);
393         /* If too many ICMPs get dropped on busy
394          * servers this needs to be solved differently.
395          * We do take care of PMTU discovery (RFC1191) special case :
396          * we can receive locally generated ICMP messages while socket is held.
397          */
398         if (sock_owned_by_user(sk)) {
399                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
400                         NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
401         }
402         if (sk->sk_state == TCP_CLOSE)
403                 goto out;
404
405         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
406                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
407                 goto out;
408         }
409
410         icsk = inet_csk(sk);
411         tp = tcp_sk(sk);
412         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
413         fastopen = tp->fastopen_rsk;
414         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
415         if (sk->sk_state != TCP_LISTEN &&
416             !between(seq, snd_una, tp->snd_nxt)) {
417                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
418                 goto out;
419         }
420
421         switch (type) {
422         case ICMP_REDIRECT:
423                 do_redirect(icmp_skb, sk);
424                 goto out;
425         case ICMP_SOURCE_QUENCH:
426                 /* Just silently ignore these. */
427                 goto out;
428         case ICMP_PARAMETERPROB:
429                 err = EPROTO;
430                 break;
431         case ICMP_DEST_UNREACH:
432                 if (code > NR_ICMP_UNREACH)
433                         goto out;
434
435                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
436                         /* We are not interested in TCP_LISTEN and open_requests
437                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
438                          * they should go through unfragmented).
439                          */
440                         if (sk->sk_state == TCP_LISTEN)
441                                 goto out;
442
443                         tp->mtu_info = info;
444                         if (!sock_owned_by_user(sk)) {
445                                 tcp_v4_mtu_reduced(sk);
446                         } else {
447                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
448                                         sock_hold(sk);
449                         }
450                         goto out;
451                 }
452
453                 err = icmp_err_convert[code].errno;
454                 /* check if icmp_skb allows revert of backoff
455                  * (see draft-zimmermann-tcp-lcd) */
456                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
457                         break;
458                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
459                     !icsk->icsk_backoff || fastopen)
460                         break;
461
462                 if (sock_owned_by_user(sk))
463                         break;
464
465                 icsk->icsk_backoff--;
466                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
467                                                TCP_TIMEOUT_INIT;
468                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
469
470                 skb = tcp_write_queue_head(sk);
471                 BUG_ON(!skb);
472
473                 remaining = icsk->icsk_rto -
474                             min(icsk->icsk_rto,
475                                 tcp_time_stamp - tcp_skb_timestamp(skb));
476
477                 if (remaining) {
478                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
479                                                   remaining, TCP_RTO_MAX);
480                 } else {
481                         /* RTO revert clocked out retransmission.
482                          * Will retransmit now */
483                         tcp_retransmit_timer(sk);
484                 }
485
486                 break;
487         case ICMP_TIME_EXCEEDED:
488                 err = EHOSTUNREACH;
489                 break;
490         default:
491                 goto out;
492         }
493
494         switch (sk->sk_state) {
495         case TCP_SYN_SENT:
496         case TCP_SYN_RECV:
497                 /* Only in fast or simultaneous open. If a fast open socket
498                  * is already accepted it is treated as a connected one below.
499                  */
500                 if (fastopen && !fastopen->sk)
501                         break;
502
503                 if (!sock_owned_by_user(sk)) {
504                         sk->sk_err = err;
505
506                         sk->sk_error_report(sk);
507
508                         tcp_done(sk);
509                 } else {
510                         sk->sk_err_soft = err;
511                 }
512                 goto out;
513         }
514
515         /* If we've already connected we will keep trying
516          * until we time out, or the user gives up.
517          *
518          * rfc1122 4.2.3.9 allows us to consider as hard errors
519          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
520          * but it is obsoleted by pmtu discovery).
521          *
522          * Note that in the modern internet, where routing is unreliable
523          * and broken firewalls sit in every dark corner sending random
524          * errors ordered by their masters, even these two messages finally lose
525          * their original sense (even Linux sends invalid PORT_UNREACHs).
526          *
527          * Now we are in compliance with RFCs.
528          *                                                      --ANK (980905)
529          */
530
531         inet = inet_sk(sk);
532         if (!sock_owned_by_user(sk) && inet->recverr) {
533                 sk->sk_err = err;
534                 sk->sk_error_report(sk);
535         } else  { /* Only an error on timeout */
536                 sk->sk_err_soft = err;
537         }
538
539 out:
540         bh_unlock_sock(sk);
541         sock_put(sk);
542 }
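/* Illustrative worked example of the RTO revert above: suppose icsk_rto
 * corresponds to 6 seconds and the head-of-queue skb was transmitted
 * 2 seconds ago.  Then
 *
 *	remaining = 6s - min(6s, 2s) = 4s
 *
 * and the retransmit timer is simply re-armed for 4 seconds.  Had 7 seconds
 * already elapsed, min() would clamp to icsk_rto, remaining would be 0, and
 * tcp_retransmit_timer() would retransmit immediately.
 */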
543
544 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
545 {
546         struct tcphdr *th = tcp_hdr(skb);
547
548         if (skb->ip_summed == CHECKSUM_PARTIAL) {
549                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
550                 skb->csum_start = skb_transport_header(skb) - skb->head;
551                 skb->csum_offset = offsetof(struct tcphdr, check);
552         } else {
553                 th->check = tcp_v4_check(skb->len, saddr, daddr,
554                                          csum_partial(th,
555                                                       th->doff << 2,
556                                                       skb->csum));
557         }
558 }
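/* Note: with CHECKSUM_PARTIAL the hardware (or GSO) finishes the checksum,
 * so only the pseudo-header sum is stored in th->check and
 * csum_start/csum_offset tell the device where to fold in the final result;
 * otherwise the full checksum over header and payload is computed in
 * software via csum_partial().
 */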
559
560 /* This routine computes an IPv4 TCP checksum. */
561 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
562 {
563         const struct inet_sock *inet = inet_sk(sk);
564
565         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
566 }
567 EXPORT_SYMBOL(tcp_v4_send_check);
568
569 /*
570  *      This routine will send an RST to the other tcp.
571  *
572  *      Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
573  *                    for the reset?
574  *      Answer: if a packet caused an RST, it is not for a socket
575  *              existing in our system; if it is matched to a socket,
576  *              it is just a duplicate segment or a bug in the other side's TCP.
577  *              So we build the reply based only on the parameters
578  *              that arrived with the segment.
579  *      Exception: precedence violation. We do not implement it in any case.
580  */
581
582 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
583 {
584         const struct tcphdr *th = tcp_hdr(skb);
585         struct {
586                 struct tcphdr th;
587 #ifdef CONFIG_TCP_MD5SIG
588                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
589 #endif
590         } rep;
591         struct ip_reply_arg arg;
592 #ifdef CONFIG_TCP_MD5SIG
593         struct tcp_md5sig_key *key;
594         const __u8 *hash_location = NULL;
595         unsigned char newhash[16];
596         int genhash;
597         struct sock *sk1 = NULL;
598 #endif
599         struct net *net;
600
601         /* Never send a reset in response to a reset. */
602         if (th->rst)
603                 return;
604
605         /* If sk is not NULL, it means we did a successful lookup and the incoming
606          * route had to be correct. prequeue might have dropped our dst.
607          */
608         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
609                 return;
610
611         /* Swap the send and the receive. */
612         memset(&rep, 0, sizeof(rep));
613         rep.th.dest   = th->source;
614         rep.th.source = th->dest;
615         rep.th.doff   = sizeof(struct tcphdr) / 4;
616         rep.th.rst    = 1;
617
618         if (th->ack) {
619                 rep.th.seq = th->ack_seq;
620         } else {
621                 rep.th.ack = 1;
622                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
623                                        skb->len - (th->doff << 2));
624         }
625
626         memset(&arg, 0, sizeof(arg));
627         arg.iov[0].iov_base = (unsigned char *)&rep;
628         arg.iov[0].iov_len  = sizeof(rep.th);
629
630         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
631 #ifdef CONFIG_TCP_MD5SIG
632         hash_location = tcp_parse_md5sig_option(th);
633         if (!sk && hash_location) {
634                 /*
635                  * The active side is lost. Try to find the listening socket through
636                  * the source port, and then find the md5 key through the listening socket.
637                  * We do not lose security here:
638                  * the incoming packet is checked against the md5 hash of the found key,
639                  * and no RST is generated if the md5 hash doesn't match.
640                  */
641                 sk1 = __inet_lookup_listener(net,
642                                              &tcp_hashinfo, ip_hdr(skb)->saddr,
643                                              th->source, ip_hdr(skb)->daddr,
644                                              ntohs(th->source), inet_iif(skb));
645                 /* don't send an RST if we can't find the key */
646                 if (!sk1)
647                         return;
648                 rcu_read_lock();
649                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
650                                         &ip_hdr(skb)->saddr, AF_INET);
651                 if (!key)
652                         goto release_sk1;
653
654                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
655                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
656                         goto release_sk1;
657         } else {
658                 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
659                                              &ip_hdr(skb)->saddr,
660                                              AF_INET) : NULL;
661         }
662
663         if (key) {
664                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
665                                    (TCPOPT_NOP << 16) |
666                                    (TCPOPT_MD5SIG << 8) |
667                                    TCPOLEN_MD5SIG);
668                 /* Update length and the length the header thinks exists */
669                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
670                 rep.th.doff = arg.iov[0].iov_len / 4;
671
672                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
673                                      key, ip_hdr(skb)->saddr,
674                                      ip_hdr(skb)->daddr, &rep.th);
675         }
676 #endif
677         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
678                                       ip_hdr(skb)->saddr, /* XXX */
679                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
680         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
681         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
682         /* When the socket is gone, all binding information is lost and
683          * routing might fail in this case. No choice here: if we chose to force
684          * the input interface, we would misroute in case of an asymmetric route.
685          */
686         if (sk)
687                 arg.bound_dev_if = sk->sk_bound_dev_if;
688
689         arg.tos = ip_hdr(skb)->tos;
690         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
691                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
692                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
693                               &arg, arg.iov[0].iov_len);
694
695         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
696         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
697
698 #ifdef CONFIG_TCP_MD5SIG
699 release_sk1:
700         if (sk1) {
701                 rcu_read_unlock();
702                 sock_put(sk1);
703         }
704 #endif
705 }
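/* Illustrative worked example for the ack_seq computed above: for a reset to
 * a non-ACK segment, the RST acknowledges everything the peer sent.  An
 * incoming segment with SEQ = 1000, 100 bytes of payload and neither SYN nor
 * FIN set is answered with an RST carrying ACK = 1000 + 0 + 0 + 100 = 1100.
 */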
706
707 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
708    outside of socket context, is certainly ugly. What can I do?
709  */
710
711 static void tcp_v4_send_ack(struct net *net,
712                             struct sk_buff *skb, u32 seq, u32 ack,
713                             u32 win, u32 tsval, u32 tsecr, int oif,
714                             struct tcp_md5sig_key *key,
715                             int reply_flags, u8 tos)
716 {
717         const struct tcphdr *th = tcp_hdr(skb);
718         struct {
719                 struct tcphdr th;
720                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
721 #ifdef CONFIG_TCP_MD5SIG
722                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
723 #endif
724                         ];
725         } rep;
726         struct ip_reply_arg arg;
727
728         memset(&rep.th, 0, sizeof(struct tcphdr));
729         memset(&arg, 0, sizeof(arg));
730
731         arg.iov[0].iov_base = (unsigned char *)&rep;
732         arg.iov[0].iov_len  = sizeof(rep.th);
733         if (tsecr) {
734                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
735                                    (TCPOPT_TIMESTAMP << 8) |
736                                    TCPOLEN_TIMESTAMP);
737                 rep.opt[1] = htonl(tsval);
738                 rep.opt[2] = htonl(tsecr);
739                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
740         }
741
742         /* Swap the send and the receive. */
743         rep.th.dest    = th->source;
744         rep.th.source  = th->dest;
745         rep.th.doff    = arg.iov[0].iov_len / 4;
746         rep.th.seq     = htonl(seq);
747         rep.th.ack_seq = htonl(ack);
748         rep.th.ack     = 1;
749         rep.th.window  = htons(win);
750
751 #ifdef CONFIG_TCP_MD5SIG
752         if (key) {
753                 int offset = (tsecr) ? 3 : 0;
754
755                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
756                                           (TCPOPT_NOP << 16) |
757                                           (TCPOPT_MD5SIG << 8) |
758                                           TCPOLEN_MD5SIG);
759                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
760                 rep.th.doff = arg.iov[0].iov_len/4;
761
762                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
763                                     key, ip_hdr(skb)->saddr,
764                                     ip_hdr(skb)->daddr, &rep.th);
765         }
766 #endif
767         arg.flags = reply_flags;
768         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
769                                       ip_hdr(skb)->saddr, /* XXX */
770                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
771         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
772         if (oif)
773                 arg.bound_dev_if = oif;
774         arg.tos = tos;
775         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
776                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
777                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
778                               &arg, arg.iov[0].iov_len);
779
780         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
781 }
782
783 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
784 {
785         struct inet_timewait_sock *tw = inet_twsk(sk);
786         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
787
788         tcp_v4_send_ack(sock_net(sk), skb,
789                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
790                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
791                         tcp_time_stamp + tcptw->tw_ts_offset,
792                         tcptw->tw_ts_recent,
793                         tw->tw_bound_dev_if,
794                         tcp_twsk_md5_key(tcptw),
795                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
796                         tw->tw_tos
797                         );
798
799         inet_twsk_put(tw);
800 }
801
802 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
803                                   struct request_sock *req)
804 {
805         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
806          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
807          */
808         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
809                                              tcp_sk(sk)->snd_nxt;
810
811         /* RFC 7323 2.3
812          * The window field (SEG.WND) of every outgoing segment, with the
813          * exception of <SYN> segments, MUST be right-shifted by
814          * Rcv.Wind.Shift bits:
815          */
816         tcp_v4_send_ack(sock_net(sk), skb, seq,
817                         tcp_rsk(req)->rcv_nxt,
818                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
819                         tcp_time_stamp,
820                         req->ts_recent,
821                         0,
822                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
823                                           AF_INET),
824                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
825                         ip_hdr(skb)->tos);
826 }
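/* Illustrative worked example of the RFC 7323 rule above: with
 * rcv_wscale = 7 and req->rsk_rcv_wnd = 64000, the window field carried in
 * this ACK is 64000 >> 7 = 500; the peer scales it back up on receipt.
 */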
827
828 /*
829  *      Send a SYN-ACK after having received a SYN.
830  *      This still operates on a request_sock only, not on a big
831  *      socket.
832  */
833 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
834                               struct flowi *fl,
835                               struct request_sock *req,
836                               struct tcp_fastopen_cookie *foc,
837                               bool attach_req)
838 {
839         const struct inet_request_sock *ireq = inet_rsk(req);
840         struct flowi4 fl4;
841         int err = -1;
842         struct sk_buff *skb;
843
844         /* First, grab a route. */
845         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
846                 return -1;
847
848         skb = tcp_make_synack(sk, dst, req, foc, attach_req);
849
850         if (skb) {
851                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
852
853                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
854                                             ireq->ir_rmt_addr,
855                                             ireq->opt);
856                 err = net_xmit_eval(err);
857         }
858
859         return err;
860 }
861
862 /*
863  *      IPv4 request_sock destructor.
864  */
865 static void tcp_v4_reqsk_destructor(struct request_sock *req)
866 {
867         kfree(inet_rsk(req)->opt);
868 }
869
870
871 #ifdef CONFIG_TCP_MD5SIG
872 /*
873  * RFC2385 MD5 checksumming requires a mapping of
874  * IP address->MD5 Key.
875  * We need to maintain these in the sk structure.
876  */
877
878 /* Find the Key structure for an address.  */
879 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
880                                          const union tcp_md5_addr *addr,
881                                          int family)
882 {
883         const struct tcp_sock *tp = tcp_sk(sk);
884         struct tcp_md5sig_key *key;
885         unsigned int size = sizeof(struct in_addr);
886         const struct tcp_md5sig_info *md5sig;
887
888         /* caller either holds rcu_read_lock() or socket lock */
889         md5sig = rcu_dereference_check(tp->md5sig_info,
890                                        sock_owned_by_user(sk) ||
891                                        lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
892         if (!md5sig)
893                 return NULL;
894 #if IS_ENABLED(CONFIG_IPV6)
895         if (family == AF_INET6)
896                 size = sizeof(struct in6_addr);
897 #endif
898         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
899                 if (key->family != family)
900                         continue;
901                 if (!memcmp(&key->addr, addr, size))
902                         return key;
903         }
904         return NULL;
905 }
906 EXPORT_SYMBOL(tcp_md5_do_lookup);
907
908 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
909                                          const struct sock *addr_sk)
910 {
911         const union tcp_md5_addr *addr;
912
913         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
914         return tcp_md5_do_lookup(sk, addr, AF_INET);
915 }
916 EXPORT_SYMBOL(tcp_v4_md5_lookup);
917
918 /* This can be called on a newly created socket, from other files */
919 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
920                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
921 {
922         /* Add Key to the list */
923         struct tcp_md5sig_key *key;
924         struct tcp_sock *tp = tcp_sk(sk);
925         struct tcp_md5sig_info *md5sig;
926
927         key = tcp_md5_do_lookup(sk, addr, family);
928         if (key) {
929                 /* Pre-existing entry - just update that one. */
930                 memcpy(key->key, newkey, newkeylen);
931                 key->keylen = newkeylen;
932                 return 0;
933         }
934
935         md5sig = rcu_dereference_protected(tp->md5sig_info,
936                                            sock_owned_by_user(sk) ||
937                                            lockdep_is_held(&sk->sk_lock.slock));
938         if (!md5sig) {
939                 md5sig = kmalloc(sizeof(*md5sig), gfp);
940                 if (!md5sig)
941                         return -ENOMEM;
942
943                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
944                 INIT_HLIST_HEAD(&md5sig->head);
945                 rcu_assign_pointer(tp->md5sig_info, md5sig);
946         }
947
948         key = sock_kmalloc(sk, sizeof(*key), gfp);
949         if (!key)
950                 return -ENOMEM;
951         if (!tcp_alloc_md5sig_pool()) {
952                 sock_kfree_s(sk, key, sizeof(*key));
953                 return -ENOMEM;
954         }
955
956         memcpy(key->key, newkey, newkeylen);
957         key->keylen = newkeylen;
958         key->family = family;
959         memcpy(&key->addr, addr,
960                (family == AF_INET6) ? sizeof(struct in6_addr) :
961                                       sizeof(struct in_addr));
962         hlist_add_head_rcu(&key->node, &md5sig->head);
963         return 0;
964 }
965 EXPORT_SYMBOL(tcp_md5_do_add);
966
967 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
968 {
969         struct tcp_md5sig_key *key;
970
971         key = tcp_md5_do_lookup(sk, addr, family);
972         if (!key)
973                 return -ENOENT;
974         hlist_del_rcu(&key->node);
975         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
976         kfree_rcu(key, rcu);
977         return 0;
978 }
979 EXPORT_SYMBOL(tcp_md5_do_del);
980
981 static void tcp_clear_md5_list(struct sock *sk)
982 {
983         struct tcp_sock *tp = tcp_sk(sk);
984         struct tcp_md5sig_key *key;
985         struct hlist_node *n;
986         struct tcp_md5sig_info *md5sig;
987
988         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
989
990         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
991                 hlist_del_rcu(&key->node);
992                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
993                 kfree_rcu(key, rcu);
994         }
995 }
996
997 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
998                                  int optlen)
999 {
1000         struct tcp_md5sig cmd;
1001         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1002
1003         if (optlen < sizeof(cmd))
1004                 return -EINVAL;
1005
1006         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1007                 return -EFAULT;
1008
1009         if (sin->sin_family != AF_INET)
1010                 return -EINVAL;
1011
1012         if (!cmd.tcpm_keylen)
1013                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1014                                       AF_INET);
1015
1016         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1017                 return -EINVAL;
1018
1019         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1020                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1021                               GFP_KERNEL);
1022 }
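/* Illustration (not part of the original file): keys arrive here via the
 * TCP_MD5SIG socket option.  A userspace sketch, where fd is an IPv4 TCP
 * socket and the peer address and secret are made up:
 *
 *	struct tcp_md5sig md5;
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	memset(&md5, 0, sizeof(md5));
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	md5.tcpm_keylen = strlen("secret");
 *	memcpy(md5.tcpm_key, "secret", md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */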
1023
1024 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1025                                         __be32 daddr, __be32 saddr, int nbytes)
1026 {
1027         struct tcp4_pseudohdr *bp;
1028         struct scatterlist sg;
1029
1030         bp = &hp->md5_blk.ip4;
1031
1032         /*
1033          * 1. the TCP pseudo-header (in the order: source IP address,
1034          * destination IP address, zero-padded protocol number, and
1035          * segment length)
1036          */
1037         bp->saddr = saddr;
1038         bp->daddr = daddr;
1039         bp->pad = 0;
1040         bp->protocol = IPPROTO_TCP;
1041         bp->len = cpu_to_be16(nbytes);
1042
1043         sg_init_one(&sg, bp, sizeof(*bp));
1044         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1045 }
1046
1047 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1048                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1049 {
1050         struct tcp_md5sig_pool *hp;
1051         struct hash_desc *desc;
1052
1053         hp = tcp_get_md5sig_pool();
1054         if (!hp)
1055                 goto clear_hash_noput;
1056         desc = &hp->md5_desc;
1057
1058         if (crypto_hash_init(desc))
1059                 goto clear_hash;
1060         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1061                 goto clear_hash;
1062         if (tcp_md5_hash_header(hp, th))
1063                 goto clear_hash;
1064         if (tcp_md5_hash_key(hp, key))
1065                 goto clear_hash;
1066         if (crypto_hash_final(desc, md5_hash))
1067                 goto clear_hash;
1068
1069         tcp_put_md5sig_pool();
1070         return 0;
1071
1072 clear_hash:
1073         tcp_put_md5sig_pool();
1074 clear_hash_noput:
1075         memset(md5_hash, 0, 16);
1076         return 1;
1077 }
1078
1079 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1080                         const struct sock *sk,
1081                         const struct sk_buff *skb)
1082 {
1083         struct tcp_md5sig_pool *hp;
1084         struct hash_desc *desc;
1085         const struct tcphdr *th = tcp_hdr(skb);
1086         __be32 saddr, daddr;
1087
1088         if (sk) { /* valid for establish/request sockets */
1089                 saddr = sk->sk_rcv_saddr;
1090                 daddr = sk->sk_daddr;
1091         } else {
1092                 const struct iphdr *iph = ip_hdr(skb);
1093                 saddr = iph->saddr;
1094                 daddr = iph->daddr;
1095         }
1096
1097         hp = tcp_get_md5sig_pool();
1098         if (!hp)
1099                 goto clear_hash_noput;
1100         desc = &hp->md5_desc;
1101
1102         if (crypto_hash_init(desc))
1103                 goto clear_hash;
1104
1105         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1106                 goto clear_hash;
1107         if (tcp_md5_hash_header(hp, th))
1108                 goto clear_hash;
1109         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1110                 goto clear_hash;
1111         if (tcp_md5_hash_key(hp, key))
1112                 goto clear_hash;
1113         if (crypto_hash_final(desc, md5_hash))
1114                 goto clear_hash;
1115
1116         tcp_put_md5sig_pool();
1117         return 0;
1118
1119 clear_hash:
1120         tcp_put_md5sig_pool();
1121 clear_hash_noput:
1122         memset(md5_hash, 0, 16);
1123         return 1;
1124 }
1125 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1126
1127 #endif
1128
1129 /* Called with rcu_read_lock() */
1130 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1131                                     const struct sk_buff *skb)
1132 {
1133 #ifdef CONFIG_TCP_MD5SIG
1134         /*
1135          * This gets called for each TCP segment that arrives
1136          * so we want to be efficient.
1137          * We have 3 drop cases:
1138          * o No MD5 hash and one expected.
1139          * o MD5 hash and we're not expecting one.
1140          * o MD5 hash and it's wrong.
1141          */
1142         const __u8 *hash_location = NULL;
1143         struct tcp_md5sig_key *hash_expected;
1144         const struct iphdr *iph = ip_hdr(skb);
1145         const struct tcphdr *th = tcp_hdr(skb);
1146         int genhash;
1147         unsigned char newhash[16];
1148
1149         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1150                                           AF_INET);
1151         hash_location = tcp_parse_md5sig_option(th);
1152
1153         /* We've parsed the options - do we have a hash? */
1154         if (!hash_expected && !hash_location)
1155                 return false;
1156
1157         if (hash_expected && !hash_location) {
1158                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1159                 return true;
1160         }
1161
1162         if (!hash_expected && hash_location) {
1163                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1164                 return true;
1165         }
1166
1167         /* Okay, so this is hash_expected and hash_location -
1168          * so we need to calculate the checksum.
1169          */
1170         genhash = tcp_v4_md5_hash_skb(newhash,
1171                                       hash_expected,
1172                                       NULL, skb);
1173
1174         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1175                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1176                                      &iph->saddr, ntohs(th->source),
1177                                      &iph->daddr, ntohs(th->dest),
1178                                      genhash ? " tcp_v4_calc_md5_hash failed"
1179                                      : "");
1180                 return true;
1181         }
1182         return false;
1183 #endif
1184         return false;
1185 }
1186
1187 static void tcp_v4_init_req(struct request_sock *req,
1188                             const struct sock *sk_listener,
1189                             struct sk_buff *skb)
1190 {
1191         struct inet_request_sock *ireq = inet_rsk(req);
1192
1193         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1194         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1195         ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1196         ireq->opt = tcp_v4_save_options(skb);
1197 }
1198
1199 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1200                                           struct flowi *fl,
1201                                           const struct request_sock *req,
1202                                           bool *strict)
1203 {
1204         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1205
1206         if (strict) {
1207                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1208                         *strict = true;
1209                 else
1210                         *strict = false;
1211         }
1212
1213         return dst;
1214 }
1215
1216 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1217         .family         =       PF_INET,
1218         .obj_size       =       sizeof(struct tcp_request_sock),
1219         .rtx_syn_ack    =       tcp_rtx_synack,
1220         .send_ack       =       tcp_v4_reqsk_send_ack,
1221         .destructor     =       tcp_v4_reqsk_destructor,
1222         .send_reset     =       tcp_v4_send_reset,
1223         .syn_ack_timeout =      tcp_syn_ack_timeout,
1224 };
1225
1226 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1227         .mss_clamp      =       TCP_MSS_DEFAULT,
1228 #ifdef CONFIG_TCP_MD5SIG
1229         .req_md5_lookup =       tcp_v4_md5_lookup,
1230         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1231 #endif
1232         .init_req       =       tcp_v4_init_req,
1233 #ifdef CONFIG_SYN_COOKIES
1234         .cookie_init_seq =      cookie_v4_init_sequence,
1235 #endif
1236         .route_req      =       tcp_v4_route_req,
1237         .init_seq       =       tcp_v4_init_sequence,
1238         .send_synack    =       tcp_v4_send_synack,
1239 };
1240
1241 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1242 {
1243         /* Never answer SYNs sent to broadcast or multicast */
1244         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1245                 goto drop;
1246
1247         return tcp_conn_request(&tcp_request_sock_ops,
1248                                 &tcp_request_sock_ipv4_ops, sk, skb);
1249
1250 drop:
1251         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1252         return 0;
1253 }
1254 EXPORT_SYMBOL(tcp_v4_conn_request);
1255
1256
1257 /*
1258  * The three way handshake has completed - we got a valid synack -
1259  * now create the new socket.
1260  */
1261 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1262                                   struct request_sock *req,
1263                                   struct dst_entry *dst,
1264                                   struct request_sock *req_unhash,
1265                                   bool *own_req)
1266 {
1267         struct inet_request_sock *ireq;
1268         struct inet_sock *newinet;
1269         struct tcp_sock *newtp;
1270         struct sock *newsk;
1271 #ifdef CONFIG_TCP_MD5SIG
1272         struct tcp_md5sig_key *key;
1273 #endif
1274         struct ip_options_rcu *inet_opt;
1275
1276         if (sk_acceptq_is_full(sk))
1277                 goto exit_overflow;
1278
1279         newsk = tcp_create_openreq_child(sk, req, skb);
1280         if (!newsk)
1281                 goto exit_nonewsk;
1282
1283         newsk->sk_gso_type = SKB_GSO_TCPV4;
1284         inet_sk_rx_dst_set(newsk, skb);
1285
1286         newtp                 = tcp_sk(newsk);
1287         newinet               = inet_sk(newsk);
1288         ireq                  = inet_rsk(req);
1289         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1290         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1291         newinet->inet_saddr           = ireq->ir_loc_addr;
1292         inet_opt              = ireq->opt;
1293         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1294         ireq->opt             = NULL;
1295         newinet->mc_index     = inet_iif(skb);
1296         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1297         newinet->rcv_tos      = ip_hdr(skb)->tos;
1298         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1299         if (inet_opt)
1300                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1301         newinet->inet_id = newtp->write_seq ^ jiffies;
1302
1303         if (!dst) {
1304                 dst = inet_csk_route_child_sock(sk, newsk, req);
1305                 if (!dst)
1306                         goto put_and_exit;
1307         } else {
1308                 /* syncookie case : see end of cookie_v4_check() */
1309         }
1310         sk_setup_caps(newsk, dst);
1311
1312         tcp_ca_openreq_child(newsk, dst);
1313
1314         tcp_sync_mss(newsk, dst_mtu(dst));
1315         newtp->advmss = dst_metric_advmss(dst);
1316         if (tcp_sk(sk)->rx_opt.user_mss &&
1317             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1318                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1319
1320         tcp_initialize_rcv_mss(newsk);
1321
1322 #ifdef CONFIG_TCP_MD5SIG
1323         /* Copy over the MD5 key from the original socket */
1324         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1325                                 AF_INET);
1326         if (key) {
1327                 /*
1328                  * We're using one, so create a matching key
1329                  * on the newsk structure. If we fail to get
1330                  * memory, then we end up not copying the key
1331                  * across. Shucks.
1332                  */
1333                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1334                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1335                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1336         }
1337 #endif
1338
1339         if (__inet_inherit_port(sk, newsk) < 0)
1340                 goto put_and_exit;
1341         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1342         if (*own_req)
1343                 tcp_move_syn(newtp, req);
1344
1345         return newsk;
1346
1347 exit_overflow:
1348         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1349 exit_nonewsk:
1350         dst_release(dst);
1351 exit:
1352         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1353         return NULL;
1354 put_and_exit:
1355         inet_csk_prepare_forced_close(newsk);
1356         tcp_done(newsk);
1357         goto exit;
1358 }
1359 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1360
1361 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1362 {
1363 #ifdef CONFIG_SYN_COOKIES
1364         const struct tcphdr *th = tcp_hdr(skb);
1365
1366         if (!th->syn)
1367                 sk = cookie_v4_check(sk, skb);
1368 #endif
1369         return sk;
1370 }
1371
1372 /* The socket must have its spinlock held when we get
1373  * here, unless it is a TCP_LISTEN socket.
1374  *
1375  * We have a potential double-lock case here, so even when
1376  * doing backlog processing we use the BH locking scheme.
1377  * This is because we cannot sleep with the original spinlock
1378  * held.
1379  */
1380 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1381 {
1382         struct sock *rsk;
1383
1384         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1385                 struct dst_entry *dst = sk->sk_rx_dst;
1386
1387                 sock_rps_save_rxhash(sk, skb);
1388                 sk_mark_napi_id(sk, skb);
1389                 if (dst) {
1390                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1391                             !dst->ops->check(dst, 0)) {
1392                                 dst_release(dst);
1393                                 sk->sk_rx_dst = NULL;
1394                         }
1395                 }
1396                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1397                 return 0;
1398         }
1399
1400         if (tcp_checksum_complete(skb))
1401                 goto csum_err;
1402
1403         if (sk->sk_state == TCP_LISTEN) {
1404                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1405
1406                 if (!nsk)
1407                         goto discard;
1408                 if (nsk != sk) {
1409                         sock_rps_save_rxhash(nsk, skb);
1410                         sk_mark_napi_id(nsk, skb);
1411                         if (tcp_child_process(sk, nsk, skb)) {
1412                                 rsk = nsk;
1413                                 goto reset;
1414                         }
1415                         return 0;
1416                 }
1417         } else
1418                 sock_rps_save_rxhash(sk, skb);
1419
1420         if (tcp_rcv_state_process(sk, skb)) {
1421                 rsk = sk;
1422                 goto reset;
1423         }
1424         return 0;
1425
1426 reset:
1427         tcp_v4_send_reset(rsk, skb);
1428 discard:
1429         kfree_skb(skb);
1430         /* Be careful here. If this function gets more complicated and
1431          * gcc suffers from register pressure on the x86, sk (in %ebx)
1432          * might be destroyed here. This current version compiles correctly,
1433          * but you have been warned.
1434          */
1435         return 0;
1436
1437 csum_err:
1438         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1439         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1440         goto discard;
1441 }
1442 EXPORT_SYMBOL(tcp_v4_do_rcv);
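
/*
 * Illustrative user-space analogy (not part of this file's build, and NOT
 * the kernel locking API): a minimal sketch of why softirq-context receive
 * defers work to a backlog instead of waiting for the lock owner - the real
 * code holds a spinlock here and must never sleep.  The "owned" flag plays
 * the role of sock_owned_by_user(), the mutex stands in for bh_lock_sock(),
 * and all names below are hypothetical.
 */
#if 0
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t bh_lock = PTHREAD_MUTEX_INITIALIZER;
static bool owned;		/* set while a reader "owns" the socket        */
static int backlog[16], blen;	/* packets deferred while the socket is owned */

static void process(int pkt)
{
	printf("processed packet %d\n", pkt);
}

static void rcv(int pkt)	/* softirq-like context: must never sleep      */
{
	pthread_mutex_lock(&bh_lock);
	if (!owned)
		process(pkt);		/* fast path: handle it right away     */
	else if (blen < 16)
		backlog[blen++] = pkt;	/* defer: the owner will replay it     */
	pthread_mutex_unlock(&bh_lock);
}

static void release(void)	/* process context giving the socket back      */
{
	pthread_mutex_lock(&bh_lock);
	for (int i = 0; i < blen; i++)
		process(backlog[i]);	/* replay deferred packets, in order   */
	blen = 0;
	owned = false;
	pthread_mutex_unlock(&bh_lock);
}

int main(void)
{
	owned = true;	/* a reader holds the socket, as lock_sock() would     */
	rcv(1);		/* arrives while owned -> queued to the backlog        */
	release();	/* backlog replayed here, then ownership dropped       */
	rcv(2);		/* socket no longer owned -> processed directly        */
	return 0;
}
#endif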
1443
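/* Early demultiplex: called from the IP receive path before routing.  If the
 * segment matches an established socket, attach that socket to the skb and,
 * when it is still valid for the ingress device, reuse the socket's cached
 * input route so the regular socket lookup and route lookup can be skipped.
 */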
1444 void tcp_v4_early_demux(struct sk_buff *skb)
1445 {
1446         const struct iphdr *iph;
1447         const struct tcphdr *th;
1448         struct sock *sk;
1449
1450         if (skb->pkt_type != PACKET_HOST)
1451                 return;
1452
1453         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1454                 return;
1455
1456         iph = ip_hdr(skb);
1457         th = tcp_hdr(skb);
1458
1459         if (th->doff < sizeof(struct tcphdr) / 4)
1460                 return;
1461
1462         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1463                                        iph->saddr, th->source,
1464                                        iph->daddr, ntohs(th->dest),
1465                                        skb->skb_iif);
1466         if (sk) {
1467                 skb->sk = sk;
1468                 skb->destructor = sock_edemux;
1469                 if (sk_fullsock(sk)) {
1470                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1471
1472                         if (dst)
1473                                 dst = dst_check(dst, 0);
1474                         if (dst &&
1475                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1476                                 skb_dst_set_noref(skb, dst);
1477                 }
1478         }
1479 }
1480
1481 /* The packet is added to the VJ-style prequeue for processing in
1482  * process context, if a reader task is waiting.  Apparently this
1483  * exciting idea (VJ's mail "Re: query about TCP header on tcp-ip"
1484  * of 07 Sep 93) failed somewhere: latency? burstiness?  Well, at
1485  * least now we will see why it failed. 8)8)              --ANK
1486  *
1487  */
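/* Prequeueing only happens when a task is blocked in tcp_recvmsg()
 * (tp->ucopy.task is set) on an established socket and the
 * net.ipv4.tcp_low_latency sysctl is 0; everything else takes the regular
 * backlog / receive-queue path in tcp_v4_rcv().
 */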
1488 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1489 {
1490         struct tcp_sock *tp = tcp_sk(sk);
1491
1492         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1493                 return false;
1494
1495         if (skb->len <= tcp_hdrlen(skb) &&
1496             skb_queue_len(&tp->ucopy.prequeue) == 0)
1497                 return false;
1498
1499         /* Before escaping RCU protected region, we need to take care of skb
1500          * dst. Prequeue is only enabled for established sockets.
1501          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1502          * Instead of doing full sk_rx_dst validity here, let's perform
1503          * an optimistic check.
1504          */
1505         if (likely(sk->sk_rx_dst))
1506                 skb_dst_drop(skb);
1507         else
1508                 skb_dst_force_safe(skb);
1509
1510         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1511         tp->ucopy.memory += skb->truesize;
1512         if (tp->ucopy.memory > sk->sk_rcvbuf) {
1513                 struct sk_buff *skb1;
1514
1515                 BUG_ON(sock_owned_by_user(sk));
1516
1517                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1518                         sk_backlog_rcv(sk, skb1);
1519                         NET_INC_STATS_BH(sock_net(sk),
1520                                          LINUX_MIB_TCPPREQUEUEDROPPED);
1521                 }
1522
1523                 tp->ucopy.memory = 0;
1524         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1525                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1526                                            POLLIN | POLLRDNORM | POLLRDBAND);
1527                 if (!inet_csk_ack_scheduled(sk))
1528                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1529                                                   (3 * tcp_rto_min(sk)) / 4,
1530                                                   TCP_RTO_MAX);
1531         }
1532         return true;
1533 }
1534 EXPORT_SYMBOL(tcp_prequeue);
1535
1536 /*
1537  *      From tcp_input.c
1538  */
1539
1540 int tcp_v4_rcv(struct sk_buff *skb)
1541 {
1542         const struct iphdr *iph;
1543         const struct tcphdr *th;
1544         struct sock *sk;
1545         int ret;
1546         struct net *net = dev_net(skb->dev);
1547
1548         if (skb->pkt_type != PACKET_HOST)
1549                 goto discard_it;
1550
1551         /* Count it even if it's bad */
1552         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1553
1554         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1555                 goto discard_it;
1556
1557         th = tcp_hdr(skb);
1558
1559         if (th->doff < sizeof(struct tcphdr) / 4)
1560                 goto bad_packet;
1561         if (!pskb_may_pull(skb, th->doff * 4))
1562                 goto discard_it;
1563
1564         /* An explanation is required here: packet length and doff are
1565          * validated later by header prediction, provided that the
1566          * th->doff == 0 case has been eliminated above, so the
1567          * remaining checks are deferred. */
1568
1569         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1570                 goto csum_error;
1571
1572         th = tcp_hdr(skb);
1573         iph = ip_hdr(skb);
1574         /* This is tricky: we move the IPCB to its correct location inside
1575          * TCP_SKB_CB(); barrier() makes sure the compiler won't play aliasing games.
1576          */
1577         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1578                 sizeof(struct inet_skb_parm));
1579         barrier();
1580
1581         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1582         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1583                                     skb->len - th->doff * 4);
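        /*
         * Worked example: a segment with seq 1000, doff 5 (20-byte header),
         * skb->len 120 (100 bytes of payload) and FIN set gives
         * end_seq = 1000 + 0 + 1 + (120 - 20) = 1101; SYN and FIN each
         * consume one unit of sequence space.
         */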
1584         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1585         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1586         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1587         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1588         TCP_SKB_CB(skb)->sacked  = 0;
1589
1590 lookup:
1591         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1592         if (!sk)
1593                 goto no_tcp_socket;
1594
1595 process:
1596         if (sk->sk_state == TCP_TIME_WAIT)
1597                 goto do_time_wait;
1598
1599         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1600                 struct request_sock *req = inet_reqsk(sk);
1601                 struct sock *nsk;
1602
1603                 sk = req->rsk_listener;
1604                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1605                         reqsk_put(req);
1606                         goto discard_it;
1607                 }
1608                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1609                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1610                         goto lookup;
1611                 }
1612                 sock_hold(sk);
1613                 nsk = tcp_check_req(sk, skb, req, false);
1614                 if (!nsk) {
1615                         reqsk_put(req);
1616                         goto discard_and_relse;
1617                 }
1618                 if (nsk == sk) {
1619                         reqsk_put(req);
1620                 } else if (tcp_child_process(sk, nsk, skb)) {
1621                         tcp_v4_send_reset(nsk, skb);
1622                         goto discard_and_relse;
1623                 } else {
1624                         sock_put(sk);
1625                         return 0;
1626                 }
1627         }
1628         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1629                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1630                 goto discard_and_relse;
1631         }
1632
1633         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1634                 goto discard_and_relse;
1635
1636         if (tcp_v4_inbound_md5_hash(sk, skb))
1637                 goto discard_and_relse;
1638
1639         nf_reset(skb);
1640
1641         if (sk_filter(sk, skb))
1642                 goto discard_and_relse;
1643
1644         skb->dev = NULL;
1645
1646         if (sk->sk_state == TCP_LISTEN) {
1647                 ret = tcp_v4_do_rcv(sk, skb);
1648                 goto put_and_return;
1649         }
1650
1651         sk_incoming_cpu_update(sk);
1652
1653         bh_lock_sock_nested(sk);
1654         tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1655         ret = 0;
1656         if (!sock_owned_by_user(sk)) {
1657                 if (!tcp_prequeue(sk, skb))
1658                         ret = tcp_v4_do_rcv(sk, skb);
1659         } else if (unlikely(sk_add_backlog(sk, skb,
1660                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1661                 bh_unlock_sock(sk);
1662                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1663                 goto discard_and_relse;
1664         }
1665         bh_unlock_sock(sk);
1666
1667 put_and_return:
1668         sock_put(sk);
1669
1670         return ret;
1671
1672 no_tcp_socket:
1673         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1674                 goto discard_it;
1675
1676         if (tcp_checksum_complete(skb)) {
1677 csum_error:
1678                 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1679 bad_packet:
1680                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1681         } else {
1682                 tcp_v4_send_reset(NULL, skb);
1683         }
1684
1685 discard_it:
1686         /* Discard frame. */
1687         kfree_skb(skb);
1688         return 0;
1689
1690 discard_and_relse:
1691         sock_put(sk);
1692         goto discard_it;
1693
1694 do_time_wait:
1695         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1696                 inet_twsk_put(inet_twsk(sk));
1697                 goto discard_it;
1698         }
1699
1700         if (tcp_checksum_complete(skb)) {
1701                 inet_twsk_put(inet_twsk(sk));
1702                 goto csum_error;
1703         }
1704         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1705         case TCP_TW_SYN: {
1706                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1707                                                         &tcp_hashinfo,
1708                                                         iph->saddr, th->source,
1709                                                         iph->daddr, th->dest,
1710                                                         inet_iif(skb));
1711                 if (sk2) {
1712                         inet_twsk_deschedule_put(inet_twsk(sk));
1713                         sk = sk2;
1714                         goto process;
1715                 }
1716                 /* Fall through to ACK */
1717         }
1718         case TCP_TW_ACK:
1719                 tcp_v4_timewait_ack(sk, skb);
1720                 break;
1721         case TCP_TW_RST:
1722                 goto no_tcp_socket;
1723         case TCP_TW_SUCCESS:;
1724         }
1725         goto discard_it;
1726 }
1727
1728 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1729         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1730         .twsk_unique    = tcp_twsk_unique,
1731         .twsk_destructor= tcp_twsk_destructor,
1732 };
1733
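/* Cache the route the segment arrived on, together with the ingress device
 * index, so that the established fast path in tcp_v4_do_rcv() and early
 * demux can reuse it instead of doing a fresh route lookup for every packet.
 */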
1734 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1735 {
1736         struct dst_entry *dst = skb_dst(skb);
1737
1738         if (dst && dst_hold_safe(dst)) {
1739                 sk->sk_rx_dst = dst;
1740                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1741         }
1742 }
1743 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1744
1745 const struct inet_connection_sock_af_ops ipv4_specific = {
1746         .queue_xmit        = ip_queue_xmit,
1747         .send_check        = tcp_v4_send_check,
1748         .rebuild_header    = inet_sk_rebuild_header,
1749         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1750         .conn_request      = tcp_v4_conn_request,
1751         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1752         .net_header_len    = sizeof(struct iphdr),
1753         .setsockopt        = ip_setsockopt,
1754         .getsockopt        = ip_getsockopt,
1755         .addr2sockaddr     = inet_csk_addr2sockaddr,
1756         .sockaddr_len      = sizeof(struct sockaddr_in),
1757         .bind_conflict     = inet_csk_bind_conflict,
1758 #ifdef CONFIG_COMPAT
1759         .compat_setsockopt = compat_ip_setsockopt,
1760         .compat_getsockopt = compat_ip_getsockopt,
1761 #endif
1762         .mtu_reduced       = tcp_v4_mtu_reduced,
1763 };
1764 EXPORT_SYMBOL(ipv4_specific);
1765
1766 #ifdef CONFIG_TCP_MD5SIG
1767 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1768         .md5_lookup             = tcp_v4_md5_lookup,
1769         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1770         .md5_parse              = tcp_v4_parse_md5_keys,
1771 };
1772 #endif
1773
1774 /* NOTE: Many fields are already zeroed explicitly by the call to
1775  *       sk_alloc(), so they need not be initialized here.
1776  */
1777 static int tcp_v4_init_sock(struct sock *sk)
1778 {
1779         struct inet_connection_sock *icsk = inet_csk(sk);
1780
1781         tcp_init_sock(sk);
1782
1783         icsk->icsk_af_ops = &ipv4_specific;
1784
1785 #ifdef CONFIG_TCP_MD5SIG
1786         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1787 #endif
1788
1789         return 0;
1790 }
1791
1792 void tcp_v4_destroy_sock(struct sock *sk)
1793 {
1794         struct tcp_sock *tp = tcp_sk(sk);
1795
1796         tcp_clear_xmit_timers(sk);
1797
1798         tcp_cleanup_congestion_control(sk);
1799
1800         /* Clean up the write buffer. */
1801         tcp_write_queue_purge(sk);
1802
1803         /* Cleans up our, hopefully empty, out_of_order_queue. */
1804         __skb_queue_purge(&tp->out_of_order_queue);
1805
1806 #ifdef CONFIG_TCP_MD5SIG
1807         /* Clean up the MD5 key list, if any */
1808         if (tp->md5sig_info) {
1809                 tcp_clear_md5_list(sk);
1810                 kfree_rcu(tp->md5sig_info, rcu);
1811                 tp->md5sig_info = NULL;
1812         }
1813 #endif
1814
1815         /* Clean the prequeue; it really should be empty by now. */
1816         __skb_queue_purge(&tp->ucopy.prequeue);
1817
1818         /* Clean up a referenced TCP bind bucket. */
1819         if (inet_csk(sk)->icsk_bind_hash)
1820                 inet_put_port(sk);
1821
1822         BUG_ON(tp->fastopen_rsk);
1823
1824         /* If socket is aborted during connect operation */
1825         tcp_free_fastopen_req(tp);
1826         tcp_saved_syn_free(tp);
1827
1828         sk_sockets_allocated_dec(sk);
1829         sock_release_memcg(sk);
1830 }
1831 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1832
1833 #ifdef CONFIG_PROC_FS
1834 /* Proc filesystem TCP sock list dumping. */
1835
1836 /*
1837  * Get the next listening socket following cur.  If cur is NULL, get the
1838  * first socket starting from the bucket given in st->bucket; when
1839  * st->bucket is zero, the very first socket in the hash table is returned.
1840  */
1841 static void *listening_get_next(struct seq_file *seq, void *cur)
1842 {
1844         struct hlist_nulls_node *node;
1845         struct sock *sk = cur;
1846         struct inet_listen_hashbucket *ilb;
1847         struct tcp_iter_state *st = seq->private;
1848         struct net *net = seq_file_net(seq);
1849
1850         if (!sk) {
1851                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1852                 spin_lock_bh(&ilb->lock);
1853                 sk = sk_nulls_head(&ilb->head);
1854                 st->offset = 0;
1855                 goto get_sk;
1856         }
1857         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1858         ++st->num;
1859         ++st->offset;
1860
1861         sk = sk_nulls_next(sk);
1862 get_sk:
1863         sk_nulls_for_each_from(sk, node) {
1864                 if (!net_eq(sock_net(sk), net))
1865                         continue;
1866                 if (sk->sk_family == st->family) {
1867                         cur = sk;
1868                         goto out;
1869                 }
1871         }
1872         spin_unlock_bh(&ilb->lock);
1873         st->offset = 0;
1874         if (++st->bucket < INET_LHTABLE_SIZE) {
1875                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1876                 spin_lock_bh(&ilb->lock);
1877                 sk = sk_nulls_head(&ilb->head);
1878                 goto get_sk;
1879         }
1880         cur = NULL;
1881 out:
1882         return cur;
1883 }
1884
1885 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1886 {
1887         struct tcp_iter_state *st = seq->private;
1888         void *rc;
1889
1890         st->bucket = 0;
1891         st->offset = 0;
1892         rc = listening_get_next(seq, NULL);
1893
1894         while (rc && *pos) {
1895                 rc = listening_get_next(seq, rc);
1896                 --*pos;
1897         }
1898         return rc;
1899 }
1900
1901 static inline bool empty_bucket(const struct tcp_iter_state *st)
1902 {
1903         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1904 }
1905
1906 /*
1907  * Get first established socket starting from bucket given in st->bucket.
1908  * If st->bucket is zero, the very first socket in the hash is returned.
1909  */
1910 static void *established_get_first(struct seq_file *seq)
1911 {
1912         struct tcp_iter_state *st = seq->private;
1913         struct net *net = seq_file_net(seq);
1914         void *rc = NULL;
1915
1916         st->offset = 0;
1917         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1918                 struct sock *sk;
1919                 struct hlist_nulls_node *node;
1920                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1921
1922                 /* Lockless fast path for the common case of empty buckets */
1923                 if (empty_bucket(st))
1924                         continue;
1925
1926                 spin_lock_bh(lock);
1927                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1928                         if (sk->sk_family != st->family ||
1929                             !net_eq(sock_net(sk), net)) {
1930                                 continue;
1931                         }
1932                         rc = sk;
1933                         goto out;
1934                 }
1935                 spin_unlock_bh(lock);
1936         }
1937 out:
1938         return rc;
1939 }
1940
1941 static void *established_get_next(struct seq_file *seq, void *cur)
1942 {
1943         struct sock *sk = cur;
1944         struct hlist_nulls_node *node;
1945         struct tcp_iter_state *st = seq->private;
1946         struct net *net = seq_file_net(seq);
1947
1948         ++st->num;
1949         ++st->offset;
1950
1951         sk = sk_nulls_next(sk);
1952
1953         sk_nulls_for_each_from(sk, node) {
1954                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1955                         return sk;
1956         }
1957
1958         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1959         ++st->bucket;
1960         return established_get_first(seq);
1961 }
1962
1963 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1964 {
1965         struct tcp_iter_state *st = seq->private;
1966         void *rc;
1967
1968         st->bucket = 0;
1969         rc = established_get_first(seq);
1970
1971         while (rc && pos) {
1972                 rc = established_get_next(seq, rc);
1973                 --pos;
1974         }
1975         return rc;
1976 }
1977
1978 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1979 {
1980         void *rc;
1981         struct tcp_iter_state *st = seq->private;
1982
1983         st->state = TCP_SEQ_STATE_LISTENING;
1984         rc        = listening_get_idx(seq, &pos);
1985
1986         if (!rc) {
1987                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1988                 rc        = established_get_idx(seq, pos);
1989         }
1990
1991         return rc;
1992 }
1993
1994 static void *tcp_seek_last_pos(struct seq_file *seq)
1995 {
1996         struct tcp_iter_state *st = seq->private;
1997         int offset = st->offset;
1998         int orig_num = st->num;
1999         void *rc = NULL;
2000
2001         switch (st->state) {
2002         case TCP_SEQ_STATE_LISTENING:
2003                 if (st->bucket >= INET_LHTABLE_SIZE)
2004                         break;
2005                 st->state = TCP_SEQ_STATE_LISTENING;
2006                 rc = listening_get_next(seq, NULL);
2007                 while (offset-- && rc)
2008                         rc = listening_get_next(seq, rc);
2009                 if (rc)
2010                         break;
2011                 st->bucket = 0;
2012                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2013                 /* Fallthrough */
2014         case TCP_SEQ_STATE_ESTABLISHED:
2015                 if (st->bucket > tcp_hashinfo.ehash_mask)
2016                         break;
2017                 rc = established_get_first(seq);
2018                 while (offset-- && rc)
2019                         rc = established_get_next(seq, rc);
2020         }
2021
2022         st->num = orig_num;
2023
2024         return rc;
2025 }
2026
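/* The seq_file iterator walks the listening hash first and then the
 * established hash.  st->bucket, st->offset and st->num track the current
 * position; st->last_pos lets a sequential read of /proc/net/tcp resume
 * roughly where the previous chunk stopped instead of rescanning from the
 * beginning of the table.
 */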
2027 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2028 {
2029         struct tcp_iter_state *st = seq->private;
2030         void *rc;
2031
2032         if (*pos && *pos == st->last_pos) {
2033                 rc = tcp_seek_last_pos(seq);
2034                 if (rc)
2035                         goto out;
2036         }
2037
2038         st->state = TCP_SEQ_STATE_LISTENING;
2039         st->num = 0;
2040         st->bucket = 0;
2041         st->offset = 0;
2042         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2043
2044 out:
2045         st->last_pos = *pos;
2046         return rc;
2047 }
2048
2049 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2050 {
2051         struct tcp_iter_state *st = seq->private;
2052         void *rc = NULL;
2053
2054         if (v == SEQ_START_TOKEN) {
2055                 rc = tcp_get_idx(seq, 0);
2056                 goto out;
2057         }
2058
2059         switch (st->state) {
2060         case TCP_SEQ_STATE_LISTENING:
2061                 rc = listening_get_next(seq, v);
2062                 if (!rc) {
2063                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2064                         st->bucket = 0;
2065                         st->offset = 0;
2066                         rc        = established_get_first(seq);
2067                 }
2068                 break;
2069         case TCP_SEQ_STATE_ESTABLISHED:
2070                 rc = established_get_next(seq, v);
2071                 break;
2072         }
2073 out:
2074         ++*pos;
2075         st->last_pos = *pos;
2076         return rc;
2077 }
2078
2079 static void tcp_seq_stop(struct seq_file *seq, void *v)
2080 {
2081         struct tcp_iter_state *st = seq->private;
2082
2083         switch (st->state) {
2084         case TCP_SEQ_STATE_LISTENING:
2085                 if (v != SEQ_START_TOKEN)
2086                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2087                 break;
2088         case TCP_SEQ_STATE_ESTABLISHED:
2089                 if (v)
2090                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2091                 break;
2092         }
2093 }
2094
2095 int tcp_seq_open(struct inode *inode, struct file *file)
2096 {
2097         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2098         struct tcp_iter_state *s;
2099         int err;
2100
2101         err = seq_open_net(inode, file, &afinfo->seq_ops,
2102                           sizeof(struct tcp_iter_state));
2103         if (err < 0)
2104                 return err;
2105
2106         s = ((struct seq_file *)file->private_data)->private;
2107         s->family               = afinfo->family;
2108         s->last_pos             = 0;
2109         return 0;
2110 }
2111 EXPORT_SYMBOL(tcp_seq_open);
2112
2113 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2114 {
2115         int rc = 0;
2116         struct proc_dir_entry *p;
2117
2118         afinfo->seq_ops.start           = tcp_seq_start;
2119         afinfo->seq_ops.next            = tcp_seq_next;
2120         afinfo->seq_ops.stop            = tcp_seq_stop;
2121
2122         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2123                              afinfo->seq_fops, afinfo);
2124         if (!p)
2125                 rc = -ENOMEM;
2126         return rc;
2127 }
2128 EXPORT_SYMBOL(tcp_proc_register);
2129
2130 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2131 {
2132         remove_proc_entry(afinfo->name, net->proc_net);
2133 }
2134 EXPORT_SYMBOL(tcp_proc_unregister);
2135
2136 static void get_openreq4(const struct request_sock *req,
2137                          struct seq_file *f, int i)
2138 {
2139         const struct inet_request_sock *ireq = inet_rsk(req);
2140         long delta = req->rsk_timer.expires - jiffies;
2141
2142         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2143                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2144                 i,
2145                 ireq->ir_loc_addr,
2146                 ireq->ir_num,
2147                 ireq->ir_rmt_addr,
2148                 ntohs(ireq->ir_rmt_port),
2149                 TCP_SYN_RECV,
2150                 0, 0, /* could print option size, but that is af dependent. */
2151                 1,    /* timers active (only the expire timer) */
2152                 jiffies_delta_to_clock_t(delta),
2153                 req->num_timeout,
2154                 from_kuid_munged(seq_user_ns(f),
2155                                  sock_i_uid(req->rsk_listener)),
2156                 0,  /* non standard timer */
2157                 0, /* open_requests have no inode */
2158                 0,
2159                 req);
2160 }
2161
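/* timer_active encodes which timer is pending in the /proc/net/tcp output:
 * 1 retransmit (including early retransmit / tail loss probe), 2 keepalive
 * (sk_timer), 3 TIME_WAIT (see get_timewait4_sock()), 4 zero-window probe,
 * 0 none.
 */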
2162 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2163 {
2164         int timer_active;
2165         unsigned long timer_expires;
2166         const struct tcp_sock *tp = tcp_sk(sk);
2167         const struct inet_connection_sock *icsk = inet_csk(sk);
2168         const struct inet_sock *inet = inet_sk(sk);
2169         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2170         __be32 dest = inet->inet_daddr;
2171         __be32 src = inet->inet_rcv_saddr;
2172         __u16 destp = ntohs(inet->inet_dport);
2173         __u16 srcp = ntohs(inet->inet_sport);
2174         int rx_queue;
2175         int state;
2176
2177         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2178             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2179             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2180                 timer_active    = 1;
2181                 timer_expires   = icsk->icsk_timeout;
2182         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2183                 timer_active    = 4;
2184                 timer_expires   = icsk->icsk_timeout;
2185         } else if (timer_pending(&sk->sk_timer)) {
2186                 timer_active    = 2;
2187                 timer_expires   = sk->sk_timer.expires;
2188         } else {
2189                 timer_active    = 0;
2190                 timer_expires = jiffies;
2191         }
2192
2193         state = sk_state_load(sk);
2194         if (state == TCP_LISTEN)
2195                 rx_queue = sk->sk_ack_backlog;
2196         else
2197                 /* Because we don't lock the socket,
2198                  * we might find a transient negative value.
2199                  */
2200                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2201
2202         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2203                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2204                 i, src, srcp, dest, destp, state,
2205                 tp->write_seq - tp->snd_una,
2206                 rx_queue,
2207                 timer_active,
2208                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2209                 icsk->icsk_retransmits,
2210                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2211                 icsk->icsk_probes_out,
2212                 sock_i_ino(sk),
2213                 atomic_read(&sk->sk_refcnt), sk,
2214                 jiffies_to_clock_t(icsk->icsk_rto),
2215                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2216                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2217                 tp->snd_cwnd,
2218                 state == TCP_LISTEN ?
2219                     fastopenq->max_qlen :
2220                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2221 }
2222
2223 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2224                                struct seq_file *f, int i)
2225 {
2226         long delta = tw->tw_timer.expires - jiffies;
2227         __be32 dest, src;
2228         __u16 destp, srcp;
2229
2230         dest  = tw->tw_daddr;
2231         src   = tw->tw_rcv_saddr;
2232         destp = ntohs(tw->tw_dport);
2233         srcp  = ntohs(tw->tw_sport);
2234
2235         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2236                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2237                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2238                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2239                 atomic_read(&tw->tw_refcnt), tw);
2240 }
2241
2242 #define TMPSZ 150
2243
2244 static int tcp4_seq_show(struct seq_file *seq, void *v)
2245 {
2246         struct tcp_iter_state *st;
2247         struct sock *sk = v;
2248
2249         seq_setwidth(seq, TMPSZ - 1);
2250         if (v == SEQ_START_TOKEN) {
2251                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2252                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2253                            "inode");
2254                 goto out;
2255         }
2256         st = seq->private;
2257
2258         if (sk->sk_state == TCP_TIME_WAIT)
2259                 get_timewait4_sock(v, seq, st->num);
2260         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2261                 get_openreq4(v, seq, st->num);
2262         else
2263                 get_tcp4_sock(v, seq, st->num);
2264 out:
2265         seq_pad(seq, '\n');
2266         return 0;
2267 }
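
/*
 * Illustrative user-space sketch (not part of this file's build): parsing a
 * few columns of the /proc/net/tcp lines emitted by tcp4_seq_show() above.
 * The address columns are the raw __be32 printed with %08X in host byte
 * order, so storing the parsed value back into an in_addr on the same
 * machine recovers the dotted form; the port columns were already converted
 * with ntohs().
 */
#if 0
#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/in.h>

int main(void)
{
	char line[512], addr[INET_ADDRSTRLEN];
	unsigned int sl, local, lport, remote, rport, state;
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	fgets(line, sizeof(line), f);		/* skip the header line      */
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "%u: %x:%x %x:%x %x",
			   &sl, &local, &lport, &remote, &rport, &state) != 6)
			continue;
		struct in_addr a = { .s_addr = local };
		inet_ntop(AF_INET, &a, addr, sizeof(addr));
		printf("local %s:%u  state %02X\n", addr, lport, state);
	}
	fclose(f);
	return 0;
}
#endif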
2268
2269 static const struct file_operations tcp_afinfo_seq_fops = {
2270         .owner   = THIS_MODULE,
2271         .open    = tcp_seq_open,
2272         .read    = seq_read,
2273         .llseek  = seq_lseek,
2274         .release = seq_release_net
2275 };
2276
2277 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2278         .name           = "tcp",
2279         .family         = AF_INET,
2280         .seq_fops       = &tcp_afinfo_seq_fops,
2281         .seq_ops        = {
2282                 .show           = tcp4_seq_show,
2283         },
2284 };
2285
2286 static int __net_init tcp4_proc_init_net(struct net *net)
2287 {
2288         return tcp_proc_register(net, &tcp4_seq_afinfo);
2289 }
2290
2291 static void __net_exit tcp4_proc_exit_net(struct net *net)
2292 {
2293         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2294 }
2295
2296 static struct pernet_operations tcp4_net_ops = {
2297         .init = tcp4_proc_init_net,
2298         .exit = tcp4_proc_exit_net,
2299 };
2300
2301 int __init tcp4_proc_init(void)
2302 {
2303         return register_pernet_subsys(&tcp4_net_ops);
2304 }
2305
2306 void tcp4_proc_exit(void)
2307 {
2308         unregister_pernet_subsys(&tcp4_net_ops);
2309 }
2310 #endif /* CONFIG_PROC_FS */
2311
2312 struct proto tcp_prot = {
2313         .name                   = "TCP",
2314         .owner                  = THIS_MODULE,
2315         .close                  = tcp_close,
2316         .connect                = tcp_v4_connect,
2317         .disconnect             = tcp_disconnect,
2318         .accept                 = inet_csk_accept,
2319         .ioctl                  = tcp_ioctl,
2320         .init                   = tcp_v4_init_sock,
2321         .destroy                = tcp_v4_destroy_sock,
2322         .shutdown               = tcp_shutdown,
2323         .setsockopt             = tcp_setsockopt,
2324         .getsockopt             = tcp_getsockopt,
2325         .recvmsg                = tcp_recvmsg,
2326         .sendmsg                = tcp_sendmsg,
2327         .sendpage               = tcp_sendpage,
2328         .backlog_rcv            = tcp_v4_do_rcv,
2329         .release_cb             = tcp_release_cb,
2330         .hash                   = inet_hash,
2331         .unhash                 = inet_unhash,
2332         .get_port               = inet_csk_get_port,
2333         .enter_memory_pressure  = tcp_enter_memory_pressure,
2334         .stream_memory_free     = tcp_stream_memory_free,
2335         .sockets_allocated      = &tcp_sockets_allocated,
2336         .orphan_count           = &tcp_orphan_count,
2337         .memory_allocated       = &tcp_memory_allocated,
2338         .memory_pressure        = &tcp_memory_pressure,
2339         .sysctl_mem             = sysctl_tcp_mem,
2340         .sysctl_wmem            = sysctl_tcp_wmem,
2341         .sysctl_rmem            = sysctl_tcp_rmem,
2342         .max_header             = MAX_TCP_HEADER,
2343         .obj_size               = sizeof(struct tcp_sock),
2344         .slab_flags             = SLAB_DESTROY_BY_RCU,
2345         .twsk_prot              = &tcp_timewait_sock_ops,
2346         .rsk_prot               = &tcp_request_sock_ops,
2347         .h.hashinfo             = &tcp_hashinfo,
2348         .no_autobind            = true,
2349 #ifdef CONFIG_COMPAT
2350         .compat_setsockopt      = compat_tcp_setsockopt,
2351         .compat_getsockopt      = compat_tcp_getsockopt,
2352 #endif
2353 #ifdef CONFIG_MEMCG_KMEM
2354         .init_cgroup            = tcp_init_cgroup,
2355         .destroy_cgroup         = tcp_destroy_cgroup,
2356         .proto_cgroup           = tcp_proto_cgroup,
2357 #endif
2358         .diag_destroy           = tcp_abort,
2359 };
2360 EXPORT_SYMBOL(tcp_prot);
2361
2362 static void __net_exit tcp_sk_exit(struct net *net)
2363 {
2364         int cpu;
2365
2366         for_each_possible_cpu(cpu)
2367                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2368         free_percpu(net->ipv4.tcp_sk);
2369 }
2370
2371 static int __net_init tcp_sk_init(struct net *net)
2372 {
2373         int res, cpu;
2374
2375         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2376         if (!net->ipv4.tcp_sk)
2377                 return -ENOMEM;
2378
2379         for_each_possible_cpu(cpu) {
2380                 struct sock *sk;
2381
2382                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2383                                            IPPROTO_TCP, net);
2384                 if (res)
2385                         goto fail;
2386                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2387         }
2388
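        /* Default ECN policy: tcp_ecn == 2 accepts ECN when the peer requests
         * it but does not request it on outgoing connections; with
         * tcp_ecn_fallback == 1 a connection that requested ECN but appears
         * to be blackholed is retried without it.
         */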
2389         net->ipv4.sysctl_tcp_ecn = 2;
2390         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2391
2392         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2393         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2394         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2395
2396         return 0;
2397 fail:
2398         tcp_sk_exit(net);
2399
2400         return res;
2401 }
2402
2403 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2404 {
2405         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2406 }
2407
2408 static struct pernet_operations __net_initdata tcp_sk_ops = {
2409        .init       = tcp_sk_init,
2410        .exit       = tcp_sk_exit,
2411        .exit_batch = tcp_sk_exit_batch,
2412 };
2413
2414 void __init tcp_v4_init(void)
2415 {
2416         inet_hashinfo_init(&tcp_hashinfo);
2417         if (register_pernet_subsys(&tcp_sk_ops))
2418                 panic("Failed to create the TCP control socket.\n");
2419 }