net/ipv4/tcp_ipv4.c (firefly-linux-kernel-4.4.55.git)
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol (TCP).
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
 *                                      a single port at the same time.
 */


#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>
#include <net/secure_seq.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);


#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
                                                   __be32 addr);
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, struct tcphdr *th);
#else
static inline
struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
        return NULL;
}
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
                                          ip_hdr(skb)->saddr,
                                          tcp_hdr(skb)->dest,
                                          tcp_hdr(skb)->source);
}
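
/*
 * Added note (commentary, not from the original file): the helper above
 * derives the initial sequence number from the connection 4-tuple.
 * secure_tcp_sequence_number() mixes the addresses and ports with a
 * boot-time random secret plus a clock component, so ISNs are hard for
 * an off-path attacker to guess while still advancing over time for a
 * given 4-tuple; see net/core/secure_seq.c for the construction.
 */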

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's one, only timestamp cache is
           held not per host, but per port pair and TW bucket is used as state
           holder.

           If TW bucket has been already destroyed we fall back to VJ's scheme
           and use initial timestamp retrieved from peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (twp == NULL || (sysctl_tcp_tw_reuse &&
                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                if (tp->write_seq == 0)
                        tp->write_seq = 1;
                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                sock_hold(sktw);
                return 1;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
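
/*
 * Added note (commentary, not from the original file): write_seq above
 * is advanced past tw_snd_nxt by 65535 + 2, i.e. beyond the largest
 * unscaled receive window, so the reincarnated connection starts outside
 * any sequence range the peer could still treat as belonging to the old
 * one; stray duplicates then remain distinguishable even without PAWS.
 */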

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __be16 orig_sport, orig_dport;
        __be32 daddr, nexthop;
        struct flowi4 *fl4;
        struct rtable *rt;
        int err;
        struct ip_options_rcu *inet_opt;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        inet_opt = rcu_dereference_protected(inet->inet_opt,
                                             sock_owned_by_user(sk));
        if (inet_opt && inet_opt->opt.srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet_opt->opt.faddr;
        }

        orig_sport = inet->inet_sport;
        orig_dport = usin->sin_port;
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                              IPPROTO_TCP,
                              orig_sport, orig_dport, sk, true);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                if (err == -ENETUNREACH)
                        IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
                return err;
        }

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet_opt || !inet_opt->opt.srr)
                daddr = fl4->daddr;

        if (!inet->inet_saddr)
                inet->inet_saddr = fl4->saddr;
        inet->inet_rcv_saddr = inet->inet_saddr;

        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                tp->write_seq              = 0;
        }

        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
                struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
                /*
                 * VJ's idea. We save the last timestamp seen from
                 * the destination in the peer table, when entering state
                 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
                 * when trying a new connection.
                 */
                if (peer) {
                        inet_peer_refcheck(peer);
                        if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
                                tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
                                tp->rx_opt.ts_recent = peer->tcp_ts;
                        }
                }
        }

        inet->inet_dport = usin->sin_port;
        inet->inet_daddr = daddr;

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and, without releasing the socket
         * lock, select a source port, enter ourselves into the hash tables
         * and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
                goto failure;

        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                               inet->inet_sport, inet->inet_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                rt = NULL;
                goto failure;
        }
        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->dst);

        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
                                                           inet->inet_daddr,
                                                           inet->inet_sport,
                                                           usin->sin_port);

        inet->inet_id = tp->write_seq ^ jiffies;

        err = tcp_connect(sk);
        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
        return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
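
/*
 * Added usage sketch (commentary, not from the original file): this
 * function is reached from an ordinary userspace connect() on an IPv4
 * TCP socket, via inet_stream_connect() and sk->sk_prot->connect.
 * A minimal caller, error handling omitted:
 *
 *        int fd = socket(AF_INET, SOCK_STREAM, 0);
 *        struct sockaddr_in sin = {
 *                .sin_family = AF_INET,
 *                .sin_port   = htons(80),
 *        };
 *
 *        inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *        connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 */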

/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
{
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);

        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
         * sent out by Linux are always < 576 bytes, so they should go
         * through unfragmented).
         */
        if (sk->sk_state == TCP_LISTEN)
                return;

        /* We don't check in the dst entry if pmtu discovery is forbidden
         * on this route. We just assume that no packet-too-big packets
         * are sent back when pmtu discovery is not active.
         * There is a small race when the user changes this flag in the
         * route, but I think that's acceptable.
         */
        if ((dst = __sk_dst_check(sk, 0)) == NULL)
                return;

        dst->ops->update_pmtu(dst, mtu);

        /* Something is about to be wrong... Remember the soft error
         * for the case that this connection is not able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
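
/*
 * Added note (commentary, not from the original file): the mtu argument
 * above is the next-hop MTU carried in the ICMP_FRAG_NEEDED message (the
 * "info" value in tcp_v4_err() below).  tcp_sync_mss() shrinks the cached
 * path MTU/MSS to fit, and tcp_simple_retransmit() resends the too-big
 * segments right away instead of waiting for the retransmit timer, which
 * is what makes this the "fast" PMTU discovery path.
 */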

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
        struct inet_connection_sock *icsk;
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(icmp_skb)->type;
        const int code = icmp_hdr(icmp_skb)->code;
        struct sock *sk;
        struct sk_buff *skb;
        __u32 seq;
        __u32 remaining;
        int err;
        struct net *net = dev_net(icmp_skb->dev);

        if (icmp_skb->len < (iph->ihl << 2) + 8) {
                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
                return;
        }

        sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
                        iph->saddr, th->source, inet_iif(icmp_skb));
        if (!sk) {
                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sock_owned_by_user(sk))
                NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
                goto out;
        }

        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
        seq = ntohl(th->seq);
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (!sock_owned_by_user(sk))
                                do_pmtu_discovery(sk, iph, info);
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                /* check if icmp_skb allows revert of backoff
                 * (see draft-zimmermann-tcp-lcd) */
                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
                        break;
                if (seq != tp->snd_una || !icsk->icsk_retransmits ||
                    !icsk->icsk_backoff)
                        break;

                if (sock_owned_by_user(sk))
                        break;

                icsk->icsk_backoff--;
                inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
                                         icsk->icsk_backoff;
                tcp_bound_rto(sk);

                skb = tcp_write_queue_head(sk);
                BUG_ON(!skb);

                remaining = icsk->icsk_rto - min(icsk->icsk_rto,
                                tcp_time_stamp - TCP_SKB_CB(skb)->when);

                if (remaining) {
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  remaining, TCP_RTO_MAX);
                } else {
                        /* RTO revert clocked out retransmission.
                         * Will retransmit now */
                        tcp_retransmit_timer(sk);
                }

                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
                struct request_sock *req, **prev;
        case TCP_LISTEN:
                if (sock_owned_by_user(sk))
                        goto out;

                req = inet_csk_search_req(sk, &prev, th->dest,
                                          iph->daddr, iph->saddr);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get
                   an established socket here.
                 */
                WARN_ON(req->sk);

                if (seq != tcp_rsk(req)->snt_isn) {
                        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen.
                               It can, e.g., if SYNs crossed.
                             */
                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows us to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and broken firewalls sit in every dark corner sending random
         * errors ordered by their masters, even these two messages finally
         * lose their original sense (even Linux sends invalid PORT_UNREACHs).
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}
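
/*
 * Added note (commentary, not from the original file): the backoff
 * "revert" in the ICMP_DEST_UNREACH branch above implements the idea of
 * draft-zimmermann-tcp-lcd (later published as RFC 6069): a net/host
 * unreachable error arriving while retransmissions are backed off hints
 * that the loss came from a connectivity disruption rather than
 * congestion, so one step of exponential backoff is undone and the
 * retransmit timer is re-armed with the remaining, shorter timeout.
 */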

static void __tcp_v4_send_check(struct sk_buff *skb,
                                __be32 saddr, __be32 daddr)
{
        struct tcphdr *th = tcp_hdr(skb);

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(skb->len, saddr, daddr,
                                         csum_partial(th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}
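
/*
 * Added note (commentary, not from the original file): in the
 * CHECKSUM_PARTIAL case above only the pseudo-header sum is stored in
 * th->check; csum_start/csum_offset tell the device (or the software
 * fallback) from where to checksum the rest and where to write the
 * final value, which is how TCP checksum offload is expressed.  The
 * else branch computes the complete checksum in software right away.
 */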

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
        struct inet_sock *inet = inet_sk(sk);

        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
        const struct iphdr *iph;
        struct tcphdr *th;

        if (!pskb_may_pull(skb, sizeof(*th)))
                return -EINVAL;

        iph = ip_hdr(skb);
        th = tcp_hdr(skb);

        th->check = 0;
        skb->ip_summed = CHECKSUM_PARTIAL;
        __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
        return 0;
}

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *                    for the reset.
 *      Answer: if a packet caused the RST, it is not for a socket
 *              existing in our system; and if it is matched to a socket,
 *              it is just a duplicate segment or a bug in the other side's
 *              TCP. So we build the reply based only on the parameters
 *              that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
        struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
#endif
        struct net *net;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        if (skb_rtable(skb)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
        key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->saddr) : NULL;
        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
                                     key, ip_hdr(skb)->saddr,
                                     ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;

        net = dev_net(skb_dst(skb)->dev);
        ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
                      &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
        TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
}
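
/*
 * Added note (commentary, not from the original file): the seq/ack
 * selection above follows the RFC 793 reset rules: if the offending
 * segment carried an ACK, the RST reuses that ACK value as its own
 * sequence number; otherwise the RST's sequence number stays zero and
 * it carries an ACK covering exactly the incoming segment, with SYN and
 * FIN each counting as one sequence unit.
 */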

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 ts, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags)
{
        struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct ip_reply_arg arg;
        struct net *net = dev_net(skb_dst(skb)->dev);

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (ts) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tcp_time_stamp);
                rep.opt[2] = htonl(ts);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        if (key) {
                int offset = (ts) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.flags = reply_flags;
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (oif)
                arg.bound_dev_if = oif;

        ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
                      &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}
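
/*
 * Added note (commentary, not from the original file): the option words
 * built above use the classic alignment recipe: two TCPOPT_NOP bytes pad
 * each option so that it starts on a 32-bit boundary, e.g. the timestamp
 * block is NOP, NOP, TIMESTAMP (kind 8, length 10), TSval, TSecr, which
 * is exactly TCPOLEN_TSTAMP_ALIGNED (12) bytes.  The MD5 option is padded
 * the same way, so doff can simply grow by the aligned lengths.
 */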

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcptw->tw_ts_recent,
                        tw->tw_bound_dev_if,
                        tcp_twsk_md5_key(tcptw),
                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
                        );

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
{
        tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
                        tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
                        req->ts_recent,
                        0,
                        tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
                              struct request_sock *req,
                              struct request_values *rvp)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct flowi4 fl4;
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
                return -1;

        skb = tcp_make_synack(sk, dst, req, rvp);

        if (skb) {
                __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);

                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
                                            ireq->rmt_addr,
                                            ireq->opt);
                err = net_xmit_eval(err);
        }

        dst_release(dst);
        return err;
}

static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
                              struct request_values *rvp)
{
        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
        return tcp_v4_send_synack(sk, NULL, req, rvp);
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(inet_rsk(req)->opt);
}

static void syn_flood_warning(const struct sk_buff *skb)
{
        const char *msg;

#ifdef CONFIG_SYN_COOKIES
        if (sysctl_tcp_syncookies)
                msg = "Sending cookies";
        else
#endif
                msg = "Dropping request";

        pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
                                ntohs(tcp_hdr(skb)->dest), msg);
}

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
                                                  struct sk_buff *skb)
{
        const struct ip_options *opt = &(IPCB(skb)->opt);
        struct ip_options_rcu *dopt = NULL;

        if (opt && opt->optlen) {
                int opt_size = sizeof(*dopt) + opt->optlen;

                dopt = kmalloc(opt_size, GFP_ATOMIC);
                if (dopt) {
                        if (ip_options_echo(&dopt->opt, skb)) {
                                kfree(dopt);
                                dopt = NULL;
                        }
                }
        }
        return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
static struct tcp_md5sig_key *
                        tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int i;

        if (!tp->md5sig_info || !tp->md5sig_info->entries4)
                return NULL;
        for (i = 0; i < tp->md5sig_info->entries4; i++) {
                if (tp->md5sig_info->keys4[i].addr == addr)
                        return &tp->md5sig_info->keys4[i].base;
        }
        return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
                                         struct sock *addr_sk)
{
        return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
                                                      struct request_sock *req)
{
        return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}

/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
                      u8 *newkey, u8 newkeylen)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp4_md5sig_key *keys;

        key = tcp_v4_md5_do_lookup(sk, addr);
        if (key) {
                /* Pre-existing entry - just update that one. */
                kfree(key->key);
                key->key = newkey;
                key->keylen = newkeylen;
        } else {
                struct tcp_md5sig_info *md5sig;

                if (!tp->md5sig_info) {
                        tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
                                                  GFP_ATOMIC);
                        if (!tp->md5sig_info) {
                                kfree(newkey);
                                return -ENOMEM;
                        }
                        sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                }

                md5sig = tp->md5sig_info;
                if (md5sig->entries4 == 0 &&
                    tcp_alloc_md5sig_pool(sk) == NULL) {
                        kfree(newkey);
                        return -ENOMEM;
                }

                if (md5sig->alloced4 == md5sig->entries4) {
                        keys = kmalloc((sizeof(*keys) *
                                        (md5sig->entries4 + 1)), GFP_ATOMIC);
                        if (!keys) {
                                kfree(newkey);
                                if (md5sig->entries4 == 0)
                                        tcp_free_md5sig_pool();
                                return -ENOMEM;
                        }

                        if (md5sig->entries4)
                                memcpy(keys, md5sig->keys4,
                                       sizeof(*keys) * md5sig->entries4);

                        /* Free old key list, and reference new one */
                        kfree(md5sig->keys4);
                        md5sig->keys4 = keys;
                        md5sig->alloced4++;
                }
                md5sig->entries4++;
                md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
                md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
                md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
        }
        return 0;
}
EXPORT_SYMBOL(tcp_v4_md5_do_add);

static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
                               u8 *newkey, u8 newkeylen)
{
        return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
                                 newkey, newkeylen);
}

int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int i;

        for (i = 0; i < tp->md5sig_info->entries4; i++) {
                if (tp->md5sig_info->keys4[i].addr == addr) {
                        /* Free the key */
                        kfree(tp->md5sig_info->keys4[i].base.key);
                        tp->md5sig_info->entries4--;

                        if (tp->md5sig_info->entries4 == 0) {
                                kfree(tp->md5sig_info->keys4);
                                tp->md5sig_info->keys4 = NULL;
                                tp->md5sig_info->alloced4 = 0;
                                tcp_free_md5sig_pool();
                        } else if (tp->md5sig_info->entries4 != i) {
                                /* Need to do some manipulation */
                                memmove(&tp->md5sig_info->keys4[i],
                                        &tp->md5sig_info->keys4[i+1],
                                        (tp->md5sig_info->entries4 - i) *
                                         sizeof(struct tcp4_md5sig_key));
                        }
                        return 0;
                }
        }
        return -ENOENT;
}
EXPORT_SYMBOL(tcp_v4_md5_do_del);

static void tcp_v4_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Free each key, then the set of keys,
         * the crypto element, and then decrement our
         * hold on the last resort crypto.
         */
        if (tp->md5sig_info->entries4) {
                int i;
                for (i = 0; i < tp->md5sig_info->entries4; i++)
                        kfree(tp->md5sig_info->keys4[i].base.key);
                tp->md5sig_info->entries4 = 0;
                tcp_free_md5sig_pool();
        }
        if (tp->md5sig_info->keys4) {
                kfree(tp->md5sig_info->keys4);
                tp->md5sig_info->keys4 = NULL;
                tp->md5sig_info->alloced4 = 0;
        }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
                                 int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
        u8 *newkey;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
                if (!tcp_sk(sk)->md5sig_info)
                        return -ENOENT;
                return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
        }

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        if (!tcp_sk(sk)->md5sig_info) {
                struct tcp_sock *tp = tcp_sk(sk);
                struct tcp_md5sig_info *p;

                p = kzalloc(sizeof(*p), sk->sk_allocation);
                if (!p)
                        return -ENOMEM; /* allocation failure, not an invalid argument */

                tp->md5sig_info = p;
                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
        }

        newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
        if (!newkey)
                return -ENOMEM;
        return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
                                 newkey, cmd.tcpm_keylen);
}
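
/*
 * Added usage sketch (commentary, not from the original file): userspace
 * reaches the parser above through the TCP_MD5SIG socket option, e.g.:
 *
 *        struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *        struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *        sin->sin_family = AF_INET;
 *        inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *        memcpy(md5.tcpm_key, "secret", 6);
 *        setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing a zero tcpm_keylen deletes the key for that peer address, as
 * handled by the tcp_v4_md5_do_del() branch above.
 */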

static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
                                        __be32 daddr, __be32 saddr, int nbytes)
{
        struct tcp4_pseudohdr *bp;
        struct scatterlist sg;

        bp = &hp->md5_blk.ip4;

        /*
         * 1. the TCP pseudo-header (in the order: source IP address,
         * destination IP address, zero-padded protocol number, and
         * segment length)
         */
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = IPPROTO_TCP;
        bp->len = cpu_to_be16(nbytes);

        sg_init_one(&sg, bp, sizeof(*bp));
        return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, struct tcphdr *th)
{
        struct tcp_md5sig_pool *hp;
        struct hash_desc *desc;

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        desc = &hp->md5_desc;

        if (crypto_hash_init(desc))
                goto clear_hash;
        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_header(hp, th))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        if (crypto_hash_final(desc, md5_hash))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
                        struct sock *sk, struct request_sock *req,
                        struct sk_buff *skb)
{
        struct tcp_md5sig_pool *hp;
        struct hash_desc *desc;
        struct tcphdr *th = tcp_hdr(skb);
        __be32 saddr, daddr;

        if (sk) {
                saddr = inet_sk(sk)->inet_saddr;
                daddr = inet_sk(sk)->inet_daddr;
        } else if (req) {
                saddr = inet_rsk(req)->loc_addr;
                daddr = inet_rsk(req)->rmt_addr;
        } else {
                const struct iphdr *iph = ip_hdr(skb);
                saddr = iph->saddr;
                daddr = iph->daddr;
        }

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        desc = &hp->md5_desc;

        if (crypto_hash_init(desc))
                goto clear_hash;

        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
                goto clear_hash;
        if (tcp_md5_hash_header(hp, th))
                goto clear_hash;
        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        if (crypto_hash_final(desc, md5_hash))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
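
/*
 * Added note (commentary, not from the original file): the hashing order
 * above matches RFC 2385: the TCP pseudo-header first, then the TCP
 * header with its checksum field treated as zero (tcp_md5_hash_header()
 * takes care of that), then the segment payload, and finally the
 * connection key itself.  On any failure the digest is cleared, so a
 * partial hash can never be mistaken for a valid signature.
 */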

static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
        /*
         * This gets called for each TCP segment that arrives
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        const struct iphdr *iph = ip_hdr(skb);
        struct tcphdr *th = tcp_hdr(skb);
        int genhash;
        unsigned char newhash[16];

        hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
        hash_location = tcp_parse_md5sig_option(th);

        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return 0;

        if (hash_expected && !hash_location) {
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
                return 1;
        }

        if (!hash_expected && hash_location) {
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
                return 1;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_md5_hash_skb(newhash,
                                      hash_expected,
                                      NULL, NULL, skb);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                if (net_ratelimit()) {
                        printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
                               &iph->saddr, ntohs(th->source),
                               &iph->daddr, ntohs(th->dest),
                               genhash ? " tcp_v4_calc_md5_hash failed" : "");
                }
                return 1;
        }
        return 0;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_v4_rtx_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
        .syn_ack_timeout =      tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
};
#endif

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_extend_values tmp_ext;
        struct tcp_options_received tmp_opt;
        u8 *hash_location;
        struct request_sock *req;
        struct inet_request_sock *ireq;
        struct tcp_sock *tp = tcp_sk(sk);
        struct dst_entry *dst = NULL;
        __be32 saddr = ip_hdr(skb)->saddr;
        __be32 daddr = ip_hdr(skb)->daddr;
        __u32 isn = TCP_SKB_CB(skb)->when;
#ifdef CONFIG_SYN_COOKIES
        int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

        /* Never answer SYNs sent to broadcast or multicast addresses */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        /* TW buckets are converted to open requests without
         * limitations: they conserve resources and the peer is
         * evidently a real one.
         */
        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
                if (net_ratelimit())
                        syn_flood_warning(skb);
#ifdef CONFIG_SYN_COOKIES
                if (sysctl_tcp_syncookies) {
                        want_cookie = 1;
                } else
#endif
                goto drop;
        }

        /* The accept backlog is full. If we have already queued enough
         * warm entries in the syn queue, drop the request. This is better
         * than clogging the syn queue with openreqs whose timeouts grow
         * exponentially.
         */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
                goto drop;

        req = inet_reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
                goto drop;

#ifdef CONFIG_TCP_MD5SIG
        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
        tmp_opt.user_mss  = tp->rx_opt.user_mss;
        tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

        if (tmp_opt.cookie_plus > 0 &&
            tmp_opt.saw_tstamp &&
            !tp->rx_opt.cookie_out_never &&
            (sysctl_tcp_cookie_size > 0 ||
             (tp->cookie_values != NULL &&
              tp->cookie_values->cookie_desired > 0))) {
                u8 *c;
                u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
                int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

                if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
                        goto drop_and_release;

                /* Secret recipe starts with IP addresses */
                *mess++ ^= (__force u32)daddr;
                *mess++ ^= (__force u32)saddr;

                /* plus variable length Initiator Cookie */
                c = (u8 *)mess;
                while (l-- > 0)
                        *c++ ^= *hash_location++;

#ifdef CONFIG_SYN_COOKIES
                want_cookie = 0;        /* not our kind of cookie */
#endif
                tmp_ext.cookie_out_never = 0; /* false */
                tmp_ext.cookie_plus = tmp_opt.cookie_plus;
        } else if (!tp->rx_opt.cookie_in_always) {
                /* redundant indications, but ensure initialization. */
                tmp_ext.cookie_out_never = 1; /* true */
                tmp_ext.cookie_plus = 0;
        } else {
                goto drop_and_release;
        }
        tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

        if (want_cookie && !tmp_opt.saw_tstamp)
                tcp_clear_options(&tmp_opt);

        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
        tcp_openreq_init(req, &tmp_opt, skb);

        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->no_srccheck = inet_sk(sk)->transparent;
        ireq->opt = tcp_v4_save_options(sk, skb);

        if (security_inet_conn_request(sk, skb, req))
                goto drop_and_free;

        if (!want_cookie || tmp_opt.tstamp_ok)
                TCP_ECN_create_request(req, tcp_hdr(skb));

        if (want_cookie) {
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
                req->cookie_ts = tmp_opt.tstamp_ok;
        } else if (!isn) {
                struct inet_peer *peer = NULL;
                struct flowi4 fl4;

                /* VJ's idea. We save last timestamp seen
                 * from the destination in peer table, when entering
                 * state TIME-WAIT, and check against it before
                 * accepting new connection request.
                 *
                 * If "isn" is not zero, this request hit alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                if (tmp_opt.saw_tstamp &&
                    tcp_death_row.sysctl_tw_recycle &&
                    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
                    fl4.daddr == saddr &&
                    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
                        inet_peer_refcheck(peer);
                        if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
                                goto drop_and_release;
                        }
                }
                /* Kill the following clause, if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
                         (!peer || !peer->tcp_ts_stamp) &&
                         (!dst || !dst_metric(dst, RTAX_RTT))) {
                        /* Without syncookies the last quarter of the
                         * backlog is filled with destinations proven
                         * to be alive. It means that we continue to
                         * communicate with destinations already
                         * remembered at the moment the synflood began.
                         */
                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
                                       &saddr, ntohs(tcp_hdr(skb)->source));
                        goto drop_and_release;
                }

                isn = tcp_v4_init_sequence(skb);
        }
        tcp_rsk(req)->snt_isn = isn;

        if (tcp_v4_send_synack(sk, dst, req,
                               (struct request_values *)&tmp_ext) ||
            want_cookie)
                goto drop_and_free;

        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        return 0;

drop_and_release:
        dst_release(dst);
drop_and_free:
        reqsk_free(req);
drop:
        return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
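
/*
 * Added note (commentary, not from the original file): on the syncookie
 * path above the request_sock is deliberately freed right after the
 * SYN-ACK goes out (the "|| want_cookie" falls through to drop_and_free),
 * so no per-connection state survives a flood.  What is needed to rebuild
 * the connection is encoded into the ISN by cookie_v4_init_sequence() and
 * recovered when the final ACK of the handshake returns.
 */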
1408
1409
1410 /*
1411  * The three way handshake has completed - we got a valid synack -
1412  * now create the new socket.
1413  */
1414 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1415                                   struct request_sock *req,
1416                                   struct dst_entry *dst)
1417 {
1418         struct inet_request_sock *ireq;
1419         struct inet_sock *newinet;
1420         struct tcp_sock *newtp;
1421         struct sock *newsk;
1422 #ifdef CONFIG_TCP_MD5SIG
1423         struct tcp_md5sig_key *key;
1424 #endif
1425         struct ip_options_rcu *inet_opt;
1426
1427         if (sk_acceptq_is_full(sk))
1428                 goto exit_overflow;
1429
1430         newsk = tcp_create_openreq_child(sk, req, skb);
1431         if (!newsk)
1432                 goto exit_nonewsk;
1433
1434         newsk->sk_gso_type = SKB_GSO_TCPV4;
1435
1436         newtp                 = tcp_sk(newsk);
1437         newinet               = inet_sk(newsk);
1438         ireq                  = inet_rsk(req);
1439         newinet->inet_daddr   = ireq->rmt_addr;
1440         newinet->inet_rcv_saddr = ireq->loc_addr;
1441         newinet->inet_saddr           = ireq->loc_addr;
1442         inet_opt              = ireq->opt;
1443         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1444         ireq->opt             = NULL;
1445         newinet->mc_index     = inet_iif(skb);
1446         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1447         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1448         if (inet_opt)
1449                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
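        /* Editor's note: seed the IP ID counter from the initial send
         * sequence number mixed with jiffies, so consecutive connections
         * to the same peer don't reuse the same IP ID sequence. */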
1450         newinet->inet_id = newtp->write_seq ^ jiffies;
1451
1452         if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1453                 goto put_and_exit;
1454
1455         sk_setup_caps(newsk, dst);
1456
1457         tcp_mtup_init(newsk);
1458         tcp_sync_mss(newsk, dst_mtu(dst));
1459         newtp->advmss = dst_metric_advmss(dst);
1460         if (tcp_sk(sk)->rx_opt.user_mss &&
1461             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1462                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1463
1464         tcp_initialize_rcv_mss(newsk);
1465
1466 #ifdef CONFIG_TCP_MD5SIG
1467         /* Copy over the MD5 key from the original socket */
1468         key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1469         if (key != NULL) {
1470                 /*
1471                  * The parent socket uses MD5, so create a matching
1472                  * key on the newsk structure. If we fail to get
1473                  * memory, the key simply is not copied
1474                  * across. Shucks.
1475                  */
1476                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1477                 if (newkey != NULL)
1478                         tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1479                                           newkey, key->keylen);
1480                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1481         }
1482 #endif
1483
1484         if (__inet_inherit_port(sk, newsk) < 0)
1485                 goto put_and_exit;
1486         __inet_hash_nolisten(newsk, NULL);
1487
1488         return newsk;
1489
1490 exit_overflow:
1491         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1492 exit_nonewsk:
1493         dst_release(dst);
1494 exit:
1495         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1496         return NULL;
1497 put_and_exit:
1498         sock_put(newsk);
1499         goto exit;
1500 }
1501 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1502
1503 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1504 {
1505         struct tcphdr *th = tcp_hdr(skb);
1506         const struct iphdr *iph = ip_hdr(skb);
1507         struct sock *nsk;
1508         struct request_sock **prev;
1509         /* Find possible connection requests. */
1510         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1511                                                        iph->saddr, iph->daddr);
1512         if (req)
1513                 return tcp_check_req(sk, skb, req, prev);
1514
1515         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1516                         th->source, iph->daddr, th->dest, inet_iif(skb));
1517
1518         if (nsk) {
1519                 if (nsk->sk_state != TCP_TIME_WAIT) {
1520                         bh_lock_sock(nsk);
1521                         return nsk;
1522                 }
1523                 inet_twsk_put(inet_twsk(nsk));
1524                 return NULL;
1525         }
1526
1527 #ifdef CONFIG_SYN_COOKIES
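        /* Editor's note: no pending request and no established socket was
         * found; if this segment is not a SYN it may be the final ACK of
         * a syncookie handshake, so try to reconstruct the connection
         * from the cookie. */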
1528         if (!th->syn)
1529                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1530 #endif
1531         return sk;
1532 }
1533
1534 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1535 {
1536         const struct iphdr *iph = ip_hdr(skb);
1537
1538         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1539                 if (!tcp_v4_check(skb->len, iph->saddr,
1540                                   iph->daddr, skb->csum)) {
1541                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1542                         return 0;
1543                 }
1544         }
1545
1546         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1547                                        skb->len, IPPROTO_TCP, 0);
1548
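        /* Editor's note: for short packets it is cheaper to verify the
         * checksum immediately; longer packets are checked later, e.g.
         * while their data is copied to user space. */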
1549         if (skb->len <= 76) {
1550                 return __skb_checksum_complete(skb);
1551         }
1552         return 0;
1553 }
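
/*
 * Editor's sketch (hypothetical, not in the original file): what the
 * CHECKSUM_COMPLETE branch of tcp_v4_checksum_init() verifies. The
 * device-computed sum over the TCP segment, combined with the IPv4
 * pseudo-header, must fold to zero for a valid packet.
 */
static inline bool tcp_v4_csum_valid_sketch(__be32 saddr, __be32 daddr,
                                            unsigned int len, __wsum csum)
{
        /* csum_tcpudp_magic() adds the pseudo-header and folds the
         * result to 16 bits; a correct checksum yields 0. */
        return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, csum) == 0;
}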
1554
1555
1556 /* The socket must have its spinlock held when we get
1557  * here.
1558  *
1559  * We have a potential double-lock case here, so even when
1560  * doing backlog processing we use the BH locking scheme.
1561  * This is because we cannot sleep with the original spinlock
1562  * held.
1563  */
1564 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1565 {
1566         struct sock *rsk;
1567 #ifdef CONFIG_TCP_MD5SIG
1568         /*
1569          * We really want to reject the packet as early as possible
1570          * if:
1571          *  o We're expecting an MD5-signed packet but there is no MD5 TCP option
1572          *  o There is an MD5 option and we're not expecting one
1573          */
1574         if (tcp_v4_inbound_md5_hash(sk, skb))
1575                 goto discard;
1576 #endif
1577
1578         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1579                 sock_rps_save_rxhash(sk, skb->rxhash);
1580                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1581                         rsk = sk;
1582                         goto reset;
1583                 }
1584                 return 0;
1585         }
1586
1587         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1588                 goto csum_err;
1589
1590         if (sk->sk_state == TCP_LISTEN) {
1591                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1592                 if (!nsk)
1593                         goto discard;
1594
1595                 if (nsk != sk) {
1596                         sock_rps_save_rxhash(nsk, skb->rxhash);
1597                         if (tcp_child_process(sk, nsk, skb)) {
1598                                 rsk = nsk;
1599                                 goto reset;
1600                         }
1601                         return 0;
1602                 }
1603         } else
1604                 sock_rps_save_rxhash(sk, skb->rxhash);
1605
1606         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1607                 rsk = sk;
1608                 goto reset;
1609         }
1610         return 0;
1611
1612 reset:
1613         tcp_v4_send_reset(rsk, skb);
1614 discard:
1615         kfree_skb(skb);
1616         /* Be careful here. If this function gets more complicated and
1617          * gcc suffers from register pressure on the x86, sk (in %ebx)
1618          * might be destroyed here. This current version compiles correctly,
1619          * but you have been warned.
1620          */
1621         return 0;
1622
1623 csum_err:
1624         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1625         goto discard;
1626 }
1627 EXPORT_SYMBOL(tcp_v4_do_rcv);
1628
1629 /*
1630  *      From tcp_input.c
1631  */
1632
1633 int tcp_v4_rcv(struct sk_buff *skb)
1634 {
1635         const struct iphdr *iph;
1636         struct tcphdr *th;
1637         struct sock *sk;
1638         int ret;
1639         struct net *net = dev_net(skb->dev);
1640
1641         if (skb->pkt_type != PACKET_HOST)
1642                 goto discard_it;
1643
1644         /* Count it even if it's bad */
1645         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1646
1647         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1648                 goto discard_it;
1649
1650         th = tcp_hdr(skb);
1651
1652         if (th->doff < sizeof(struct tcphdr) / 4)
1653                 goto bad_packet;
1654         if (!pskb_may_pull(skb, th->doff * 4))
1655                 goto discard_it;
1656
1657         /* An explanation is required here, I think.
1658          * Packet length and doff are validated by header prediction,
1659          * provided the case of th->doff == 0 has been eliminated above.
1660          * So, we defer the checks. */
1661         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1662                 goto bad_packet;
1663
1664         th = tcp_hdr(skb);
1665         iph = ip_hdr(skb);
1666         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1667         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1668                                     skb->len - th->doff * 4);
1669         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1670         TCP_SKB_CB(skb)->when    = 0;
1671         TCP_SKB_CB(skb)->flags   = iph->tos;
1672         TCP_SKB_CB(skb)->sacked  = 0;
1673
1674         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1675         if (!sk)
1676                 goto no_tcp_socket;
1677
1678 process:
1679         if (sk->sk_state == TCP_TIME_WAIT)
1680                 goto do_time_wait;
1681
1682         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1683                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1684                 goto discard_and_relse;
1685         }
1686
1687         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1688                 goto discard_and_relse;
1689         nf_reset(skb);
1690
1691         if (sk_filter(sk, skb))
1692                 goto discard_and_relse;
1693
1694         skb->dev = NULL;
1695
1696         bh_lock_sock_nested(sk);
1697         ret = 0;
1698         if (!sock_owned_by_user(sk)) {
1699 #ifdef CONFIG_NET_DMA
1700                 struct tcp_sock *tp = tcp_sk(sk);
1701                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1702                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1703                 if (tp->ucopy.dma_chan)
1704                         ret = tcp_v4_do_rcv(sk, skb);
1705                 else
1706 #endif
1707                 {
1708                         if (!tcp_prequeue(sk, skb))
1709                                 ret = tcp_v4_do_rcv(sk, skb);
1710                 }
1711         } else if (unlikely(sk_add_backlog(sk, skb))) {
1712                 bh_unlock_sock(sk);
1713                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1714                 goto discard_and_relse;
1715         }
1716         bh_unlock_sock(sk);
1717
1718         sock_put(sk);
1719
1720         return ret;
1721
1722 no_tcp_socket:
1723         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1724                 goto discard_it;
1725
1726         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1727 bad_packet:
1728                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1729         } else {
1730                 tcp_v4_send_reset(NULL, skb);
1731         }
1732
1733 discard_it:
1734         /* Discard frame. */
1735         kfree_skb(skb);
1736         return 0;
1737
1738 discard_and_relse:
1739         sock_put(sk);
1740         goto discard_it;
1741
1742 do_time_wait:
1743         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1744                 inet_twsk_put(inet_twsk(sk));
1745                 goto discard_it;
1746         }
1747
1748         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1749                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1750                 inet_twsk_put(inet_twsk(sk));
1751                 goto discard_it;
1752         }
1753         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1754         case TCP_TW_SYN: {
1755                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1756                                                         &tcp_hashinfo,
1757                                                         iph->daddr, th->dest,
1758                                                         inet_iif(skb));
1759                 if (sk2) {
1760                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1761                         inet_twsk_put(inet_twsk(sk));
1762                         sk = sk2;
1763                         goto process;
1764                 }
1765                 /* Fall through to ACK */
1766         }
1767         case TCP_TW_ACK:
1768                 tcp_v4_timewait_ack(sk, skb);
1769                 break;
1770         case TCP_TW_RST:
1771                 goto no_tcp_socket;
1772         case TCP_TW_SUCCESS:;
1773         }
1774         goto discard_it;
1775 }
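
/*
 * Editor's sketch (illustrative only): the end_seq computed in
 * tcp_v4_rcv() above. SYN and FIN each consume one unit of sequence
 * space on top of the payload bytes.
 */
static inline u32 tcp_end_seq_sketch(u32 seq, int syn, int fin,
                                     unsigned int payload_len)
{
        return seq + syn + fin + payload_len;
}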
1776
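/*
 * Editor's note: return the inet_peer entry for this connection. If the
 * cached route still matches the destination, reuse (and if necessary
 * bind) the peer hanging off the route without taking a reference;
 * otherwise look the peer up directly, in which case the caller must
 * release it (signalled via *release_it).
 */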
1777 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1778 {
1779         struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1780         struct inet_sock *inet = inet_sk(sk);
1781         struct inet_peer *peer;
1782
1783         if (!rt ||
1784             inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1785                 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1786                 *release_it = true;
1787         } else {
1788                 if (!rt->peer)
1789                         rt_bind_peer(rt, inet->inet_daddr, 1);
1790                 peer = rt->peer;
1791                 *release_it = false;
1792         }
1793
1794         return peer;
1795 }
1796 EXPORT_SYMBOL(tcp_v4_get_peer);
1797
1798 void *tcp_v4_tw_get_peer(struct sock *sk)
1799 {
1800         struct inet_timewait_sock *tw = inet_twsk(sk);
1801
1802         return inet_getpeer_v4(tw->tw_daddr, 1);
1803 }
1804 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1805
1806 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1807         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1808         .twsk_unique    = tcp_twsk_unique,
1809         .twsk_destructor = tcp_twsk_destructor,
1810         .twsk_getpeer   = tcp_v4_tw_get_peer,
1811 };
1812
1813 const struct inet_connection_sock_af_ops ipv4_specific = {
1814         .queue_xmit        = ip_queue_xmit,
1815         .send_check        = tcp_v4_send_check,
1816         .rebuild_header    = inet_sk_rebuild_header,
1817         .conn_request      = tcp_v4_conn_request,
1818         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1819         .get_peer          = tcp_v4_get_peer,
1820         .net_header_len    = sizeof(struct iphdr),
1821         .setsockopt        = ip_setsockopt,
1822         .getsockopt        = ip_getsockopt,
1823         .addr2sockaddr     = inet_csk_addr2sockaddr,
1824         .sockaddr_len      = sizeof(struct sockaddr_in),
1825         .bind_conflict     = inet_csk_bind_conflict,
1826 #ifdef CONFIG_COMPAT
1827         .compat_setsockopt = compat_ip_setsockopt,
1828         .compat_getsockopt = compat_ip_getsockopt,
1829 #endif
1830 };
1831 EXPORT_SYMBOL(ipv4_specific);
1832
1833 #ifdef CONFIG_TCP_MD5SIG
1834 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1835         .md5_lookup             = tcp_v4_md5_lookup,
1836         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1837         .md5_add                = tcp_v4_md5_add_func,
1838         .md5_parse              = tcp_v4_parse_md5_keys,
1839 };
1840 #endif
1841
1842 /* NOTE: Many fields are already set to zero explicitly by the call to
1843  *       sk_alloc(), so they need not be initialized here.
1844  */
1845 static int tcp_v4_init_sock(struct sock *sk)
1846 {
1847         struct inet_connection_sock *icsk = inet_csk(sk);
1848         struct tcp_sock *tp = tcp_sk(sk);
1849
1850         skb_queue_head_init(&tp->out_of_order_queue);
1851         tcp_init_xmit_timers(sk);
1852         tcp_prequeue_init(tp);
1853
1854         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1855         tp->mdev = TCP_TIMEOUT_INIT;
1856
1857         /* So many TCP implementations out there (incorrectly) count the
1858          * initial SYN frame in their delayed-ACK and congestion control
1859          * algorithms that we must have the following bandaid to talk
1860          * efficiently to them.  -DaveM
1861          */
1862         tp->snd_cwnd = 2;
1863
1864         /* See draft-stevens-tcpca-spec-01 for discussion of the
1865          * initialization of these values.
1866          */
1867         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1868         tp->snd_cwnd_clamp = ~0;
1869         tp->mss_cache = TCP_MSS_DEFAULT;
1870
1871         tp->reordering = sysctl_tcp_reordering;
1872         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1873
1874         sk->sk_state = TCP_CLOSE;
1875
1876         sk->sk_write_space = sk_stream_write_space;
1877         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1878
1879         icsk->icsk_af_ops = &ipv4_specific;
1880         icsk->icsk_sync_mss = tcp_sync_mss;
1881 #ifdef CONFIG_TCP_MD5SIG
1882         tp->af_specific = &tcp_sock_ipv4_specific;
1883 #endif
1884
1885         /* TCP Cookie Transactions */
1886         if (sysctl_tcp_cookie_size > 0) {
1887                 /* Default, cookies without s_data_payload. */
1888                 tp->cookie_values =
1889                         kzalloc(sizeof(*tp->cookie_values),
1890                                 sk->sk_allocation);
1891                 if (tp->cookie_values != NULL)
1892                         kref_init(&tp->cookie_values->kref);
1893         }
1894         /* Presumed zeroed, in order of appearance:
1895          *      cookie_in_always, cookie_out_never,
1896          *      s_data_constant, s_data_in, s_data_out
1897          */
1898         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1899         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1900
1901         local_bh_disable();
1902         percpu_counter_inc(&tcp_sockets_allocated);
1903         local_bh_enable();
1904
1905         return 0;
1906 }
1907
1908 void tcp_v4_destroy_sock(struct sock *sk)
1909 {
1910         struct tcp_sock *tp = tcp_sk(sk);
1911
1912         tcp_clear_xmit_timers(sk);
1913
1914         tcp_cleanup_congestion_control(sk);
1915
1916         /* Clean up the write buffer. */
1917         tcp_write_queue_purge(sk);
1918
1919         /* Cleans up our, hopefully empty, out_of_order_queue. */
1920         __skb_queue_purge(&tp->out_of_order_queue);
1921
1922 #ifdef CONFIG_TCP_MD5SIG
1923         /* Clean up the MD5 key list, if any */
1924         if (tp->md5sig_info) {
1925                 tcp_v4_clear_md5_list(sk);
1926                 kfree(tp->md5sig_info);
1927                 tp->md5sig_info = NULL;
1928         }
1929 #endif
1930
1931 #ifdef CONFIG_NET_DMA
1932         /* Cleans up our sk_async_wait_queue */
1933         __skb_queue_purge(&sk->sk_async_wait_queue);
1934 #endif
1935
1936         /* Clean up the prequeue; it really should be empty by now. */
1937         __skb_queue_purge(&tp->ucopy.prequeue);
1938
1939         /* Clean up a referenced TCP bind bucket. */
1940         if (inet_csk(sk)->icsk_bind_hash)
1941                 inet_put_port(sk);
1942
1943         /*
1944          * If a cached sendmsg page exists, free it.
1945          */
1946         if (sk->sk_sndmsg_page) {
1947                 __free_page(sk->sk_sndmsg_page);
1948                 sk->sk_sndmsg_page = NULL;
1949         }
1950
1951         /* TCP Cookie Transactions */
1952         if (tp->cookie_values != NULL) {
1953                 kref_put(&tp->cookie_values->kref,
1954                          tcp_cookie_values_release);
1955                 tp->cookie_values = NULL;
1956         }
1957
1958         percpu_counter_dec(&tcp_sockets_allocated);
1959 }
1960 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1961
1962 #ifdef CONFIG_PROC_FS
1963 /* Proc filesystem TCP sock list dumping. */
1964
1965 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1966 {
1967         return hlist_nulls_empty(head) ? NULL :
1968                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1969 }
1970
1971 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1972 {
1973         return !is_a_nulls(tw->tw_node.next) ?
1974                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1975 }
1976
1977 /*
1978  * Get the next listener socket following cur.  If cur is NULL, get the
1979  * first socket starting from the bucket given in st->bucket; when
1980  * st->bucket is zero, the very first socket in the hash table is returned.
1981  */
1982 static void *listening_get_next(struct seq_file *seq, void *cur)
1983 {
1984         struct inet_connection_sock *icsk;
1985         struct hlist_nulls_node *node;
1986         struct sock *sk = cur;
1987         struct inet_listen_hashbucket *ilb;
1988         struct tcp_iter_state *st = seq->private;
1989         struct net *net = seq_file_net(seq);
1990
1991         if (!sk) {
1992                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1993                 spin_lock_bh(&ilb->lock);
1994                 sk = sk_nulls_head(&ilb->head);
1995                 st->offset = 0;
1996                 goto get_sk;
1997         }
1998         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1999         ++st->num;
2000         ++st->offset;
2001
2002         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2003                 struct request_sock *req = cur;
2004
2005                 icsk = inet_csk(st->syn_wait_sk);
2006                 req = req->dl_next;
2007                 while (1) {
2008                         while (req) {
2009                                 if (req->rsk_ops->family == st->family) {
2010                                         cur = req;
2011                                         goto out;
2012                                 }
2013                                 req = req->dl_next;
2014                         }
2015                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2016                                 break;
2017 get_req:
2018                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2019                 }
2020                 sk        = sk_nulls_next(st->syn_wait_sk);
2021                 st->state = TCP_SEQ_STATE_LISTENING;
2022                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2023         } else {
2024                 icsk = inet_csk(sk);
2025                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2026                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2027                         goto start_req;
2028                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2029                 sk = sk_nulls_next(sk);
2030         }
2031 get_sk:
2032         sk_nulls_for_each_from(sk, node) {
2033                 if (!net_eq(sock_net(sk), net))
2034                         continue;
2035                 if (sk->sk_family == st->family) {
2036                         cur = sk;
2037                         goto out;
2038                 }
2039                 icsk = inet_csk(sk);
2040                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2041                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2042 start_req:
2043                         st->uid         = sock_i_uid(sk);
2044                         st->syn_wait_sk = sk;
2045                         st->state       = TCP_SEQ_STATE_OPENREQ;
2046                         st->sbucket     = 0;
2047                         goto get_req;
2048                 }
2049                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2050         }
2051         spin_unlock_bh(&ilb->lock);
2052         st->offset = 0;
2053         if (++st->bucket < INET_LHTABLE_SIZE) {
2054                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2055                 spin_lock_bh(&ilb->lock);
2056                 sk = sk_nulls_head(&ilb->head);
2057                 goto get_sk;
2058         }
2059         cur = NULL;
2060 out:
2061         return cur;
2062 }
2063
2064 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2065 {
2066         struct tcp_iter_state *st = seq->private;
2067         void *rc;
2068
2069         st->bucket = 0;
2070         st->offset = 0;
2071         rc = listening_get_next(seq, NULL);
2072
2073         while (rc && *pos) {
2074                 rc = listening_get_next(seq, rc);
2075                 --*pos;
2076         }
2077         return rc;
2078 }
2079
2080 static inline int empty_bucket(struct tcp_iter_state *st)
2081 {
2082         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2083                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2084 }
2085
2086 /*
2087  * Get the first established socket, starting from the bucket given in st->bucket.
2088  * If st->bucket is zero, the very first socket in the hash is returned.
2089  */
2090 static void *established_get_first(struct seq_file *seq)
2091 {
2092         struct tcp_iter_state *st = seq->private;
2093         struct net *net = seq_file_net(seq);
2094         void *rc = NULL;
2095
2096         st->offset = 0;
2097         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2098                 struct sock *sk;
2099                 struct hlist_nulls_node *node;
2100                 struct inet_timewait_sock *tw;
2101                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2102
2103                 /* Lockless fast path for the common case of empty buckets */
2104                 if (empty_bucket(st))
2105                         continue;
2106
2107                 spin_lock_bh(lock);
2108                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2109                         if (sk->sk_family != st->family ||
2110                             !net_eq(sock_net(sk), net)) {
2111                                 continue;
2112                         }
2113                         rc = sk;
2114                         goto out;
2115                 }
2116                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2117                 inet_twsk_for_each(tw, node,
2118                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2119                         if (tw->tw_family != st->family ||
2120                             !net_eq(twsk_net(tw), net)) {
2121                                 continue;
2122                         }
2123                         rc = tw;
2124                         goto out;
2125                 }
2126                 spin_unlock_bh(lock);
2127                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2128         }
2129 out:
2130         return rc;
2131 }
2132
2133 static void *established_get_next(struct seq_file *seq, void *cur)
2134 {
2135         struct sock *sk = cur;
2136         struct inet_timewait_sock *tw;
2137         struct hlist_nulls_node *node;
2138         struct tcp_iter_state *st = seq->private;
2139         struct net *net = seq_file_net(seq);
2140
2141         ++st->num;
2142         ++st->offset;
2143
2144         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2145                 tw = cur;
2146                 tw = tw_next(tw);
2147 get_tw:
2148                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2149                         tw = tw_next(tw);
2150                 }
2151                 if (tw) {
2152                         cur = tw;
2153                         goto out;
2154                 }
2155                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2156                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2157
2158                 /* Look for the next non-empty bucket */
2159                 st->offset = 0;
2160                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2161                                 empty_bucket(st))
2162                         ;
2163                 if (st->bucket > tcp_hashinfo.ehash_mask)
2164                         return NULL;
2165
2166                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2167                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2168         } else
2169                 sk = sk_nulls_next(sk);
2170
2171         sk_nulls_for_each_from(sk, node) {
2172                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2173                         goto found;
2174         }
2175
2176         st->state = TCP_SEQ_STATE_TIME_WAIT;
2177         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2178         goto get_tw;
2179 found:
2180         cur = sk;
2181 out:
2182         return cur;
2183 }
2184
2185 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2186 {
2187         struct tcp_iter_state *st = seq->private;
2188         void *rc;
2189
2190         st->bucket = 0;
2191         rc = established_get_first(seq);
2192
2193         while (rc && pos) {
2194                 rc = established_get_next(seq, rc);
2195                 --pos;
2196         }
2197         return rc;
2198 }
2199
2200 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2201 {
2202         void *rc;
2203         struct tcp_iter_state *st = seq->private;
2204
2205         st->state = TCP_SEQ_STATE_LISTENING;
2206         rc        = listening_get_idx(seq, &pos);
2207
2208         if (!rc) {
2209                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2210                 rc        = established_get_idx(seq, pos);
2211         }
2212
2213         return rc;
2214 }
2215
2216 static void *tcp_seek_last_pos(struct seq_file *seq)
2217 {
2218         struct tcp_iter_state *st = seq->private;
2219         int offset = st->offset;
2220         int orig_num = st->num;
2221         void *rc = NULL;
2222
2223         switch (st->state) {
2224         case TCP_SEQ_STATE_OPENREQ:
2225         case TCP_SEQ_STATE_LISTENING:
2226                 if (st->bucket >= INET_LHTABLE_SIZE)
2227                         break;
2228                 st->state = TCP_SEQ_STATE_LISTENING;
2229                 rc = listening_get_next(seq, NULL);
2230                 while (offset-- && rc)
2231                         rc = listening_get_next(seq, rc);
2232                 if (rc)
2233                         break;
2234                 st->bucket = 0;
2235                 /* Fallthrough */
2236         case TCP_SEQ_STATE_ESTABLISHED:
2237         case TCP_SEQ_STATE_TIME_WAIT:
2238                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2239                 if (st->bucket > tcp_hashinfo.ehash_mask)
2240                         break;
2241                 rc = established_get_first(seq);
2242                 while (offset-- && rc)
2243                         rc = established_get_next(seq, rc);
2244         }
2245
2246         st->num = orig_num;
2247
2248         return rc;
2249 }
2250
2251 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2252 {
2253         struct tcp_iter_state *st = seq->private;
2254         void *rc;
2255
2256         if (*pos && *pos == st->last_pos) {
2257                 rc = tcp_seek_last_pos(seq);
2258                 if (rc)
2259                         goto out;
2260         }
2261
2262         st->state = TCP_SEQ_STATE_LISTENING;
2263         st->num = 0;
2264         st->bucket = 0;
2265         st->offset = 0;
2266         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2267
2268 out:
2269         st->last_pos = *pos;
2270         return rc;
2271 }
2272
2273 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2274 {
2275         struct tcp_iter_state *st = seq->private;
2276         void *rc = NULL;
2277
2278         if (v == SEQ_START_TOKEN) {
2279                 rc = tcp_get_idx(seq, 0);
2280                 goto out;
2281         }
2282
2283         switch (st->state) {
2284         case TCP_SEQ_STATE_OPENREQ:
2285         case TCP_SEQ_STATE_LISTENING:
2286                 rc = listening_get_next(seq, v);
2287                 if (!rc) {
2288                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2289                         st->bucket = 0;
2290                         st->offset = 0;
2291                         rc        = established_get_first(seq);
2292                 }
2293                 break;
2294         case TCP_SEQ_STATE_ESTABLISHED:
2295         case TCP_SEQ_STATE_TIME_WAIT:
2296                 rc = established_get_next(seq, v);
2297                 break;
2298         }
2299 out:
2300         ++*pos;
2301         st->last_pos = *pos;
2302         return rc;
2303 }
2304
2305 static void tcp_seq_stop(struct seq_file *seq, void *v)
2306 {
2307         struct tcp_iter_state *st = seq->private;
2308
2309         switch (st->state) {
2310         case TCP_SEQ_STATE_OPENREQ:
2311                 if (v) {
2312                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2313                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2314                 }
2315         case TCP_SEQ_STATE_LISTENING:
2316                 if (v != SEQ_START_TOKEN)
2317                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2318                 break;
2319         case TCP_SEQ_STATE_TIME_WAIT:
2320         case TCP_SEQ_STATE_ESTABLISHED:
2321                 if (v)
2322                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2323                 break;
2324         }
2325 }
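
/*
 * Editor's sketch: the start/next/stop/show callbacks above follow the
 * generic seq_file iterator contract. A minimal, self-contained example
 * of that contract (hypothetical names, iterating a static array):
 */
static int sketch_items[] = { 1, 2, 3 };

static void *sketch_seq_start(struct seq_file *seq, loff_t *pos)
{
        return *pos < ARRAY_SIZE(sketch_items) ? &sketch_items[*pos] : NULL;
}

static void *sketch_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return *pos < ARRAY_SIZE(sketch_items) ? &sketch_items[*pos] : NULL;
}

static void sketch_seq_stop(struct seq_file *seq, void *v)
{
        /* Nothing to unlock in this sketch; tcp_seq_stop() above must
         * drop whichever hash-bucket lock the iterator still holds. */
}

static int sketch_seq_show(struct seq_file *seq, void *v)
{
        seq_printf(seq, "%d\n", *(int *)v);
        return 0;
}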
2326
2327 static int tcp_seq_open(struct inode *inode, struct file *file)
2328 {
2329         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2330         struct tcp_iter_state *s;
2331         int err;
2332
2333         err = seq_open_net(inode, file, &afinfo->seq_ops,
2334                           sizeof(struct tcp_iter_state));
2335         if (err < 0)
2336                 return err;
2337
2338         s = ((struct seq_file *)file->private_data)->private;
2339         s->family               = afinfo->family;
2340         s->last_pos             = 0;
2341         return 0;
2342 }
2343
2344 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2345 {
2346         int rc = 0;
2347         struct proc_dir_entry *p;
2348
2349         afinfo->seq_fops.open           = tcp_seq_open;
2350         afinfo->seq_fops.read           = seq_read;
2351         afinfo->seq_fops.llseek         = seq_lseek;
2352         afinfo->seq_fops.release        = seq_release_net;
2353
2354         afinfo->seq_ops.start           = tcp_seq_start;
2355         afinfo->seq_ops.next            = tcp_seq_next;
2356         afinfo->seq_ops.stop            = tcp_seq_stop;
2357
2358         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2359                              &afinfo->seq_fops, afinfo);
2360         if (!p)
2361                 rc = -ENOMEM;
2362         return rc;
2363 }
2364 EXPORT_SYMBOL(tcp_proc_register);
2365
2366 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2367 {
2368         proc_net_remove(net, afinfo->name);
2369 }
2370 EXPORT_SYMBOL(tcp_proc_unregister);
2371
2372 static void get_openreq4(struct sock *sk, struct request_sock *req,
2373                          struct seq_file *f, int i, int uid, int *len)
2374 {
2375         const struct inet_request_sock *ireq = inet_rsk(req);
2376         int ttd = req->expires - jiffies;
2377
2378         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2379                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2380                 i,
2381                 ireq->loc_addr,
2382                 ntohs(inet_sk(sk)->inet_sport),
2383                 ireq->rmt_addr,
2384                 ntohs(ireq->rmt_port),
2385                 TCP_SYN_RECV,
2386                 0, 0, /* could print option size, but that is af dependent. */
2387                 1,    /* timers active (only the expire timer) */
2388                 jiffies_to_clock_t(ttd),
2389                 req->retrans,
2390                 uid,
2391                 0,  /* non standard timer */
2392                 0, /* open_requests have no inode */
2393                 atomic_read(&sk->sk_refcnt),
2394                 req,
2395                 len);
2396 }
2397
2398 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2399 {
2400         int timer_active;
2401         unsigned long timer_expires;
2402         struct tcp_sock *tp = tcp_sk(sk);
2403         const struct inet_connection_sock *icsk = inet_csk(sk);
2404         struct inet_sock *inet = inet_sk(sk);
2405         __be32 dest = inet->inet_daddr;
2406         __be32 src = inet->inet_rcv_saddr;
2407         __u16 destp = ntohs(inet->inet_dport);
2408         __u16 srcp = ntohs(inet->inet_sport);
2409         int rx_queue;
2410
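        /* Editor's note: timer codes as reported in the "tr" column of
         * /proc/net/tcp: 0 = no timer, 1 = retransmit, 2 = keepalive
         * (sk_timer), 3 = TIME_WAIT, 4 = zero-window probe. */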
2411         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2412                 timer_active    = 1;
2413                 timer_expires   = icsk->icsk_timeout;
2414         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2415                 timer_active    = 4;
2416                 timer_expires   = icsk->icsk_timeout;
2417         } else if (timer_pending(&sk->sk_timer)) {
2418                 timer_active    = 2;
2419                 timer_expires   = sk->sk_timer.expires;
2420         } else {
2421                 timer_active    = 0;
2422                 timer_expires = jiffies;
2423         }
2424
2425         if (sk->sk_state == TCP_LISTEN)
2426                 rx_queue = sk->sk_ack_backlog;
2427         else
2428                 /*
2429                  * Because we don't lock the socket, we might find a transient negative value.
2430                  */
2431                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2432
2433         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2434                         "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2435                 i, src, srcp, dest, destp, sk->sk_state,
2436                 tp->write_seq - tp->snd_una,
2437                 rx_queue,
2438                 timer_active,
2439                 jiffies_to_clock_t(timer_expires - jiffies),
2440                 icsk->icsk_retransmits,
2441                 sock_i_uid(sk),
2442                 icsk->icsk_probes_out,
2443                 sock_i_ino(sk),
2444                 atomic_read(&sk->sk_refcnt), sk,
2445                 jiffies_to_clock_t(icsk->icsk_rto),
2446                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2447                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2448                 tp->snd_cwnd,
2449                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2450                 len);
2451 }
2452
2453 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2454                                struct seq_file *f, int i, int *len)
2455 {
2456         __be32 dest, src;
2457         __u16 destp, srcp;
2458         int ttd = tw->tw_ttd - jiffies;
2459
2460         if (ttd < 0)
2461                 ttd = 0;
2462
2463         dest  = tw->tw_daddr;
2464         src   = tw->tw_rcv_saddr;
2465         destp = ntohs(tw->tw_dport);
2466         srcp  = ntohs(tw->tw_sport);
2467
2468         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2469                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2470                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2471                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2472                 atomic_read(&tw->tw_refcnt), tw, len);
2473 }
2474
2475 #define TMPSZ 150
2476
2477 static int tcp4_seq_show(struct seq_file *seq, void *v)
2478 {
2479         struct tcp_iter_state *st;
2480         int len;
2481
2482         if (v == SEQ_START_TOKEN) {
2483                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2484                            "  sl  local_address rem_address   st tx_queue "
2485                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2486                            "inode");
2487                 goto out;
2488         }
2489         st = seq->private;
2490
2491         switch (st->state) {
2492         case TCP_SEQ_STATE_LISTENING:
2493         case TCP_SEQ_STATE_ESTABLISHED:
2494                 get_tcp4_sock(v, seq, st->num, &len);
2495                 break;
2496         case TCP_SEQ_STATE_OPENREQ:
2497                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2498                 break;
2499         case TCP_SEQ_STATE_TIME_WAIT:
2500                 get_timewait4_sock(v, seq, st->num, &len);
2501                 break;
2502         }
2503         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2504 out:
2505         return 0;
2506 }
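
/*
 * Editor's note: addresses in /proc/net/tcp are the raw __be32 values
 * printed with %08X, i.e. in the host byte order of the big-endian word.
 * On a little-endian machine, for example, "0100007F:0016" decodes to
 * 127.0.0.1 port 22 (0x16). A hypothetical userspace parse of one row
 * could look like:
 *
 *      sscanf(line, "%*d: %8x:%4hx %8x:%4hx %2x",
 *             &src, &srcp, &dst, &dstp, &state);
 */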
2507
2508 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2509         .name           = "tcp",
2510         .family         = AF_INET,
2511         .seq_fops       = {
2512                 .owner          = THIS_MODULE,
2513         },
2514         .seq_ops        = {
2515                 .show           = tcp4_seq_show,
2516         },
2517 };
2518
2519 static int __net_init tcp4_proc_init_net(struct net *net)
2520 {
2521         return tcp_proc_register(net, &tcp4_seq_afinfo);
2522 }
2523
2524 static void __net_exit tcp4_proc_exit_net(struct net *net)
2525 {
2526         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2527 }
2528
2529 static struct pernet_operations tcp4_net_ops = {
2530         .init = tcp4_proc_init_net,
2531         .exit = tcp4_proc_exit_net,
2532 };
2533
2534 int __init tcp4_proc_init(void)
2535 {
2536         return register_pernet_subsys(&tcp4_net_ops);
2537 }
2538
2539 void tcp4_proc_exit(void)
2540 {
2541         unregister_pernet_subsys(&tcp4_net_ops);
2542 }
2543 #endif /* CONFIG_PROC_FS */
2544
2545 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2546 {
2547         const struct iphdr *iph = skb_gro_network_header(skb);
2548
2549         switch (skb->ip_summed) {
2550         case CHECKSUM_COMPLETE:
2551                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2552                                   skb->csum)) {
2553                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2554                         break;
2555                 }
2556
2557                 /* fall through */
2558         case CHECKSUM_NONE:
2559                 NAPI_GRO_CB(skb)->flush = 1;
2560                 return NULL;
2561         }
2562
2563         return tcp_gro_receive(head, skb);
2564 }
2565
2566 int tcp4_gro_complete(struct sk_buff *skb)
2567 {
2568         const struct iphdr *iph = ip_hdr(skb);
2569         struct tcphdr *th = tcp_hdr(skb);
2570
2571         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2572                                   iph->saddr, iph->daddr, 0);
2573         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2574
2575         return tcp_gro_complete(skb);
2576 }
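
/*
 * Editor's note: tcp4_gro_complete() re-seeds th->check with the
 * complemented pseudo-header sum, the same convention used for checksum
 * offload on transmit: the device (or the software GSO path) then only
 * has to sum the TCP header and payload and fold the result into the
 * checksum field.
 */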
2577
2578 struct proto tcp_prot = {
2579         .name                   = "TCP",
2580         .owner                  = THIS_MODULE,
2581         .close                  = tcp_close,
2582         .connect                = tcp_v4_connect,
2583         .disconnect             = tcp_disconnect,
2584         .accept                 = inet_csk_accept,
2585         .ioctl                  = tcp_ioctl,
2586         .init                   = tcp_v4_init_sock,
2587         .destroy                = tcp_v4_destroy_sock,
2588         .shutdown               = tcp_shutdown,
2589         .setsockopt             = tcp_setsockopt,
2590         .getsockopt             = tcp_getsockopt,
2591         .recvmsg                = tcp_recvmsg,
2592         .sendmsg                = tcp_sendmsg,
2593         .sendpage               = tcp_sendpage,
2594         .backlog_rcv            = tcp_v4_do_rcv,
2595         .hash                   = inet_hash,
2596         .unhash                 = inet_unhash,
2597         .get_port               = inet_csk_get_port,
2598         .enter_memory_pressure  = tcp_enter_memory_pressure,
2599         .sockets_allocated      = &tcp_sockets_allocated,
2600         .orphan_count           = &tcp_orphan_count,
2601         .memory_allocated       = &tcp_memory_allocated,
2602         .memory_pressure        = &tcp_memory_pressure,
2603         .sysctl_mem             = sysctl_tcp_mem,
2604         .sysctl_wmem            = sysctl_tcp_wmem,
2605         .sysctl_rmem            = sysctl_tcp_rmem,
2606         .max_header             = MAX_TCP_HEADER,
2607         .obj_size               = sizeof(struct tcp_sock),
2608         .slab_flags             = SLAB_DESTROY_BY_RCU,
2609         .twsk_prot              = &tcp_timewait_sock_ops,
2610         .rsk_prot               = &tcp_request_sock_ops,
2611         .h.hashinfo             = &tcp_hashinfo,
2612         .no_autobind            = true,
2613 #ifdef CONFIG_COMPAT
2614         .compat_setsockopt      = compat_tcp_setsockopt,
2615         .compat_getsockopt      = compat_tcp_getsockopt,
2616 #endif
2617 };
2618 EXPORT_SYMBOL(tcp_prot);
2619
2620
2621 static int __net_init tcp_sk_init(struct net *net)
2622 {
2623         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2624                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2625 }
2626
2627 static void __net_exit tcp_sk_exit(struct net *net)
2628 {
2629         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2630 }
2631
2632 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2633 {
2634         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2635 }
2636
2637 static struct pernet_operations __net_initdata tcp_sk_ops = {
2638        .init       = tcp_sk_init,
2639        .exit       = tcp_sk_exit,
2640        .exit_batch = tcp_sk_exit_batch,
2641 };
2642
2643 void __init tcp_v4_init(void)
2644 {
2645         inet_hashinfo_init(&tcp_hashinfo);
2646         if (register_pernet_subsys(&tcp_sk_ops))
2647                 panic("Failed to create the TCP control socket.\n");
2648 }