net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53
  54 #include <linux/bottom_half.h>
  55 #include <linux/types.h>
  56 #include <linux/fcntl.h>
  57 #include <linux/module.h>
  58 #include <linux/random.h>
  59 #include <linux/cache.h>
  60 #include <linux/jhash.h>
  61 #include <linux/init.h>
  62 #include <linux/times.h>
  63 #include <linux/slab.h>
  64
  65 #include <net/net_namespace.h>
  66 #include <net/icmp.h>
  67 #include <net/inet_hashtables.h>
  68 #include <net/tcp.h>
  69 #include <net/transp_v6.h>
  70 #include <net/ipv6.h>
  71 #include <net/inet_common.h>
  72 #include <net/timewait_sock.h>
  73 #include <net/xfrm.h>
  74 #include <net/netdma.h>
  75 #include <net/secure_seq.h>
  76
  77 #include <linux/inet.h>
  78 #include <linux/ipv6.h>
  79 #include <linux/stddef.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/seq_file.h>
  82
  83 #include <linux/crypto.h>
  84 #include <linux/scatterlist.h>
  85
  86 int sysctl_tcp_tw_reuse __read_mostly;
  87 int sysctl_tcp_low_latency __read_mostly;
  88 EXPORT_SYMBOL(sysctl_tcp_low_latency);
  89
  90
  91 #ifdef CONFIG_TCP_MD5SIG
  92 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
  93                                                    __be32 addr);
  94 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
  95                                __be32 daddr, __be32 saddr, struct tcphdr *th);
  96 #else
  97 static inline
  98 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
  99 {
 100         return NULL;
 101 }
 102 #endif
 103
 104 struct inet_hashinfo tcp_hashinfo;
 105 EXPORT_SYMBOL(tcp_hashinfo);
 106
 107 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 108 {
 109         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 110                                           ip_hdr(skb)->saddr,
 111                                           tcp_hdr(skb)->dest,
 112                                           tcp_hdr(skb)->source);
 113 }
 114
 115 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 116 {
 117         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 118         struct tcp_sock *tp = tcp_sk(sk);
 119
 120         /* With PAWS, it is safe from the viewpoint
 121            of data integrity. Even without PAWS it is safe provided sequence
 122            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 123
 124            Actually, the idea is close to VJ's one, only timestamp cache is
 125            held not per host, but per port pair and TW bucket is used as state
 126            holder.
 127
 128            If TW bucket has been already destroyed we fall back to VJ's scheme
 129            and use initial timestamp retrieved from peer table.
 130          */
 131         if (tcptw->tw_ts_recent_stamp &&
 132             (twp == NULL || (sysctl_tcp_tw_reuse &&
 133                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 134                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 135                 if (tp->write_seq == 0)
 136                         tp->write_seq = 1;
 137                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 138                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 139                 sock_hold(sktw);
 140                 return 1;
 141         }
 142
 143         return 0;
 144 }
 145 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 146
 147 /* This will initiate an outgoing connection. */
 148 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 149 {
 150         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 151         struct inet_sock *inet = inet_sk(sk);
 152         struct tcp_sock *tp = tcp_sk(sk);
 153         __be16 orig_sport, orig_dport;
 154         __be32 daddr, nexthop;
 155         struct flowi4 *fl4;
 156         struct rtable *rt;
 157         int err;
 158         struct ip_options_rcu *inet_opt;
 159
 160         if (addr_len < sizeof(struct sockaddr_in))
 161                 return -EINVAL;
 162
 163         if (usin->sin_family != AF_INET)
 164                 return -EAFNOSUPPORT;
 165
 166         nexthop = daddr = usin->sin_addr.s_addr;
 167         inet_opt = rcu_dereference_protected(inet->inet_opt,
 168                                              sock_owned_by_user(sk));
 169         if (inet_opt && inet_opt->opt.srr) {
 170                 if (!daddr)
 171                         return -EINVAL;
 172                 nexthop = inet_opt->opt.faddr;
 173         }
 174
 175         orig_sport = inet->inet_sport;
 176         orig_dport = usin->sin_port;
 177         fl4 = &inet->cork.fl.u.ip4;
 178         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 179                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 180                               IPPROTO_TCP,
 181                               orig_sport, orig_dport, sk, true);
 182         if (IS_ERR(rt)) {
 183                 err = PTR_ERR(rt);
 184                 if (err == -ENETUNREACH)
 185                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 186                 return err;
 187         }
 188
 189         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 190                 ip_rt_put(rt);
 191                 return -ENETUNREACH;
 192         }
 193
 194         if (!inet_opt || !inet_opt->opt.srr)
 195                 daddr = fl4->daddr;
 196
 197         if (!inet->inet_saddr)
 198                 inet->inet_saddr = fl4->saddr;
 199         inet->inet_rcv_saddr = inet->inet_saddr;
 200
 201         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 202                 /* Reset inherited state */
 203                 tp->rx_opt.ts_recent       = 0;
 204                 tp->rx_opt.ts_recent_stamp = 0;
 205                 tp->write_seq              = 0;
 206         }
 207
 208         if (tcp_death_row.sysctl_tw_recycle &&
 209             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
 210                 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
 211                 /*
 212                  * VJ's idea. We save last timestamp seen from
 213                  * the destination in peer table, when entering state
 214                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 215                  * when trying new connection.
 216                  */
 217                 if (peer) {
 218                         inet_peer_refcheck(peer);
 219                         if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
 220                                 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 221                                 tp->rx_opt.ts_recent = peer->tcp_ts;
 222                         }
 223                 }
 224         }
 225
 226         inet->inet_dport = usin->sin_port;
 227         inet->inet_daddr = daddr;
 228
 229         inet_csk(sk)->icsk_ext_hdr_len = 0;
 230         if (inet_opt)
 231                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 232
 233         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 234
 235         /* Socket identity is still unknown (sport may be zero).
 236          * However we set state to SYN-SENT and not releasing socket
 237          * lock select source port, enter ourselves into the hash tables and
 238          * complete initialization after this.
 239          */
 240         tcp_set_state(sk, TCP_SYN_SENT);
 241         err = inet_hash_connect(&tcp_death_row, sk);
 242         if (err)
 243                 goto failure;
 244
 245         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 246                                inet->inet_sport, inet->inet_dport, sk);
 247         if (IS_ERR(rt)) {
 248                 err = PTR_ERR(rt);
 249                 rt = NULL;
 250                 goto failure;
 251         }
 252         /* OK, now commit destination to socket.  */
 253         sk->sk_gso_type = SKB_GSO_TCPV4;
 254         sk_setup_caps(sk, &rt->dst);
 255
 256         if (!tp->write_seq)
 257                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 258                                                            inet->inet_daddr,
 259                                                            inet->inet_sport,
 260                                                            usin->sin_port);
 261
 262         inet->inet_id = tp->write_seq ^ jiffies;
 263
 264         err = tcp_connect(sk);
 265         rt = NULL;
 266         if (err)
 267                 goto failure;
 268
 269         return 0;
 270
 271 failure:
 272         /*
 273          * This unhashes the socket and releases the local port,
 274          * if necessary.
 275          */
 276         tcp_set_state(sk, TCP_CLOSE);
 277         ip_rt_put(rt);
 278         sk->sk_route_caps = 0;
 279         inet->inet_dport = 0;
 280         return err;
 281 }
 282 EXPORT_SYMBOL(tcp_v4_connect);
 283
 284 /*
 285  * This routine does path mtu discovery as defined in RFC1191.
 286  */
 287 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
 288 {
 289         struct dst_entry *dst;
 290         struct inet_sock *inet = inet_sk(sk);
 291
 292         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 293          * send out by Linux are always <576bytes so they should go through
 294          * unfragmented).
 295          */
 296         if (sk->sk_state == TCP_LISTEN)
 297                 return;
 298
 299         /* We don't check in the destentry if pmtu discovery is forbidden
 300          * on this route. We just assume that no packet_to_big packets
 301          * are send back when pmtu discovery is not active.
 302          * There is a small race when the user changes this flag in the
 303          * route, but I think that's acceptable.
 304          */
 305         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 306                 return;
 307
 308         dst->ops->update_pmtu(dst, mtu);
 309
 310         /* Something is about to be wrong... Remember soft error
 311          * for the case, if this connection will not able to recover.
 312          */
 313         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 314                 sk->sk_err_soft = EMSGSIZE;
 315
 316         mtu = dst_mtu(dst);
 317
 318         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 319             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 320                 tcp_sync_mss(sk, mtu);
 321
 322                 /* Resend the TCP packet because it's
 323                  * clear that the old packet has been
 324                  * dropped. This is the new "fast" path mtu
 325                  * discovery.
 326                  */
 327                 tcp_simple_retransmit(sk);
 328         } /* else let the usual retransmit timer handle it */
 329 }
 330
 331 /*
 332  * This routine is called by the ICMP module when it gets some
 333  * sort of error condition.  If err < 0 then the socket should
 334  * be closed and the error returned to the user.  If err > 0
 335  * it's just the icmp type << 8 | icmp code.  After adjustment
 336  * header points to the first 8 bytes of the tcp header.  We need
 337  * to find the appropriate port.
 338  *
 339  * The locking strategy used here is very "optimistic". When
 340  * someone else accesses the socket the ICMP is just dropped
 341  * and for some paths there is no check at all.
 342  * A more general error queue to queue errors for later handling
 343  * is probably better.
 344  *
 345  */
 346
 347 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 348 {
 349         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 350         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 351         struct inet_connection_sock *icsk;
 352         struct tcp_sock *tp;
 353         struct inet_sock *inet;
 354         const int type = icmp_hdr(icmp_skb)->type;
 355         const int code = icmp_hdr(icmp_skb)->code;
 356         struct sock *sk;
 357         struct sk_buff *skb;
 358         __u32 seq;
 359         __u32 remaining;
 360         int err;
 361         struct net *net = dev_net(icmp_skb->dev);
 362
 363         if (icmp_skb->len < (iph->ihl << 2) + 8) {
 364                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 365                 return;
 366         }
 367
 368         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 369                         iph->saddr, th->source, inet_iif(icmp_skb));
 370         if (!sk) {
 371                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 372                 return;
 373         }
 374         if (sk->sk_state == TCP_TIME_WAIT) {
 375                 inet_twsk_put(inet_twsk(sk));
 376                 return;
 377         }
 378
 379         bh_lock_sock(sk);
 380         /* If too many ICMPs get dropped on busy
 381          * servers this needs to be solved differently.
 382          */
 383         if (sock_owned_by_user(sk))
 384                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 385
 386         if (sk->sk_state == TCP_CLOSE)
 387                 goto out;
 388
 389         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 390                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 391                 goto out;
 392         }
 393
 394         icsk = inet_csk(sk);
 395         tp = tcp_sk(sk);
 396         seq = ntohl(th->seq);
 397         if (sk->sk_state != TCP_LISTEN &&
 398             !between(seq, tp->snd_una, tp->snd_nxt)) {
 399                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 400                 goto out;
 401         }
 402
 403         switch (type) {
 404         case ICMP_SOURCE_QUENCH:
 405                 /* Just silently ignore these. */
 406                 goto out;
 407         case ICMP_PARAMETERPROB:
 408                 err = EPROTO;
 409                 break;
 410         case ICMP_DEST_UNREACH:
 411                 if (code > NR_ICMP_UNREACH)
 412                         goto out;
 413
 414                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 415                         if (!sock_owned_by_user(sk))
 416                                 do_pmtu_discovery(sk, iph, info);
 417                         goto out;
 418                 }
 419
 420                 err = icmp_err_convert[code].errno;
 421                 /* check if icmp_skb allows revert of backoff
 422                  * (see draft-zimmermann-tcp-lcd) */
 423                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 424                         break;
 425                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 426                     !icsk->icsk_backoff)
 427                         break;
 428
 429                 if (sock_owned_by_user(sk))
 430                         break;
 431
 432                 icsk->icsk_backoff--;
 433                 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
 434                                          icsk->icsk_backoff;
 435                 tcp_bound_rto(sk);
 436
 437                 skb = tcp_write_queue_head(sk);
 438                 BUG_ON(!skb);
 439
 440                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 441                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
 442
 443                 if (remaining) {
 444                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 445                                                   remaining, TCP_RTO_MAX);
 446                 } else {
 447                         /* RTO revert clocked out retransmission.
 448                          * Will retransmit now */
 449                         tcp_retransmit_timer(sk);
 450                 }
 451
 452                 break;
 453         case ICMP_TIME_EXCEEDED:
 454                 err = EHOSTUNREACH;
 455                 break;
 456         default:
 457                 goto out;
 458         }
 459
 460         switch (sk->sk_state) {
 461                 struct request_sock *req, **prev;
 462         case TCP_LISTEN:
 463                 if (sock_owned_by_user(sk))
 464                         goto out;
 465
 466                 req = inet_csk_search_req(sk, &prev, th->dest,
 467                                           iph->daddr, iph->saddr);
 468                 if (!req)
 469                         goto out;
 470
 471                 /* ICMPs are not backlogged, hence we cannot get
 472                    an established socket here.
 473                  */
 474                 WARN_ON(req->sk);
 475
 476                 if (seq != tcp_rsk(req)->snt_isn) {
 477                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 478                         goto out;
 479                 }
 480
 481                 /*
 482                  * Still in SYN_RECV, just remove it silently.
 483                  * There is no good way to pass the error to the newly
 484                  * created socket, and POSIX does not want network
 485                  * errors returned from accept().
 486                  */
 487                 inet_csk_reqsk_queue_drop(sk, req, prev);
 488                 goto out;
 489
 490         case TCP_SYN_SENT:
 491         case TCP_SYN_RECV:  /* Cannot happen.
 492                                It can f.e. if SYNs crossed.
 493                              */
 494                 if (!sock_owned_by_user(sk)) {
 495                         sk->sk_err = err;
 496
 497                         sk->sk_error_report(sk);
 498
 499                         tcp_done(sk);
 500                 } else {
 501                         sk->sk_err_soft = err;
 502                 }
 503                 goto out;
 504         }
 505
 506         /* If we've already connected we will keep trying
 507          * until we time out, or the user gives up.
 508          *
 509          * rfc1122 4.2.3.9 allows to consider as hard errors
 510          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 511          * but it is obsoleted by pmtu discovery).
 512          *
 513          * Note, that in modern internet, where routing is unreliable
 514          * and in each dark corner broken firewalls sit, sending random
 515          * errors ordered by their masters even this two messages finally lose
 516          * their original sense (even Linux sends invalid PORT_UNREACHs)
 517          *
 518          * Now we are in compliance with RFCs.
 519          *                                                      --ANK (980905)
 520          */
 521
 522         inet = inet_sk(sk);
 523         if (!sock_owned_by_user(sk) && inet->recverr) {
 524                 sk->sk_err = err;
 525                 sk->sk_error_report(sk);
 526         } else  { /* Only an error on timeout */
 527                 sk->sk_err_soft = err;
 528         }
 529
 530 out:
 531         bh_unlock_sock(sk);
 532         sock_put(sk);
 533 }
 534
 535 static void __tcp_v4_send_check(struct sk_buff *skb,
 536                                 __be32 saddr, __be32 daddr)
 537 {
 538         struct tcphdr *th = tcp_hdr(skb);
 539
 540         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 541                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 542                 skb->csum_start = skb_transport_header(skb) - skb->head;
 543                 skb->csum_offset = offsetof(struct tcphdr, check);
 544         } else {
 545                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 546                                          csum_partial(th,
 547                                                       th->doff << 2,
 548                                                       skb->csum));
 549         }
 550 }
 551
 552 /* This routine computes an IPv4 TCP checksum. */
 553 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 554 {
 555         struct inet_sock *inet = inet_sk(sk);
 556
 557         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 558 }
 559 EXPORT_SYMBOL(tcp_v4_send_check);
 560
 561 int tcp_v4_gso_send_check(struct sk_buff *skb)
 562 {
 563         const struct iphdr *iph;
 564         struct tcphdr *th;
 565
 566         if (!pskb_may_pull(skb, sizeof(*th)))
 567                 return -EINVAL;
 568
 569         iph = ip_hdr(skb);
 570         th = tcp_hdr(skb);
 571
 572         th->check = 0;
 573         skb->ip_summed = CHECKSUM_PARTIAL;
 574         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 575         return 0;
 576 }
 577
 578 /*
 579  *      This routine will send an RST to the other tcp.
 580  *
 581  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 582  *                    for reset.
 583  *      Answer: if a packet caused RST, it is not for a socket
 584  *              existing in our system, if it is matched to a socket,
 585  *              it is just duplicate segment or bug in other side's TCP.
 586  *              So that we build reply only basing on parameters
 587  *              arrived with segment.
 588  *      Exception: precedence violation. We do not implement it in any case.
 589  */
 590
 591 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 592 {
 593         struct tcphdr *th = tcp_hdr(skb);
 594         struct {
 595                 struct tcphdr th;
 596 #ifdef CONFIG_TCP_MD5SIG
 597                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 598 #endif
 599         } rep;
 600         struct ip_reply_arg arg;
 601 #ifdef CONFIG_TCP_MD5SIG
 602         struct tcp_md5sig_key *key;
 603 #endif
 604         struct net *net;
 605
 606         /* Never send a reset in response to a reset. */
 607         if (th->rst)
 608                 return;
 609
 610         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 611                 return;
 612
 613         /* Swap the send and the receive. */
 614         memset(&rep, 0, sizeof(rep));
 615         rep.th.dest   = th->source;
 616         rep.th.source = th->dest;
 617         rep.th.doff   = sizeof(struct tcphdr) / 4;
 618         rep.th.rst    = 1;
 619
 620         if (th->ack) {
 621                 rep.th.seq = th->ack_seq;
 622         } else {
 623                 rep.th.ack = 1;
 624                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 625                                        skb->len - (th->doff << 2));
 626         }
 627
 628         memset(&arg, 0, sizeof(arg));
 629         arg.iov[0].iov_base = (unsigned char *)&rep;
 630         arg.iov[0].iov_len  = sizeof(rep.th);
 631
 632 #ifdef CONFIG_TCP_MD5SIG
 633         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 634         if (key) {
 635                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 636                                    (TCPOPT_NOP << 16) |
 637                                    (TCPOPT_MD5SIG << 8) |
 638                                    TCPOLEN_MD5SIG);
 639                 /* Update length and the length the header thinks exists */
 640                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 641                 rep.th.doff = arg.iov[0].iov_len / 4;
 642
 643                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 644                                      key, ip_hdr(skb)->saddr,
 645                                      ip_hdr(skb)->daddr, &rep.th);
 646         }
 647 #endif
 648         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 649                                       ip_hdr(skb)->saddr, /* XXX */
 650                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 651         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 652         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 653
 654         net = dev_net(skb_dst(skb)->dev);
 655         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
 656                       &arg, arg.iov[0].iov_len);
 657
 658         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 659         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 660 }
 661
 662 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 663    outside socket context is ugly, certainly. What can I do?
 664  */
 665
 666 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 667                             u32 win, u32 ts, int oif,
 668                             struct tcp_md5sig_key *key,
 669                             int reply_flags)
 670 {
 671         struct tcphdr *th = tcp_hdr(skb);
 672         struct {
 673                 struct tcphdr th;
 674                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 675 #ifdef CONFIG_TCP_MD5SIG
 676                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 677 #endif
 678                         ];
 679         } rep;
 680         struct ip_reply_arg arg;
 681         struct net *net = dev_net(skb_dst(skb)->dev);
 682
 683         memset(&rep.th, 0, sizeof(struct tcphdr));
 684         memset(&arg, 0, sizeof(arg));
 685
 686         arg.iov[0].iov_base = (unsigned char *)&rep;
 687         arg.iov[0].iov_len  = sizeof(rep.th);
 688         if (ts) {
 689                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 690                                    (TCPOPT_TIMESTAMP << 8) |
 691                                    TCPOLEN_TIMESTAMP);
 692                 rep.opt[1] = htonl(tcp_time_stamp);
 693                 rep.opt[2] = htonl(ts);
 694                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 695         }
 696
 697         /* Swap the send and the receive. */
 698         rep.th.dest    = th->source;
 699         rep.th.source  = th->dest;
 700         rep.th.doff    = arg.iov[0].iov_len / 4;
 701         rep.th.seq     = htonl(seq);
 702         rep.th.ack_seq = htonl(ack);
 703         rep.th.ack     = 1;
 704         rep.th.window  = htons(win);
 705
 706 #ifdef CONFIG_TCP_MD5SIG
 707         if (key) {
 708                 int offset = (ts) ? 3 : 0;
 709
 710                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 711                                           (TCPOPT_NOP << 16) |
 712                                           (TCPOPT_MD5SIG << 8) |
 713                                           TCPOLEN_MD5SIG);
 714                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 715                 rep.th.doff = arg.iov[0].iov_len/4;
 716
 717                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 718                                     key, ip_hdr(skb)->saddr,
 719                                     ip_hdr(skb)->daddr, &rep.th);
 720         }
 721 #endif
 722         arg.flags = reply_flags;
 723         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 724                                       ip_hdr(skb)->saddr, /* XXX */
 725                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 726         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 727         if (oif)
 728                 arg.bound_dev_if = oif;
 729
 730         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
 731                       &arg, arg.iov[0].iov_len);
 732
 733         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 734 }
 735
 736 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 737 {
 738         struct inet_timewait_sock *tw = inet_twsk(sk);
 739         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 740
 741         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 742                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 743                         tcptw->tw_ts_recent,
 744                         tw->tw_bound_dev_if,
 745                         tcp_twsk_md5_key(tcptw),
 746                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
 747                         );
 748
 749         inet_twsk_put(tw);
 750 }
 751
 752 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 753                                   struct request_sock *req)
 754 {
 755         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 756                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 757                         req->ts_recent,
 758                         0,
 759                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
 760                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
 761 }
 762
 763 /*
 764  *      Send a SYN-ACK after having received a SYN.
 765  *      This still operates on a request_sock only, not on a big
 766  *      socket.
 767  */
 768 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 769                               struct request_sock *req,
 770                               struct request_values *rvp)
 771 {
 772         const struct inet_request_sock *ireq = inet_rsk(req);
 773         struct flowi4 fl4;
 774         int err = -1;
 775         struct sk_buff * skb;
 776
 777         /* First, grab a route. */
 778         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 779                 return -1;
 780
 781         skb = tcp_make_synack(sk, dst, req, rvp);
 782
 783         if (skb) {
 784                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 785
 786                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 787                                             ireq->rmt_addr,
 788                                             ireq->opt);
 789                 err = net_xmit_eval(err);
 790         }
 791
 792         dst_release(dst);
 793         return err;
 794 }
 795
 796 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 797                               struct request_values *rvp)
 798 {
 799         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 800         return tcp_v4_send_synack(sk, NULL, req, rvp);
 801 }
 802
 803 /*
 804  *      IPv4 request_sock destructor.
 805  */
 806 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 807 {
 808         kfree(inet_rsk(req)->opt);
 809 }
 810
 811 static void syn_flood_warning(const struct sk_buff *skb)
 812 {
 813         const char *msg;
 814
 815 #ifdef CONFIG_SYN_COOKIES
 816         if (sysctl_tcp_syncookies)
 817                 msg = "Sending cookies";
 818         else
 819 #endif
 820                 msg = "Dropping request";
 821
 822         pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
 823                                 ntohs(tcp_hdr(skb)->dest), msg);
 824 }
 825
 826 /*
 827  * Save and compile IPv4 options into the request_sock if needed.
 828  */
 829 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
 830                                                   struct sk_buff *skb)
 831 {
 832         const struct ip_options *opt = &(IPCB(skb)->opt);
 833         struct ip_options_rcu *dopt = NULL;
 834
 835         if (opt && opt->optlen) {
 836                 int opt_size = sizeof(*dopt) + opt->optlen;
 837
 838                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 839                 if (dopt) {
 840                         if (ip_options_echo(&dopt->opt, skb)) {
 841                                 kfree(dopt);
 842                                 dopt = NULL;
 843                         }
 844                 }
 845         }
 846         return dopt;
 847 }
 848
 849 #ifdef CONFIG_TCP_MD5SIG
 850 /*
 851  * RFC2385 MD5 checksumming requires a mapping of
 852  * IP address->MD5 Key.
 853  * We need to maintain these in the sk structure.
 854  */
 855
 856 /* Find the Key structure for an address.  */
 857 static struct tcp_md5sig_key *
 858                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 859 {
 860         struct tcp_sock *tp = tcp_sk(sk);
 861         int i;
 862
 863         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 864                 return NULL;
 865         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 866                 if (tp->md5sig_info->keys4[i].addr == addr)
 867                         return &tp->md5sig_info->keys4[i].base;
 868         }
 869         return NULL;
 870 }
 871
 872 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 873                                          struct sock *addr_sk)
 874 {
 875         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
 876 }
 877 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 878
 879 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 880                                                       struct request_sock *req)
 881 {
 882         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 883 }
 884
 885 /* This can be called on a newly created socket, from other files */
 886 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 887                       u8 *newkey, u8 newkeylen)
 888 {
 889         /* Add Key to the list */
 890         struct tcp_md5sig_key *key;
 891         struct tcp_sock *tp = tcp_sk(sk);
 892         struct tcp4_md5sig_key *keys;
 893
 894         key = tcp_v4_md5_do_lookup(sk, addr);
 895         if (key) {
 896                 /* Pre-existing entry - just update that one. */
 897                 kfree(key->key);
 898                 key->key = newkey;
 899                 key->keylen = newkeylen;
 900         } else {
 901                 struct tcp_md5sig_info *md5sig;
 902
 903                 if (!tp->md5sig_info) {
 904                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 905                                                   GFP_ATOMIC);
 906                         if (!tp->md5sig_info) {
 907                                 kfree(newkey);
 908                                 return -ENOMEM;
 909                         }
 910                         sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 911                 }
 912                 if (tcp_alloc_md5sig_pool(sk) == NULL) {
 913                         kfree(newkey);
 914                         return -ENOMEM;
 915                 }
 916                 md5sig = tp->md5sig_info;
 917
 918                 if (md5sig->alloced4 == md5sig->entries4) {
 919                         keys = kmalloc((sizeof(*keys) *
 920                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
 921                         if (!keys) {
 922                                 kfree(newkey);
 923                                 tcp_free_md5sig_pool();
 924                                 return -ENOMEM;
 925                         }
 926
 927                         if (md5sig->entries4)
 928                                 memcpy(keys, md5sig->keys4,
 929                                        sizeof(*keys) * md5sig->entries4);
 930
 931                         /* Free old key list, and reference new one */
 932                         kfree(md5sig->keys4);
 933                         md5sig->keys4 = keys;
 934                         md5sig->alloced4++;
 935                 }
 936                 md5sig->entries4++;
 937                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 938                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 939                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 940         }
 941         return 0;
 942 }
 943 EXPORT_SYMBOL(tcp_v4_md5_do_add);
 944
 945 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 946                                u8 *newkey, u8 newkeylen)
 947 {
 948         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
 949                                  newkey, newkeylen);
 950 }
 951
 952 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 953 {
 954         struct tcp_sock *tp = tcp_sk(sk);
 955         int i;
 956
 957         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 958                 if (tp->md5sig_info->keys4[i].addr == addr) {
 959                         /* Free the key */
 960                         kfree(tp->md5sig_info->keys4[i].base.key);
 961                         tp->md5sig_info->entries4--;
 962
 963                         if (tp->md5sig_info->entries4 == 0) {
 964                                 kfree(tp->md5sig_info->keys4);
 965                                 tp->md5sig_info->keys4 = NULL;
 966                                 tp->md5sig_info->alloced4 = 0;
 967                         } else if (tp->md5sig_info->entries4 != i) {
 968                                 /* Need to do some manipulation */
 969                                 memmove(&tp->md5sig_info->keys4[i],
 970                                         &tp->md5sig_info->keys4[i+1],
 971                                         (tp->md5sig_info->entries4 - i) *
 972                                          sizeof(struct tcp4_md5sig_key));
 973                         }
 974                         tcp_free_md5sig_pool();
 975                         return 0;
 976                 }
 977         }
 978         return -ENOENT;
 979 }
 980 EXPORT_SYMBOL(tcp_v4_md5_do_del);
 981
 982 static void tcp_v4_clear_md5_list(struct sock *sk)
 983 {
 984         struct tcp_sock *tp = tcp_sk(sk);
 985
 986         /* Free each key, then the set of key keys,
 987          * the crypto element, and then decrement our
 988          * hold on the last resort crypto.
 989          */
 990         if (tp->md5sig_info->entries4) {
 991                 int i;
 992                 for (i = 0; i < tp->md5sig_info->entries4; i++)
 993                         kfree(tp->md5sig_info->keys4[i].base.key);
 994                 tp->md5sig_info->entries4 = 0;
 995                 tcp_free_md5sig_pool();
 996         }
 997         if (tp->md5sig_info->keys4) {
 998                 kfree(tp->md5sig_info->keys4);
 999                 tp->md5sig_info->keys4 = NULL;
1000                 tp->md5sig_info->alloced4  = 0;
1001         }
1002 }
1003
1004 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1005                                  int optlen)
1006 {
1007         struct tcp_md5sig cmd;
1008         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1009         u8 *newkey;
1010
1011         if (optlen < sizeof(cmd))
1012                 return -EINVAL;
1013
1014         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1015                 return -EFAULT;
1016
1017         if (sin->sin_family != AF_INET)
1018                 return -EINVAL;
1019
1020         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1021                 if (!tcp_sk(sk)->md5sig_info)
1022                         return -ENOENT;
1023                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1024         }
1025
1026         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1027                 return -EINVAL;
1028
1029         if (!tcp_sk(sk)->md5sig_info) {
1030                 struct tcp_sock *tp = tcp_sk(sk);
1031                 struct tcp_md5sig_info *p;
1032
1033                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1034                 if (!p)
1035                         return -EINVAL;
1036
1037                 tp->md5sig_info = p;
1038                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1039         }
1040
1041         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1042         if (!newkey)
1043                 return -ENOMEM;
1044         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1045                                  newkey, cmd.tcpm_keylen);
1046 }
1047
1048 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1049                                         __be32 daddr, __be32 saddr, int nbytes)
1050 {
1051         struct tcp4_pseudohdr *bp;
1052         struct scatterlist sg;
1053
1054         bp = &hp->md5_blk.ip4;
1055
1056         /*
1057          * 1. the TCP pseudo-header (in the order: source IP address,
1058          * destination IP address, zero-padded protocol number, and
1059          * segment length)
1060          */
1061         bp->saddr = saddr;
1062         bp->daddr = daddr;
1063         bp->pad = 0;
1064         bp->protocol = IPPROTO_TCP;
1065         bp->len = cpu_to_be16(nbytes);
1066
1067         sg_init_one(&sg, bp, sizeof(*bp));
1068         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1069 }
1070
1071 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1072                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1073 {
1074         struct tcp_md5sig_pool *hp;
1075         struct hash_desc *desc;
1076
1077         hp = tcp_get_md5sig_pool();
1078         if (!hp)
1079                 goto clear_hash_noput;
1080         desc = &hp->md5_desc;
1081
1082         if (crypto_hash_init(desc))
1083                 goto clear_hash;
1084         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1085                 goto clear_hash;
1086         if (tcp_md5_hash_header(hp, th))
1087                 goto clear_hash;
1088         if (tcp_md5_hash_key(hp, key))
1089                 goto clear_hash;
1090         if (crypto_hash_final(desc, md5_hash))
1091                 goto clear_hash;
1092
1093         tcp_put_md5sig_pool();
1094         return 0;
1095
1096 clear_hash:
1097         tcp_put_md5sig_pool();
1098 clear_hash_noput:
1099         memset(md5_hash, 0, 16);
1100         return 1;
1101 }
1102
1103 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1104                         struct sock *sk, struct request_sock *req,
1105                         struct sk_buff *skb)
1106 {
1107         struct tcp_md5sig_pool *hp;
1108         struct hash_desc *desc;
1109         struct tcphdr *th = tcp_hdr(skb);
1110         __be32 saddr, daddr;
1111
1112         if (sk) {
1113                 saddr = inet_sk(sk)->inet_saddr;
1114                 daddr = inet_sk(sk)->inet_daddr;
1115         } else if (req) {
1116                 saddr = inet_rsk(req)->loc_addr;
1117                 daddr = inet_rsk(req)->rmt_addr;
1118         } else {
1119                 const struct iphdr *iph = ip_hdr(skb);
1120                 saddr = iph->saddr;
1121                 daddr = iph->daddr;
1122         }
1123
1124         hp = tcp_get_md5sig_pool();
1125         if (!hp)
1126                 goto clear_hash_noput;
1127         desc = &hp->md5_desc;
1128
1129         if (crypto_hash_init(desc))
1130                 goto clear_hash;
1131
1132         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1133                 goto clear_hash;
1134         if (tcp_md5_hash_header(hp, th))
1135                 goto clear_hash;
1136         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1137                 goto clear_hash;
1138         if (tcp_md5_hash_key(hp, key))
1139                 goto clear_hash;
1140         if (crypto_hash_final(desc, md5_hash))
1141                 goto clear_hash;
1142
1143         tcp_put_md5sig_pool();
1144         return 0;
1145
1146 clear_hash:
1147         tcp_put_md5sig_pool();
1148 clear_hash_noput:
1149         memset(md5_hash, 0, 16);
1150         return 1;
1151 }
1152 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1153
1154 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1155 {
1156         /*
1157          * This gets called for each TCP segment that arrives
1158          * so we want to be efficient.
1159          * We have 3 drop cases:
1160          * o No MD5 hash and one expected.
1161          * o MD5 hash and we're not expecting one.
1162          * o MD5 hash and its wrong.
1163          */
1164         __u8 *hash_location = NULL;
1165         struct tcp_md5sig_key *hash_expected;
1166         const struct iphdr *iph = ip_hdr(skb);
1167         struct tcphdr *th = tcp_hdr(skb);
1168         int genhash;
1169         unsigned char newhash[16];
1170
1171         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1172         hash_location = tcp_parse_md5sig_option(th);
1173
1174         /* We've parsed the options - do we have a hash? */
1175         if (!hash_expected && !hash_location)
1176                 return 0;
1177
1178         if (hash_expected && !hash_location) {
1179                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1180                 return 1;
1181         }
1182
1183         if (!hash_expected && hash_location) {
1184                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1185                 return 1;
1186         }
1187
1188         /* Okay, so this is hash_expected and hash_location -
1189          * so we need to calculate the checksum.
1190          */
1191         genhash = tcp_v4_md5_hash_skb(newhash,
1192                                       hash_expected,
1193                                       NULL, NULL, skb);
1194
1195         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1196                 if (net_ratelimit()) {
1197                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1198                                &iph->saddr, ntohs(th->source),
1199                                &iph->daddr, ntohs(th->dest),
1200                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1201                 }
1202                 return 1;
1203         }
1204         return 0;
1205 }
1206
1207 #endif
1208
1209 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1210         .family         =       PF_INET,
1211         .obj_size       =       sizeof(struct tcp_request_sock),
1212         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1213         .send_ack       =       tcp_v4_reqsk_send_ack,
1214         .destructor     =       tcp_v4_reqsk_destructor,
1215         .send_reset     =       tcp_v4_send_reset,
1216         .syn_ack_timeout =      tcp_syn_ack_timeout,
1217 };
1218
1219 #ifdef CONFIG_TCP_MD5SIG
1220 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1221         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1222         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1223 };
1224 #endif
1225
1226 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1227 {
1228         struct tcp_extend_values tmp_ext;
1229         struct tcp_options_received tmp_opt;
1230         u8 *hash_location;
1231         struct request_sock *req;
1232         struct inet_request_sock *ireq;
1233         struct tcp_sock *tp = tcp_sk(sk);
1234         struct dst_entry *dst = NULL;
1235         __be32 saddr = ip_hdr(skb)->saddr;
1236         __be32 daddr = ip_hdr(skb)->daddr;
1237         __u32 isn = TCP_SKB_CB(skb)->when;
1238 #ifdef CONFIG_SYN_COOKIES
1239         int want_cookie = 0;
1240 #else
1241 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1242 #endif
1243
1244         /* Never answer to SYNs send to broadcast or multicast */
1245         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1246                 goto drop;
1247
1248         /* TW buckets are converted to open requests without
1249          * limitations, they conserve resources and peer is
1250          * evidently real one.
1251          */
1252         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1253                 if (net_ratelimit())
1254                         syn_flood_warning(skb);
1255 #ifdef CONFIG_SYN_COOKIES
1256                 if (sysctl_tcp_syncookies) {
1257                         want_cookie = 1;
1258                 } else
1259 #endif
1260                 goto drop;
1261         }
1262
1263         /* Accept backlog is full. If we have already queued enough
1264          * of warm entries in syn queue, drop request. It is better than
1265          * clogging syn queue with openreqs with exponentially increasing
1266          * timeout.
1267          */
1268         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1269                 goto drop;
1270
1271         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1272         if (!req)
1273                 goto drop;
1274
1275 #ifdef CONFIG_TCP_MD5SIG
1276         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1277 #endif
1278
1279         tcp_clear_options(&tmp_opt);
1280         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1281         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1282         tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1283
1284         if (tmp_opt.cookie_plus > 0 &&
1285             tmp_opt.saw_tstamp &&
1286             !tp->rx_opt.cookie_out_never &&
1287             (sysctl_tcp_cookie_size > 0 ||
1288              (tp->cookie_values != NULL &&
1289               tp->cookie_values->cookie_desired > 0))) {
1290                 u8 *c;
1291                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1292                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1293
1294                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1295                         goto drop_and_release;
1296
1297                 /* Secret recipe starts with IP addresses */
1298                 *mess++ ^= (__force u32)daddr;
1299                 *mess++ ^= (__force u32)saddr;
1300
1301                 /* plus variable length Initiator Cookie */
1302                 c = (u8 *)mess;
1303                 while (l-- > 0)
1304                         *c++ ^= *hash_location++;
1305
1306 #ifdef CONFIG_SYN_COOKIES
1307                 want_cookie = 0;        /* not our kind of cookie */
1308 #endif
1309                 tmp_ext.cookie_out_never = 0; /* false */
1310                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1311         } else if (!tp->rx_opt.cookie_in_always) {
1312                 /* redundant indications, but ensure initialization. */
1313                 tmp_ext.cookie_out_never = 1; /* true */
1314                 tmp_ext.cookie_plus = 0;
1315         } else {
1316                 goto drop_and_release;
1317         }
1318         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1319
1320         if (want_cookie && !tmp_opt.saw_tstamp)
1321                 tcp_clear_options(&tmp_opt);
1322
1323         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1324         tcp_openreq_init(req, &tmp_opt, skb);
1325
1326         ireq = inet_rsk(req);
1327         ireq->loc_addr = daddr;
1328         ireq->rmt_addr = saddr;
1329         ireq->no_srccheck = inet_sk(sk)->transparent;
1330         ireq->opt = tcp_v4_save_options(sk, skb);
1331
1332         if (security_inet_conn_request(sk, skb, req))
1333                 goto drop_and_free;
1334
1335         if (!want_cookie || tmp_opt.tstamp_ok)
1336                 TCP_ECN_create_request(req, tcp_hdr(skb));
1337
1338         if (want_cookie) {
1339                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1340                 req->cookie_ts = tmp_opt.tstamp_ok;
1341         } else if (!isn) {
1342                 struct inet_peer *peer = NULL;
1343                 struct flowi4 fl4;
1344
1345                 /* VJ's idea. We save last timestamp seen
1346                  * from the destination in peer table, when entering
1347                  * state TIME-WAIT, and check against it before
1348                  * accepting new connection request.
1349                  *
1350                  * If "isn" is not zero, this request hit alive
1351                  * timewait bucket, so that all the necessary checks
1352                  * are made in the function processing timewait state.
1353                  */
1354                 if (tmp_opt.saw_tstamp &&
1355                     tcp_death_row.sysctl_tw_recycle &&
1356                     (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1357                     fl4.daddr == saddr &&
1358                     (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1359                         inet_peer_refcheck(peer);
1360                         if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1361                             (s32)(peer->tcp_ts - req->ts_recent) >
1362                                                         TCP_PAWS_WINDOW) {
1363                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1364                                 goto drop_and_release;
1365                         }
1366                 }
1367                 /* Kill the following clause, if you dislike this way. */
1368                 else if (!sysctl_tcp_syncookies &&
1369                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1370                           (sysctl_max_syn_backlog >> 2)) &&
1371                          (!peer || !peer->tcp_ts_stamp) &&
1372                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1373                         /* Without syncookies last quarter of
1374                          * backlog is filled with destinations,
1375                          * proven to be alive.
1376                          * It means that we continue to communicate
1377                          * to destinations, already remembered
1378                          * to the moment of synflood.
1379                          */
1380                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1381                                        &saddr, ntohs(tcp_hdr(skb)->source));
1382                         goto drop_and_release;
1383                 }
1384
1385                 isn = tcp_v4_init_sequence(skb);
1386         }
1387         tcp_rsk(req)->snt_isn = isn;
1388
1389         if (tcp_v4_send_synack(sk, dst, req,
1390                                (struct request_values *)&tmp_ext) ||
1391             want_cookie)
1392                 goto drop_and_free;
1393
1394         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1395         return 0;
1396
1397 drop_and_release:
1398         dst_release(dst);
1399 drop_and_free:
1400         reqsk_free(req);
1401 drop:
1402         return 0;
1403 }
1404 EXPORT_SYMBOL(tcp_v4_conn_request);
1405
1406
1407 /*
1408  * The three way handshake has completed - we got a valid synack -
1409  * now create the new socket.
1410  */
1411 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1412                                   struct request_sock *req,
1413                                   struct dst_entry *dst)
1414 {
1415         struct inet_request_sock *ireq;
1416         struct inet_sock *newinet;
1417         struct tcp_sock *newtp;
1418         struct sock *newsk;
1419 #ifdef CONFIG_TCP_MD5SIG
1420         struct tcp_md5sig_key *key;
1421 #endif
1422         struct ip_options_rcu *inet_opt;
1423
1424         if (sk_acceptq_is_full(sk))
1425                 goto exit_overflow;
1426
1427         newsk = tcp_create_openreq_child(sk, req, skb);
1428         if (!newsk)
1429                 goto exit_nonewsk;
1430
1431         newsk->sk_gso_type = SKB_GSO_TCPV4;
1432
1433         newtp                 = tcp_sk(newsk);
1434         newinet               = inet_sk(newsk);
1435         ireq                  = inet_rsk(req);
1436         newinet->inet_daddr   = ireq->rmt_addr;
1437         newinet->inet_rcv_saddr = ireq->loc_addr;
1438         newinet->inet_saddr           = ireq->loc_addr;
1439         inet_opt              = ireq->opt;
1440         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1441         ireq->opt             = NULL;
1442         newinet->mc_index     = inet_iif(skb);
1443         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1444         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1445         if (inet_opt)
1446                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1447         newinet->inet_id = newtp->write_seq ^ jiffies;
1448
1449         if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1450                 goto put_and_exit;
1451
1452         sk_setup_caps(newsk, dst);
1453
1454         tcp_mtup_init(newsk);
1455         tcp_sync_mss(newsk, dst_mtu(dst));
1456         newtp->advmss = dst_metric_advmss(dst);
1457         if (tcp_sk(sk)->rx_opt.user_mss &&
1458             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1459                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1460
1461         tcp_initialize_rcv_mss(newsk);
1462
1463 #ifdef CONFIG_TCP_MD5SIG
1464         /* Copy over the MD5 key from the original socket */
1465         key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1466         if (key != NULL) {
1467                 /*
1468                  * We're using one, so create a matching key
1469                  * on the newsk structure. If we fail to get
1470                  * memory, then we end up not copying the key
1471                  * across. Shucks.
1472                  */
1473                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1474                 if (newkey != NULL)
1475                         tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1476                                           newkey, key->keylen);
1477                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1478         }
1479 #endif
1480
1481         if (__inet_inherit_port(sk, newsk) < 0)
1482                 goto put_and_exit;
1483         __inet_hash_nolisten(newsk, NULL);
1484
1485         return newsk;
1486
1487 exit_overflow:
1488         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1489 exit_nonewsk:
1490         dst_release(dst);
1491 exit:
1492         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1493         return NULL;
1494 put_and_exit:
1495         sock_put(newsk);
1496         goto exit;
1497 }
1498 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1499
1500 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1501 {
1502         struct tcphdr *th = tcp_hdr(skb);
1503         const struct iphdr *iph = ip_hdr(skb);
1504         struct sock *nsk;
1505         struct request_sock **prev;
1506         /* Find possible connection requests. */
1507         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1508                                                        iph->saddr, iph->daddr);
1509         if (req)
1510                 return tcp_check_req(sk, skb, req, prev);
1511
1512         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1513                         th->source, iph->daddr, th->dest, inet_iif(skb));
1514
1515         if (nsk) {
1516                 if (nsk->sk_state != TCP_TIME_WAIT) {
1517                         bh_lock_sock(nsk);
1518                         return nsk;
1519                 }
1520                 inet_twsk_put(inet_twsk(nsk));
1521                 return NULL;
1522         }
1523
1524 #ifdef CONFIG_SYN_COOKIES
1525         if (!th->syn)
1526                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1527 #endif
1528         return sk;
1529 }
1530
1531 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1532 {
1533         const struct iphdr *iph = ip_hdr(skb);
1534
1535         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1536                 if (!tcp_v4_check(skb->len, iph->saddr,
1537                                   iph->daddr, skb->csum)) {
1538                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1539                         return 0;
1540                 }
1541         }
1542
1543         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1544                                        skb->len, IPPROTO_TCP, 0);
1545
1546         if (skb->len <= 76) {
1547                 return __skb_checksum_complete(skb);
1548         }
1549         return 0;
1550 }
1551
1552
1553 /* The socket must have it's spinlock held when we get
1554  * here.
1555  *
1556  * We have a potential double-lock case here, so even when
1557  * doing backlog processing we use the BH locking scheme.
1558  * This is because we cannot sleep with the original spinlock
1559  * held.
1560  */
1561 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1562 {
1563         struct sock *rsk;
1564 #ifdef CONFIG_TCP_MD5SIG
1565         /*
1566          * We really want to reject the packet as early as possible
1567          * if:
1568          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1569          *  o There is an MD5 option and we're not expecting one
1570          */
1571         if (tcp_v4_inbound_md5_hash(sk, skb))
1572                 goto discard;
1573 #endif
1574
1575         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1576                 sock_rps_save_rxhash(sk, skb->rxhash);
1577                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1578                         rsk = sk;
1579                         goto reset;
1580                 }
1581                 return 0;
1582         }
1583
1584         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1585                 goto csum_err;
1586
1587         if (sk->sk_state == TCP_LISTEN) {
1588                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1589                 if (!nsk)
1590                         goto discard;
1591
1592                 if (nsk != sk) {
1593                         sock_rps_save_rxhash(nsk, skb->rxhash);
1594                         if (tcp_child_process(sk, nsk, skb)) {
1595                                 rsk = nsk;
1596                                 goto reset;
1597                         }
1598                         return 0;
1599                 }
1600         } else
1601                 sock_rps_save_rxhash(sk, skb->rxhash);
1602
1603         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1604                 rsk = sk;
1605                 goto reset;
1606         }
1607         return 0;
1608
1609 reset:
1610         tcp_v4_send_reset(rsk, skb);
1611 discard:
1612         kfree_skb(skb);
1613         /* Be careful here. If this function gets more complicated and
1614          * gcc suffers from register pressure on the x86, sk (in %ebx)
1615          * might be destroyed here. This current version compiles correctly,
1616          * but you have been warned.
1617          */
1618         return 0;
1619
1620 csum_err:
1621         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1622         goto discard;
1623 }
1624 EXPORT_SYMBOL(tcp_v4_do_rcv);
1625
1626 /*
1627  *      From tcp_input.c
1628  */
1629
1630 int tcp_v4_rcv(struct sk_buff *skb)
1631 {
1632         const struct iphdr *iph;
1633         struct tcphdr *th;
1634         struct sock *sk;
1635         int ret;
1636         struct net *net = dev_net(skb->dev);
1637
1638         if (skb->pkt_type != PACKET_HOST)
1639                 goto discard_it;
1640
1641         /* Count it even if it's bad */
1642         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1643
1644         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1645                 goto discard_it;
1646
1647         th = tcp_hdr(skb);
1648
1649         if (th->doff < sizeof(struct tcphdr) / 4)
1650                 goto bad_packet;
1651         if (!pskb_may_pull(skb, th->doff * 4))
1652                 goto discard_it;
1653
1654         /* An explanation is required here, I think.
1655          * Packet length and doff are validated by header prediction,
1656          * provided case of th->doff==0 is eliminated.
1657          * So, we defer the checks. */
1658         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1659                 goto bad_packet;
1660
1661         th = tcp_hdr(skb);
1662         iph = ip_hdr(skb);
1663         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1664         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1665                                     skb->len - th->doff * 4);
1666         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1667         TCP_SKB_CB(skb)->when    = 0;
1668         TCP_SKB_CB(skb)->flags   = iph->tos;
1669         TCP_SKB_CB(skb)->sacked  = 0;
1670
1671         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1672         if (!sk)
1673                 goto no_tcp_socket;
1674
1675 process:
1676         if (sk->sk_state == TCP_TIME_WAIT)
1677                 goto do_time_wait;
1678
1679         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1680                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1681                 goto discard_and_relse;
1682         }
1683
1684         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1685                 goto discard_and_relse;
1686         nf_reset(skb);
1687
1688         if (sk_filter(sk, skb))
1689                 goto discard_and_relse;
1690
1691         skb->dev = NULL;
1692
1693         bh_lock_sock_nested(sk);
1694         ret = 0;
1695         if (!sock_owned_by_user(sk)) {
1696 #ifdef CONFIG_NET_DMA
1697                 struct tcp_sock *tp = tcp_sk(sk);
1698                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1699                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1700                 if (tp->ucopy.dma_chan)
1701                         ret = tcp_v4_do_rcv(sk, skb);
1702                 else
1703 #endif
1704                 {
1705                         if (!tcp_prequeue(sk, skb))
1706                                 ret = tcp_v4_do_rcv(sk, skb);
1707                 }
1708         } else if (unlikely(sk_add_backlog(sk, skb))) {
1709                 bh_unlock_sock(sk);
1710                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1711                 goto discard_and_relse;
1712         }
1713         bh_unlock_sock(sk);
1714
1715         sock_put(sk);
1716
1717         return ret;
1718
1719 no_tcp_socket:
1720         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1721                 goto discard_it;
1722
1723         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1724 bad_packet:
1725                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1726         } else {
1727                 tcp_v4_send_reset(NULL, skb);
1728         }
1729
1730 discard_it:
1731         /* Discard frame. */
1732         kfree_skb(skb);
1733         return 0;
1734
1735 discard_and_relse:
1736         sock_put(sk);
1737         goto discard_it;
1738
1739 do_time_wait:
1740         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1741                 inet_twsk_put(inet_twsk(sk));
1742                 goto discard_it;
1743         }
1744
1745         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1746                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1747                 inet_twsk_put(inet_twsk(sk));
1748                 goto discard_it;
1749         }
1750         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1751         case TCP_TW_SYN: {
1752                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1753                                                         &tcp_hashinfo,
1754                                                         iph->daddr, th->dest,
1755                                                         inet_iif(skb));
1756                 if (sk2) {
1757                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1758                         inet_twsk_put(inet_twsk(sk));
1759                         sk = sk2;
1760                         goto process;
1761                 }
1762                 /* Fall through to ACK */
1763         }
1764         case TCP_TW_ACK:
1765                 tcp_v4_timewait_ack(sk, skb);
1766                 break;
1767         case TCP_TW_RST:
1768                 goto no_tcp_socket;
1769         case TCP_TW_SUCCESS:;
1770         }
1771         goto discard_it;
1772 }
1773
1774 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1775 {
1776         struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1777         struct inet_sock *inet = inet_sk(sk);
1778         struct inet_peer *peer;
1779
1780         if (!rt ||
1781             inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1782                 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1783                 *release_it = true;
1784         } else {
1785                 if (!rt->peer)
1786                         rt_bind_peer(rt, inet->inet_daddr, 1);
1787                 peer = rt->peer;
1788                 *release_it = false;
1789         }
1790
1791         return peer;
1792 }
1793 EXPORT_SYMBOL(tcp_v4_get_peer);
1794
1795 void *tcp_v4_tw_get_peer(struct sock *sk)
1796 {
1797         struct inet_timewait_sock *tw = inet_twsk(sk);
1798
1799         return inet_getpeer_v4(tw->tw_daddr, 1);
1800 }
1801 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1802
1803 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1804         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1805         .twsk_unique    = tcp_twsk_unique,
1806         .twsk_destructor= tcp_twsk_destructor,
1807         .twsk_getpeer   = tcp_v4_tw_get_peer,
1808 };
1809
1810 const struct inet_connection_sock_af_ops ipv4_specific = {
1811         .queue_xmit        = ip_queue_xmit,
1812         .send_check        = tcp_v4_send_check,
1813         .rebuild_header    = inet_sk_rebuild_header,
1814         .conn_request      = tcp_v4_conn_request,
1815         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1816         .get_peer          = tcp_v4_get_peer,
1817         .net_header_len    = sizeof(struct iphdr),
1818         .setsockopt        = ip_setsockopt,
1819         .getsockopt        = ip_getsockopt,
1820         .addr2sockaddr     = inet_csk_addr2sockaddr,
1821         .sockaddr_len      = sizeof(struct sockaddr_in),
1822         .bind_conflict     = inet_csk_bind_conflict,
1823 #ifdef CONFIG_COMPAT
1824         .compat_setsockopt = compat_ip_setsockopt,
1825         .compat_getsockopt = compat_ip_getsockopt,
1826 #endif
1827 };
1828 EXPORT_SYMBOL(ipv4_specific);
1829
1830 #ifdef CONFIG_TCP_MD5SIG
1831 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1832         .md5_lookup             = tcp_v4_md5_lookup,
1833         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1834         .md5_add                = tcp_v4_md5_add_func,
1835         .md5_parse              = tcp_v4_parse_md5_keys,
1836 };
1837 #endif
1838
1839 /* NOTE: A lot of things set to zero explicitly by call to
1840  *       sk_alloc() so need not be done here.
1841  */
1842 static int tcp_v4_init_sock(struct sock *sk)
1843 {
1844         struct inet_connection_sock *icsk = inet_csk(sk);
1845         struct tcp_sock *tp = tcp_sk(sk);
1846
1847         skb_queue_head_init(&tp->out_of_order_queue);
1848         tcp_init_xmit_timers(sk);
1849         tcp_prequeue_init(tp);
1850
1851         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1852         tp->mdev = TCP_TIMEOUT_INIT;
1853
1854         /* So many TCP implementations out there (incorrectly) count the
1855          * initial SYN frame in their delayed-ACK and congestion control
1856          * algorithms that we must have the following bandaid to talk
1857          * efficiently to them.  -DaveM
1858          */
1859         tp->snd_cwnd = 2;
1860
1861         /* See draft-stevens-tcpca-spec-01 for discussion of the
1862          * initialization of these values.
1863          */
1864         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1865         tp->snd_cwnd_clamp = ~0;
1866         tp->mss_cache = TCP_MSS_DEFAULT;
1867
1868         tp->reordering = sysctl_tcp_reordering;
1869         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1870
1871         sk->sk_state = TCP_CLOSE;
1872
1873         sk->sk_write_space = sk_stream_write_space;
1874         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1875
1876         icsk->icsk_af_ops = &ipv4_specific;
1877         icsk->icsk_sync_mss = tcp_sync_mss;
1878 #ifdef CONFIG_TCP_MD5SIG
1879         tp->af_specific = &tcp_sock_ipv4_specific;
1880 #endif
1881
1882         /* TCP Cookie Transactions */
1883         if (sysctl_tcp_cookie_size > 0) {
1884                 /* Default, cookies without s_data_payload. */
1885                 tp->cookie_values =
1886                         kzalloc(sizeof(*tp->cookie_values),
1887                                 sk->sk_allocation);
1888                 if (tp->cookie_values != NULL)
1889                         kref_init(&tp->cookie_values->kref);
1890         }
1891         /* Presumed zeroed, in order of appearance:
1892          *      cookie_in_always, cookie_out_never,
1893          *      s_data_constant, s_data_in, s_data_out
1894          */
1895         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1896         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1897
1898         local_bh_disable();
1899         percpu_counter_inc(&tcp_sockets_allocated);
1900         local_bh_enable();
1901
1902         return 0;
1903 }
1904
1905 void tcp_v4_destroy_sock(struct sock *sk)
1906 {
1907         struct tcp_sock *tp = tcp_sk(sk);
1908
1909         tcp_clear_xmit_timers(sk);
1910
1911         tcp_cleanup_congestion_control(sk);
1912
1913         /* Cleanup up the write buffer. */
1914         tcp_write_queue_purge(sk);
1915
1916         /* Cleans up our, hopefully empty, out_of_order_queue. */
1917         __skb_queue_purge(&tp->out_of_order_queue);
1918
1919 #ifdef CONFIG_TCP_MD5SIG
1920         /* Clean up the MD5 key list, if any */
1921         if (tp->md5sig_info) {
1922                 tcp_v4_clear_md5_list(sk);
1923                 kfree(tp->md5sig_info);
1924                 tp->md5sig_info = NULL;
1925         }
1926 #endif
1927
1928 #ifdef CONFIG_NET_DMA
1929         /* Cleans up our sk_async_wait_queue */
1930         __skb_queue_purge(&sk->sk_async_wait_queue);
1931 #endif
1932
1933         /* Clean prequeue, it must be empty really */
1934         __skb_queue_purge(&tp->ucopy.prequeue);
1935
1936         /* Clean up a referenced TCP bind bucket. */
1937         if (inet_csk(sk)->icsk_bind_hash)
1938                 inet_put_port(sk);
1939
1940         /*
1941          * If sendmsg cached page exists, toss it.
1942          */
1943         if (sk->sk_sndmsg_page) {
1944                 __free_page(sk->sk_sndmsg_page);
1945                 sk->sk_sndmsg_page = NULL;
1946         }
1947
1948         /* TCP Cookie Transactions */
1949         if (tp->cookie_values != NULL) {
1950                 kref_put(&tp->cookie_values->kref,
1951                          tcp_cookie_values_release);
1952                 tp->cookie_values = NULL;
1953         }
1954
1955         percpu_counter_dec(&tcp_sockets_allocated);
1956 }
1957 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1958
1959 #ifdef CONFIG_PROC_FS
1960 /* Proc filesystem TCP sock list dumping. */
1961
1962 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1963 {
1964         return hlist_nulls_empty(head) ? NULL :
1965                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1966 }
1967
1968 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1969 {
1970         return !is_a_nulls(tw->tw_node.next) ?
1971                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1972 }
1973
1974 /*
1975  * Get next listener socket follow cur.  If cur is NULL, get first socket
1976  * starting from bucket given in st->bucket; when st->bucket is zero the
1977  * very first socket in the hash table is returned.
1978  */
1979 static void *listening_get_next(struct seq_file *seq, void *cur)
1980 {
1981         struct inet_connection_sock *icsk;
1982         struct hlist_nulls_node *node;
1983         struct sock *sk = cur;
1984         struct inet_listen_hashbucket *ilb;
1985         struct tcp_iter_state *st = seq->private;
1986         struct net *net = seq_file_net(seq);
1987
1988         if (!sk) {
1989                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1990                 spin_lock_bh(&ilb->lock);
1991                 sk = sk_nulls_head(&ilb->head);
1992                 st->offset = 0;
1993                 goto get_sk;
1994         }
1995         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1996         ++st->num;
1997         ++st->offset;
1998
1999         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2000                 struct request_sock *req = cur;
2001
2002                 icsk = inet_csk(st->syn_wait_sk);
2003                 req = req->dl_next;
2004                 while (1) {
2005                         while (req) {
2006                                 if (req->rsk_ops->family == st->family) {
2007                                         cur = req;
2008                                         goto out;
2009                                 }
2010                                 req = req->dl_next;
2011                         }
2012                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2013                                 break;
2014 get_req:
2015                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2016                 }
2017                 sk        = sk_nulls_next(st->syn_wait_sk);
2018                 st->state = TCP_SEQ_STATE_LISTENING;
2019                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2020         } else {
2021                 icsk = inet_csk(sk);
2022                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2023                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2024                         goto start_req;
2025                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2026                 sk = sk_nulls_next(sk);
2027         }
2028 get_sk:
2029         sk_nulls_for_each_from(sk, node) {
2030                 if (!net_eq(sock_net(sk), net))
2031                         continue;
2032                 if (sk->sk_family == st->family) {
2033                         cur = sk;
2034                         goto out;
2035                 }
2036                 icsk = inet_csk(sk);
2037                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2038                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2039 start_req:
2040                         st->uid         = sock_i_uid(sk);
2041                         st->syn_wait_sk = sk;
2042                         st->state       = TCP_SEQ_STATE_OPENREQ;
2043                         st->sbucket     = 0;
2044                         goto get_req;
2045                 }
2046                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2047         }
2048         spin_unlock_bh(&ilb->lock);
2049         st->offset = 0;
2050         if (++st->bucket < INET_LHTABLE_SIZE) {
2051                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2052                 spin_lock_bh(&ilb->lock);
2053                 sk = sk_nulls_head(&ilb->head);
2054                 goto get_sk;
2055         }
2056         cur = NULL;
2057 out:
2058         return cur;
2059 }
2060
2061 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2062 {
2063         struct tcp_iter_state *st = seq->private;
2064         void *rc;
2065
2066         st->bucket = 0;
2067         st->offset = 0;
2068         rc = listening_get_next(seq, NULL);
2069
2070         while (rc && *pos) {
2071                 rc = listening_get_next(seq, rc);
2072                 --*pos;
2073         }
2074         return rc;
2075 }
2076
2077 static inline int empty_bucket(struct tcp_iter_state *st)
2078 {
2079         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2080                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2081 }
2082
2083 /*
2084  * Get first established socket starting from bucket given in st->bucket.
2085  * If st->bucket is zero, the very first socket in the hash is returned.
2086  */
2087 static void *established_get_first(struct seq_file *seq)
2088 {
2089         struct tcp_iter_state *st = seq->private;
2090         struct net *net = seq_file_net(seq);
2091         void *rc = NULL;
2092
2093         st->offset = 0;
2094         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2095                 struct sock *sk;
2096                 struct hlist_nulls_node *node;
2097                 struct inet_timewait_sock *tw;
2098                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2099
2100                 /* Lockless fast path for the common case of empty buckets */
2101                 if (empty_bucket(st))
2102                         continue;
2103
2104                 spin_lock_bh(lock);
2105                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2106                         if (sk->sk_family != st->family ||
2107                             !net_eq(sock_net(sk), net)) {
2108                                 continue;
2109                         }
2110                         rc = sk;
2111                         goto out;
2112                 }
2113                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2114                 inet_twsk_for_each(tw, node,
2115                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2116                         if (tw->tw_family != st->family ||
2117                             !net_eq(twsk_net(tw), net)) {
2118                                 continue;
2119                         }
2120                         rc = tw;
2121                         goto out;
2122                 }
2123                 spin_unlock_bh(lock);
2124                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2125         }
2126 out:
2127         return rc;
2128 }
2129
2130 static void *established_get_next(struct seq_file *seq, void *cur)
2131 {
2132         struct sock *sk = cur;
2133         struct inet_timewait_sock *tw;
2134         struct hlist_nulls_node *node;
2135         struct tcp_iter_state *st = seq->private;
2136         struct net *net = seq_file_net(seq);
2137
2138         ++st->num;
2139         ++st->offset;
2140
2141         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2142                 tw = cur;
2143                 tw = tw_next(tw);
2144 get_tw:
2145                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2146                         tw = tw_next(tw);
2147                 }
2148                 if (tw) {
2149                         cur = tw;
2150                         goto out;
2151                 }
2152                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2153                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2154
2155                 /* Look for next non empty bucket */
2156                 st->offset = 0;
2157                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2158                                 empty_bucket(st))
2159                         ;
2160                 if (st->bucket > tcp_hashinfo.ehash_mask)
2161                         return NULL;
2162
2163                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2164                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2165         } else
2166                 sk = sk_nulls_next(sk);
2167
2168         sk_nulls_for_each_from(sk, node) {
2169                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2170                         goto found;
2171         }
2172
2173         st->state = TCP_SEQ_STATE_TIME_WAIT;
2174         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2175         goto get_tw;
2176 found:
2177         cur = sk;
2178 out:
2179         return cur;
2180 }
2181
2182 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2183 {
2184         struct tcp_iter_state *st = seq->private;
2185         void *rc;
2186
2187         st->bucket = 0;
2188         rc = established_get_first(seq);
2189
2190         while (rc && pos) {
2191                 rc = established_get_next(seq, rc);
2192                 --pos;
2193         }
2194         return rc;
2195 }
2196
2197 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2198 {
2199         void *rc;
2200         struct tcp_iter_state *st = seq->private;
2201
2202         st->state = TCP_SEQ_STATE_LISTENING;
2203         rc        = listening_get_idx(seq, &pos);
2204
2205         if (!rc) {
2206                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2207                 rc        = established_get_idx(seq, pos);
2208         }
2209
2210         return rc;
2211 }
2212
2213 static void *tcp_seek_last_pos(struct seq_file *seq)
2214 {
2215         struct tcp_iter_state *st = seq->private;
2216         int offset = st->offset;
2217         int orig_num = st->num;
2218         void *rc = NULL;
2219
2220         switch (st->state) {
2221         case TCP_SEQ_STATE_OPENREQ:
2222         case TCP_SEQ_STATE_LISTENING:
2223                 if (st->bucket >= INET_LHTABLE_SIZE)
2224                         break;
2225                 st->state = TCP_SEQ_STATE_LISTENING;
2226                 rc = listening_get_next(seq, NULL);
2227                 while (offset-- && rc)
2228                         rc = listening_get_next(seq, rc);
2229                 if (rc)
2230                         break;
2231                 st->bucket = 0;
2232                 /* Fallthrough */
2233         case TCP_SEQ_STATE_ESTABLISHED:
2234         case TCP_SEQ_STATE_TIME_WAIT:
2235                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2236                 if (st->bucket > tcp_hashinfo.ehash_mask)
2237                         break;
2238                 rc = established_get_first(seq);
2239                 while (offset-- && rc)
2240                         rc = established_get_next(seq, rc);
2241         }
2242
2243         st->num = orig_num;
2244
2245         return rc;
2246 }
2247
2248 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2249 {
2250         struct tcp_iter_state *st = seq->private;
2251         void *rc;
2252
2253         if (*pos && *pos == st->last_pos) {
2254                 rc = tcp_seek_last_pos(seq);
2255                 if (rc)
2256                         goto out;
2257         }
2258
2259         st->state = TCP_SEQ_STATE_LISTENING;
2260         st->num = 0;
2261         st->bucket = 0;
2262         st->offset = 0;
2263         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2264
2265 out:
2266         st->last_pos = *pos;
2267         return rc;
2268 }
2269
2270 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2271 {
2272         struct tcp_iter_state *st = seq->private;
2273         void *rc = NULL;
2274
2275         if (v == SEQ_START_TOKEN) {
2276                 rc = tcp_get_idx(seq, 0);
2277                 goto out;
2278         }
2279
2280         switch (st->state) {
2281         case TCP_SEQ_STATE_OPENREQ:
2282         case TCP_SEQ_STATE_LISTENING:
2283                 rc = listening_get_next(seq, v);
2284                 if (!rc) {
2285                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2286                         st->bucket = 0;
2287                         st->offset = 0;
2288                         rc        = established_get_first(seq);
2289                 }
2290                 break;
2291         case TCP_SEQ_STATE_ESTABLISHED:
2292         case TCP_SEQ_STATE_TIME_WAIT:
2293                 rc = established_get_next(seq, v);
2294                 break;
2295         }
2296 out:
2297         ++*pos;
2298         st->last_pos = *pos;
2299         return rc;
2300 }
2301
2302 static void tcp_seq_stop(struct seq_file *seq, void *v)
2303 {
2304         struct tcp_iter_state *st = seq->private;
2305
2306         switch (st->state) {
2307         case TCP_SEQ_STATE_OPENREQ:
2308                 if (v) {
2309                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2310                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2311                 }
2312         case TCP_SEQ_STATE_LISTENING:
2313                 if (v != SEQ_START_TOKEN)
2314                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2315                 break;
2316         case TCP_SEQ_STATE_TIME_WAIT:
2317         case TCP_SEQ_STATE_ESTABLISHED:
2318                 if (v)
2319                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2320                 break;
2321         }
2322 }
2323
2324 static int tcp_seq_open(struct inode *inode, struct file *file)
2325 {
2326         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2327         struct tcp_iter_state *s;
2328         int err;
2329
2330         err = seq_open_net(inode, file, &afinfo->seq_ops,
2331                           sizeof(struct tcp_iter_state));
2332         if (err < 0)
2333                 return err;
2334
2335         s = ((struct seq_file *)file->private_data)->private;
2336         s->family               = afinfo->family;
2337         s->last_pos             = 0;
2338         return 0;
2339 }
2340
2341 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2342 {
2343         int rc = 0;
2344         struct proc_dir_entry *p;
2345
2346         afinfo->seq_fops.open           = tcp_seq_open;
2347         afinfo->seq_fops.read           = seq_read;
2348         afinfo->seq_fops.llseek         = seq_lseek;
2349         afinfo->seq_fops.release        = seq_release_net;
2350
2351         afinfo->seq_ops.start           = tcp_seq_start;
2352         afinfo->seq_ops.next            = tcp_seq_next;
2353         afinfo->seq_ops.stop            = tcp_seq_stop;
2354
2355         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2356                              &afinfo->seq_fops, afinfo);
2357         if (!p)
2358                 rc = -ENOMEM;
2359         return rc;
2360 }
2361 EXPORT_SYMBOL(tcp_proc_register);
2362
2363 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2364 {
2365         proc_net_remove(net, afinfo->name);
2366 }
2367 EXPORT_SYMBOL(tcp_proc_unregister);
2368
2369 static void get_openreq4(struct sock *sk, struct request_sock *req,
2370                          struct seq_file *f, int i, int uid, int *len)
2371 {
2372         const struct inet_request_sock *ireq = inet_rsk(req);
2373         int ttd = req->expires - jiffies;
2374
2375         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2376                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2377                 i,
2378                 ireq->loc_addr,
2379                 ntohs(inet_sk(sk)->inet_sport),
2380                 ireq->rmt_addr,
2381                 ntohs(ireq->rmt_port),
2382                 TCP_SYN_RECV,
2383                 0, 0, /* could print option size, but that is af dependent. */
2384                 1,    /* timers active (only the expire timer) */
2385                 jiffies_to_clock_t(ttd),
2386                 req->retrans,
2387                 uid,
2388                 0,  /* non standard timer */
2389                 0, /* open_requests have no inode */
2390                 atomic_read(&sk->sk_refcnt),
2391                 req,
2392                 len);
2393 }
2394
2395 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2396 {
2397         int timer_active;
2398         unsigned long timer_expires;
2399         struct tcp_sock *tp = tcp_sk(sk);
2400         const struct inet_connection_sock *icsk = inet_csk(sk);
2401         struct inet_sock *inet = inet_sk(sk);
2402         __be32 dest = inet->inet_daddr;
2403         __be32 src = inet->inet_rcv_saddr;
2404         __u16 destp = ntohs(inet->inet_dport);
2405         __u16 srcp = ntohs(inet->inet_sport);
2406         int rx_queue;
2407
2408         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2409                 timer_active    = 1;
2410                 timer_expires   = icsk->icsk_timeout;
2411         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2412                 timer_active    = 4;
2413                 timer_expires   = icsk->icsk_timeout;
2414         } else if (timer_pending(&sk->sk_timer)) {
2415                 timer_active    = 2;
2416                 timer_expires   = sk->sk_timer.expires;
2417         } else {
2418                 timer_active    = 0;
2419                 timer_expires = jiffies;
2420         }
2421
2422         if (sk->sk_state == TCP_LISTEN)
2423                 rx_queue = sk->sk_ack_backlog;
2424         else
2425                 /*
2426                  * because we dont lock socket, we might find a transient negative value
2427                  */
2428                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2429
2430         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2431                         "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2432                 i, src, srcp, dest, destp, sk->sk_state,
2433                 tp->write_seq - tp->snd_una,
2434                 rx_queue,
2435                 timer_active,
2436                 jiffies_to_clock_t(timer_expires - jiffies),
2437                 icsk->icsk_retransmits,
2438                 sock_i_uid(sk),
2439                 icsk->icsk_probes_out,
2440                 sock_i_ino(sk),
2441                 atomic_read(&sk->sk_refcnt), sk,
2442                 jiffies_to_clock_t(icsk->icsk_rto),
2443                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2444                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2445                 tp->snd_cwnd,
2446                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2447                 len);
2448 }
2449
2450 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2451                                struct seq_file *f, int i, int *len)
2452 {
2453         __be32 dest, src;
2454         __u16 destp, srcp;
2455         int ttd = tw->tw_ttd - jiffies;
2456
2457         if (ttd < 0)
2458                 ttd = 0;
2459
2460         dest  = tw->tw_daddr;
2461         src   = tw->tw_rcv_saddr;
2462         destp = ntohs(tw->tw_dport);
2463         srcp  = ntohs(tw->tw_sport);
2464
2465         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2466                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2467                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2468                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2469                 atomic_read(&tw->tw_refcnt), tw, len);
2470 }
2471
2472 #define TMPSZ 150
2473
2474 static int tcp4_seq_show(struct seq_file *seq, void *v)
2475 {
2476         struct tcp_iter_state *st;
2477         int len;
2478
2479         if (v == SEQ_START_TOKEN) {
2480                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2481                            "  sl  local_address rem_address   st tx_queue "
2482                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2483                            "inode");
2484                 goto out;
2485         }
2486         st = seq->private;
2487
2488         switch (st->state) {
2489         case TCP_SEQ_STATE_LISTENING:
2490         case TCP_SEQ_STATE_ESTABLISHED:
2491                 get_tcp4_sock(v, seq, st->num, &len);
2492                 break;
2493         case TCP_SEQ_STATE_OPENREQ:
2494                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2495                 break;
2496         case TCP_SEQ_STATE_TIME_WAIT:
2497                 get_timewait4_sock(v, seq, st->num, &len);
2498                 break;
2499         }
2500         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2501 out:
2502         return 0;
2503 }
2504
2505 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2506         .name           = "tcp",
2507         .family         = AF_INET,
2508         .seq_fops       = {
2509                 .owner          = THIS_MODULE,
2510         },
2511         .seq_ops        = {
2512                 .show           = tcp4_seq_show,
2513         },
2514 };
2515
2516 static int __net_init tcp4_proc_init_net(struct net *net)
2517 {
2518         return tcp_proc_register(net, &tcp4_seq_afinfo);
2519 }
2520
2521 static void __net_exit tcp4_proc_exit_net(struct net *net)
2522 {
2523         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2524 }
2525
2526 static struct pernet_operations tcp4_net_ops = {
2527         .init = tcp4_proc_init_net,
2528         .exit = tcp4_proc_exit_net,
2529 };
2530
2531 int __init tcp4_proc_init(void)
2532 {
2533         return register_pernet_subsys(&tcp4_net_ops);
2534 }
2535
2536 void tcp4_proc_exit(void)
2537 {
2538         unregister_pernet_subsys(&tcp4_net_ops);
2539 }
2540 #endif /* CONFIG_PROC_FS */
2541
2542 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2543 {
2544         const struct iphdr *iph = skb_gro_network_header(skb);
2545
2546         switch (skb->ip_summed) {
2547         case CHECKSUM_COMPLETE:
2548                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2549                                   skb->csum)) {
2550                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2551                         break;
2552                 }
2553
2554                 /* fall through */
2555         case CHECKSUM_NONE:
2556                 NAPI_GRO_CB(skb)->flush = 1;
2557                 return NULL;
2558         }
2559
2560         return tcp_gro_receive(head, skb);
2561 }
2562
2563 int tcp4_gro_complete(struct sk_buff *skb)
2564 {
2565         const struct iphdr *iph = ip_hdr(skb);
2566         struct tcphdr *th = tcp_hdr(skb);
2567
2568         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2569                                   iph->saddr, iph->daddr, 0);
2570         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2571
2572         return tcp_gro_complete(skb);
2573 }
2574
2575 struct proto tcp_prot = {
2576         .name                   = "TCP",
2577         .owner                  = THIS_MODULE,
2578         .close                  = tcp_close,
2579         .connect                = tcp_v4_connect,
2580         .disconnect             = tcp_disconnect,
2581         .accept                 = inet_csk_accept,
2582         .ioctl                  = tcp_ioctl,
2583         .init                   = tcp_v4_init_sock,
2584         .destroy                = tcp_v4_destroy_sock,
2585         .shutdown               = tcp_shutdown,
2586         .setsockopt             = tcp_setsockopt,
2587         .getsockopt             = tcp_getsockopt,
2588         .recvmsg                = tcp_recvmsg,
2589         .sendmsg                = tcp_sendmsg,
2590         .sendpage               = tcp_sendpage,
2591         .backlog_rcv            = tcp_v4_do_rcv,
2592         .hash                   = inet_hash,
2593         .unhash                 = inet_unhash,
2594         .get_port               = inet_csk_get_port,
2595         .enter_memory_pressure  = tcp_enter_memory_pressure,
2596         .sockets_allocated      = &tcp_sockets_allocated,
2597         .orphan_count           = &tcp_orphan_count,
2598         .memory_allocated       = &tcp_memory_allocated,
2599         .memory_pressure        = &tcp_memory_pressure,
2600         .sysctl_mem             = sysctl_tcp_mem,
2601         .sysctl_wmem            = sysctl_tcp_wmem,
2602         .sysctl_rmem            = sysctl_tcp_rmem,
2603         .max_header             = MAX_TCP_HEADER,
2604         .obj_size               = sizeof(struct tcp_sock),
2605         .slab_flags             = SLAB_DESTROY_BY_RCU,
2606         .twsk_prot              = &tcp_timewait_sock_ops,
2607         .rsk_prot               = &tcp_request_sock_ops,
2608         .h.hashinfo             = &tcp_hashinfo,
2609         .no_autobind            = true,
2610 #ifdef CONFIG_COMPAT
2611         .compat_setsockopt      = compat_tcp_setsockopt,
2612         .compat_getsockopt      = compat_tcp_getsockopt,
2613 #endif
2614 };
2615 EXPORT_SYMBOL(tcp_prot);
2616
2617
2618 static int __net_init tcp_sk_init(struct net *net)
2619 {
2620         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2621                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2622 }
2623
2624 static void __net_exit tcp_sk_exit(struct net *net)
2625 {
2626         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2627 }
2628
2629 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2630 {
2631         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2632 }
2633
2634 static struct pernet_operations __net_initdata tcp_sk_ops = {
2635        .init       = tcp_sk_init,
2636        .exit       = tcp_sk_exit,
2637        .exit_batch = tcp_sk_exit_batch,
2638 };
2639
2640 void __init tcp_v4_init(void)
2641 {
2642         inet_hashinfo_init(&tcp_hashinfo);
2643         if (register_pernet_subsys(&tcp_sk_ops))
2644                 panic("Failed to create the TCP control socket.\n");
2645 }