net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53
  54 #include <linux/bottom_half.h>
  55 #include <linux/types.h>
  56 #include <linux/fcntl.h>
  57 #include <linux/module.h>
  58 #include <linux/random.h>
  59 #include <linux/cache.h>
  60 #include <linux/jhash.h>
  61 #include <linux/init.h>
  62 #include <linux/times.h>
  63 #include <linux/slab.h>
  64
  65 #include <net/net_namespace.h>
  66 #include <net/icmp.h>
  67 #include <net/inet_hashtables.h>
  68 #include <net/tcp.h>
  69 #include <net/transp_v6.h>
  70 #include <net/ipv6.h>
  71 #include <net/inet_common.h>
  72 #include <net/timewait_sock.h>
  73 #include <net/xfrm.h>
  74 #include <net/netdma.h>
  75 #include <net/secure_seq.h>
  76
  77 #include <linux/inet.h>
  78 #include <linux/ipv6.h>
  79 #include <linux/stddef.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/seq_file.h>
  82
  83 #include <linux/crypto.h>
  84 #include <linux/scatterlist.h>
  85
  86 int sysctl_tcp_tw_reuse __read_mostly;
  87 int sysctl_tcp_low_latency __read_mostly;
  88 EXPORT_SYMBOL(sysctl_tcp_low_latency);
  89
  90
  91 #ifdef CONFIG_TCP_MD5SIG
  92 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
  93                                                    __be32 addr);
  94 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
  95                                __be32 daddr, __be32 saddr, struct tcphdr *th);
  96 #else
  97 static inline
  98 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
  99 {
 100         return NULL;
 101 }
 102 #endif
 103
 104 struct inet_hashinfo tcp_hashinfo;
 105 EXPORT_SYMBOL(tcp_hashinfo);
 106
 107 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 108 {
 109         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 110                                           ip_hdr(skb)->saddr,
 111                                           tcp_hdr(skb)->dest,
 112                                           tcp_hdr(skb)->source);
 113 }
 114
 115 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 116 {
 117         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 118         struct tcp_sock *tp = tcp_sk(sk);
 119
 120         /* With PAWS, it is safe from the viewpoint
 121            of data integrity. Even without PAWS it is safe provided sequence
 122            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 123
 124            Actually, the idea is close to VJ's one, only timestamp cache is
 125            held not per host, but per port pair and TW bucket is used as state
 126            holder.
 127
 128            If TW bucket has been already destroyed we fall back to VJ's scheme
 129            and use initial timestamp retrieved from peer table.
 130          */
 131         if (tcptw->tw_ts_recent_stamp &&
 132             (twp == NULL || (sysctl_tcp_tw_reuse &&
 133                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 134                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 135                 if (tp->write_seq == 0)
 136                         tp->write_seq = 1;
 137                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 138                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 139                 sock_hold(sktw);
 140                 return 1;
 141         }
 142
 143         return 0;
 144 }
 145 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 146
 147 /* This will initiate an outgoing connection. */
 148 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 149 {
 150         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 151         struct inet_sock *inet = inet_sk(sk);
 152         struct tcp_sock *tp = tcp_sk(sk);
 153         __be16 orig_sport, orig_dport;
 154         __be32 daddr, nexthop;
 155         struct flowi4 *fl4;
 156         struct rtable *rt;
 157         int err;
 158         struct ip_options_rcu *inet_opt;
 159
 160         if (addr_len < sizeof(struct sockaddr_in))
 161                 return -EINVAL;
 162
 163         if (usin->sin_family != AF_INET)
 164                 return -EAFNOSUPPORT;
 165
 166         nexthop = daddr = usin->sin_addr.s_addr;
 167         inet_opt = rcu_dereference_protected(inet->inet_opt,
 168                                              sock_owned_by_user(sk));
 169         if (inet_opt && inet_opt->opt.srr) {
 170                 if (!daddr)
 171                         return -EINVAL;
 172                 nexthop = inet_opt->opt.faddr;
 173         }
 174
 175         orig_sport = inet->inet_sport;
 176         orig_dport = usin->sin_port;
 177         fl4 = &inet->cork.fl.u.ip4;
 178         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 179                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 180                               IPPROTO_TCP,
 181                               orig_sport, orig_dport, sk, true);
 182         if (IS_ERR(rt)) {
 183                 err = PTR_ERR(rt);
 184                 if (err == -ENETUNREACH)
 185                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 186                 return err;
 187         }
 188
 189         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 190                 ip_rt_put(rt);
 191                 return -ENETUNREACH;
 192         }
 193
 194         if (!inet_opt || !inet_opt->opt.srr)
 195                 daddr = fl4->daddr;
 196
 197         if (!inet->inet_saddr)
 198                 inet->inet_saddr = fl4->saddr;
 199         inet->inet_rcv_saddr = inet->inet_saddr;
 200
 201         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 202                 /* Reset inherited state */
 203                 tp->rx_opt.ts_recent       = 0;
 204                 tp->rx_opt.ts_recent_stamp = 0;
 205                 tp->write_seq              = 0;
 206         }
 207
 208         if (tcp_death_row.sysctl_tw_recycle &&
 209             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
 210                 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
 211                 /*
 212                  * VJ's idea. We save last timestamp seen from
 213                  * the destination in peer table, when entering state
 214                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 215                  * when trying new connection.
 216                  */
 217                 if (peer) {
 218                         inet_peer_refcheck(peer);
 219                         if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
 220                                 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 221                                 tp->rx_opt.ts_recent = peer->tcp_ts;
 222                         }
 223                 }
 224         }
 225
 226         inet->inet_dport = usin->sin_port;
 227         inet->inet_daddr = daddr;
 228
 229         inet_csk(sk)->icsk_ext_hdr_len = 0;
 230         if (inet_opt)
 231                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 232
 233         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 234
 235         /* Socket identity is still unknown (sport may be zero).
 236          * However we set state to SYN-SENT and not releasing socket
 237          * lock select source port, enter ourselves into the hash tables and
 238          * complete initialization after this.
 239          */
 240         tcp_set_state(sk, TCP_SYN_SENT);
 241         err = inet_hash_connect(&tcp_death_row, sk);
 242         if (err)
 243                 goto failure;
 244
 245         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 246                                inet->inet_sport, inet->inet_dport, sk);
 247         if (IS_ERR(rt)) {
 248                 err = PTR_ERR(rt);
 249                 rt = NULL;
 250                 goto failure;
 251         }
 252         /* OK, now commit destination to socket.  */
 253         sk->sk_gso_type = SKB_GSO_TCPV4;
 254         sk_setup_caps(sk, &rt->dst);
 255
 256         if (!tp->write_seq)
 257                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 258                                                            inet->inet_daddr,
 259                                                            inet->inet_sport,
 260                                                            usin->sin_port);
 261
 262         inet->inet_id = tp->write_seq ^ jiffies;
 263
 264         err = tcp_connect(sk);
 265         rt = NULL;
 266         if (err)
 267                 goto failure;
 268
 269         return 0;
 270
 271 failure:
 272         /*
 273          * This unhashes the socket and releases the local port,
 274          * if necessary.
 275          */
 276         tcp_set_state(sk, TCP_CLOSE);
 277         ip_rt_put(rt);
 278         sk->sk_route_caps = 0;
 279         inet->inet_dport = 0;
 280         return err;
 281 }
 282 EXPORT_SYMBOL(tcp_v4_connect);
 283
 284 /*
 285  * This routine does path mtu discovery as defined in RFC1191.
 286  */
 287 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
 288 {
 289         struct dst_entry *dst;
 290         struct inet_sock *inet = inet_sk(sk);
 291
 292         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 293          * send out by Linux are always <576bytes so they should go through
 294          * unfragmented).
 295          */
 296         if (sk->sk_state == TCP_LISTEN)
 297                 return;
 298
 299         /* We don't check in the destentry if pmtu discovery is forbidden
 300          * on this route. We just assume that no packet_to_big packets
 301          * are send back when pmtu discovery is not active.
 302          * There is a small race when the user changes this flag in the
 303          * route, but I think that's acceptable.
 304          */
 305         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 306                 return;
 307
 308         dst->ops->update_pmtu(dst, mtu);
 309
 310         /* Something is about to be wrong... Remember soft error
 311          * for the case, if this connection will not able to recover.
 312          */
 313         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 314                 sk->sk_err_soft = EMSGSIZE;
 315
 316         mtu = dst_mtu(dst);
 317
 318         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 319             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 320                 tcp_sync_mss(sk, mtu);
 321
 322                 /* Resend the TCP packet because it's
 323                  * clear that the old packet has been
 324                  * dropped. This is the new "fast" path mtu
 325                  * discovery.
 326                  */
 327                 tcp_simple_retransmit(sk);
 328         } /* else let the usual retransmit timer handle it */
 329 }
 330
 331 /*
 332  * This routine is called by the ICMP module when it gets some
 333  * sort of error condition.  If err < 0 then the socket should
 334  * be closed and the error returned to the user.  If err > 0
 335  * it's just the icmp type << 8 | icmp code.  After adjustment
 336  * header points to the first 8 bytes of the tcp header.  We need
 337  * to find the appropriate port.
 338  *
 339  * The locking strategy used here is very "optimistic". When
 340  * someone else accesses the socket the ICMP is just dropped
 341  * and for some paths there is no check at all.
 342  * A more general error queue to queue errors for later handling
 343  * is probably better.
 344  *
 345  */
 346
 347 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 348 {
 349         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 350         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 351         struct inet_connection_sock *icsk;
 352         struct tcp_sock *tp;
 353         struct inet_sock *inet;
 354         const int type = icmp_hdr(icmp_skb)->type;
 355         const int code = icmp_hdr(icmp_skb)->code;
 356         struct sock *sk;
 357         struct sk_buff *skb;
 358         __u32 seq;
 359         __u32 remaining;
 360         int err;
 361         struct net *net = dev_net(icmp_skb->dev);
 362
 363         if (icmp_skb->len < (iph->ihl << 2) + 8) {
 364                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 365                 return;
 366         }
 367
 368         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 369                         iph->saddr, th->source, inet_iif(icmp_skb));
 370         if (!sk) {
 371                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 372                 return;
 373         }
 374         if (sk->sk_state == TCP_TIME_WAIT) {
 375                 inet_twsk_put(inet_twsk(sk));
 376                 return;
 377         }
 378
 379         bh_lock_sock(sk);
 380         /* If too many ICMPs get dropped on busy
 381          * servers this needs to be solved differently.
 382          */
 383         if (sock_owned_by_user(sk))
 384                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 385
 386         if (sk->sk_state == TCP_CLOSE)
 387                 goto out;
 388
 389         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 390                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 391                 goto out;
 392         }
 393
 394         icsk = inet_csk(sk);
 395         tp = tcp_sk(sk);
 396         seq = ntohl(th->seq);
 397         if (sk->sk_state != TCP_LISTEN &&
 398             !between(seq, tp->snd_una, tp->snd_nxt)) {
 399                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 400                 goto out;
 401         }
 402
 403         switch (type) {
 404         case ICMP_SOURCE_QUENCH:
 405                 /* Just silently ignore these. */
 406                 goto out;
 407         case ICMP_PARAMETERPROB:
 408                 err = EPROTO;
 409                 break;
 410         case ICMP_DEST_UNREACH:
 411                 if (code > NR_ICMP_UNREACH)
 412                         goto out;
 413
 414                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 415                         if (!sock_owned_by_user(sk))
 416                                 do_pmtu_discovery(sk, iph, info);
 417                         goto out;
 418                 }
 419
 420                 err = icmp_err_convert[code].errno;
 421                 /* check if icmp_skb allows revert of backoff
 422                  * (see draft-zimmermann-tcp-lcd) */
 423                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 424                         break;
 425                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 426                     !icsk->icsk_backoff)
 427                         break;
 428
 429                 if (sock_owned_by_user(sk))
 430                         break;
 431
 432                 icsk->icsk_backoff--;
 433                 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
 434                                          icsk->icsk_backoff;
 435                 tcp_bound_rto(sk);
 436
 437                 skb = tcp_write_queue_head(sk);
 438                 BUG_ON(!skb);
 439
 440                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 441                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
 442
 443                 if (remaining) {
 444                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 445                                                   remaining, TCP_RTO_MAX);
 446                 } else {
 447                         /* RTO revert clocked out retransmission.
 448                          * Will retransmit now */
 449                         tcp_retransmit_timer(sk);
 450                 }
 451
 452                 break;
 453         case ICMP_TIME_EXCEEDED:
 454                 err = EHOSTUNREACH;
 455                 break;
 456         default:
 457                 goto out;
 458         }
 459
 460         switch (sk->sk_state) {
 461                 struct request_sock *req, **prev;
 462         case TCP_LISTEN:
 463                 if (sock_owned_by_user(sk))
 464                         goto out;
 465
 466                 req = inet_csk_search_req(sk, &prev, th->dest,
 467                                           iph->daddr, iph->saddr);
 468                 if (!req)
 469                         goto out;
 470
 471                 /* ICMPs are not backlogged, hence we cannot get
 472                    an established socket here.
 473                  */
 474                 WARN_ON(req->sk);
 475
 476                 if (seq != tcp_rsk(req)->snt_isn) {
 477                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 478                         goto out;
 479                 }
 480
 481                 /*
 482                  * Still in SYN_RECV, just remove it silently.
 483                  * There is no good way to pass the error to the newly
 484                  * created socket, and POSIX does not want network
 485                  * errors returned from accept().
 486                  */
 487                 inet_csk_reqsk_queue_drop(sk, req, prev);
 488                 goto out;
 489
 490         case TCP_SYN_SENT:
 491         case TCP_SYN_RECV:  /* Cannot happen.
 492                                It can f.e. if SYNs crossed.
 493                              */
 494                 if (!sock_owned_by_user(sk)) {
 495                         sk->sk_err = err;
 496
 497                         sk->sk_error_report(sk);
 498
 499                         tcp_done(sk);
 500                 } else {
 501                         sk->sk_err_soft = err;
 502                 }
 503                 goto out;
 504         }
 505
 506         /* If we've already connected we will keep trying
 507          * until we time out, or the user gives up.
 508          *
 509          * rfc1122 4.2.3.9 allows to consider as hard errors
 510          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 511          * but it is obsoleted by pmtu discovery).
 512          *
 513          * Note, that in modern internet, where routing is unreliable
 514          * and in each dark corner broken firewalls sit, sending random
 515          * errors ordered by their masters even this two messages finally lose
 516          * their original sense (even Linux sends invalid PORT_UNREACHs)
 517          *
 518          * Now we are in compliance with RFCs.
 519          *                                                      --ANK (980905)
 520          */
 521
 522         inet = inet_sk(sk);
 523         if (!sock_owned_by_user(sk) && inet->recverr) {
 524                 sk->sk_err = err;
 525                 sk->sk_error_report(sk);
 526         } else  { /* Only an error on timeout */
 527                 sk->sk_err_soft = err;
 528         }
 529
 530 out:
 531         bh_unlock_sock(sk);
 532         sock_put(sk);
 533 }
 534
 535 static void __tcp_v4_send_check(struct sk_buff *skb,
 536                                 __be32 saddr, __be32 daddr)
 537 {
 538         struct tcphdr *th = tcp_hdr(skb);
 539
 540         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 541                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 542                 skb->csum_start = skb_transport_header(skb) - skb->head;
 543                 skb->csum_offset = offsetof(struct tcphdr, check);
 544         } else {
 545                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 546                                          csum_partial(th,
 547                                                       th->doff << 2,
 548                                                       skb->csum));
 549         }
 550 }
 551
 552 /* This routine computes an IPv4 TCP checksum. */
 553 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 554 {
 555         struct inet_sock *inet = inet_sk(sk);
 556
 557         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 558 }
 559 EXPORT_SYMBOL(tcp_v4_send_check);
 560
 561 int tcp_v4_gso_send_check(struct sk_buff *skb)
 562 {
 563         const struct iphdr *iph;
 564         struct tcphdr *th;
 565
 566         if (!pskb_may_pull(skb, sizeof(*th)))
 567                 return -EINVAL;
 568
 569         iph = ip_hdr(skb);
 570         th = tcp_hdr(skb);
 571
 572         th->check = 0;
 573         skb->ip_summed = CHECKSUM_PARTIAL;
 574         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 575         return 0;
 576 }
 577
 578 /*
 579  *      This routine will send an RST to the other tcp.
 580  *
 581  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 582  *                    for reset.
 583  *      Answer: if a packet caused RST, it is not for a socket
 584  *              existing in our system, if it is matched to a socket,
 585  *              it is just duplicate segment or bug in other side's TCP.
 586  *              So that we build reply only basing on parameters
 587  *              arrived with segment.
 588  *      Exception: precedence violation. We do not implement it in any case.
 589  */
 590
 591 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 592 {
 593         struct tcphdr *th = tcp_hdr(skb);
 594         struct {
 595                 struct tcphdr th;
 596 #ifdef CONFIG_TCP_MD5SIG
 597                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 598 #endif
 599         } rep;
 600         struct ip_reply_arg arg;
 601 #ifdef CONFIG_TCP_MD5SIG
 602         struct tcp_md5sig_key *key;
 603 #endif
 604         struct net *net;
 605
 606         /* Never send a reset in response to a reset. */
 607         if (th->rst)
 608                 return;
 609
 610         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 611                 return;
 612
 613         /* Swap the send and the receive. */
 614         memset(&rep, 0, sizeof(rep));
 615         rep.th.dest   = th->source;
 616         rep.th.source = th->dest;
 617         rep.th.doff   = sizeof(struct tcphdr) / 4;
 618         rep.th.rst    = 1;
 619
 620         if (th->ack) {
 621                 rep.th.seq = th->ack_seq;
 622         } else {
 623                 rep.th.ack = 1;
 624                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 625                                        skb->len - (th->doff << 2));
 626         }
 627
 628         memset(&arg, 0, sizeof(arg));
 629         arg.iov[0].iov_base = (unsigned char *)&rep;
 630         arg.iov[0].iov_len  = sizeof(rep.th);
 631
 632 #ifdef CONFIG_TCP_MD5SIG
 633         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->saddr) : NULL;
 634         if (key) {
 635                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 636                                    (TCPOPT_NOP << 16) |
 637                                    (TCPOPT_MD5SIG << 8) |
 638                                    TCPOLEN_MD5SIG);
 639                 /* Update length and the length the header thinks exists */
 640                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 641                 rep.th.doff = arg.iov[0].iov_len / 4;
 642
 643                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 644                                      key, ip_hdr(skb)->saddr,
 645                                      ip_hdr(skb)->daddr, &rep.th);
 646         }
 647 #endif
 648         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 649                                       ip_hdr(skb)->saddr, /* XXX */
 650                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 651         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 652         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 653         /* When socket is gone, all binding information is lost.
 654          * routing might fail in this case. using iif for oif to
 655          * make sure we can deliver it
 656          */
 657         arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);
 658
 659         net = dev_net(skb_dst(skb)->dev);
 660         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
 661                       &arg, arg.iov[0].iov_len);
 662
 663         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 664         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 665 }
 666
 667 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 668    outside socket context is ugly, certainly. What can I do?
 669  */
 670
 671 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 672                             u32 win, u32 ts, int oif,
 673                             struct tcp_md5sig_key *key,
 674                             int reply_flags)
 675 {
 676         struct tcphdr *th = tcp_hdr(skb);
 677         struct {
 678                 struct tcphdr th;
 679                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 680 #ifdef CONFIG_TCP_MD5SIG
 681                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 682 #endif
 683                         ];
 684         } rep;
 685         struct ip_reply_arg arg;
 686         struct net *net = dev_net(skb_dst(skb)->dev);
 687
 688         memset(&rep.th, 0, sizeof(struct tcphdr));
 689         memset(&arg, 0, sizeof(arg));
 690
 691         arg.iov[0].iov_base = (unsigned char *)&rep;
 692         arg.iov[0].iov_len  = sizeof(rep.th);
 693         if (ts) {
 694                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 695                                    (TCPOPT_TIMESTAMP << 8) |
 696                                    TCPOLEN_TIMESTAMP);
 697                 rep.opt[1] = htonl(tcp_time_stamp);
 698                 rep.opt[2] = htonl(ts);
 699                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 700         }
 701
 702         /* Swap the send and the receive. */
 703         rep.th.dest    = th->source;
 704         rep.th.source  = th->dest;
 705         rep.th.doff    = arg.iov[0].iov_len / 4;
 706         rep.th.seq     = htonl(seq);
 707         rep.th.ack_seq = htonl(ack);
 708         rep.th.ack     = 1;
 709         rep.th.window  = htons(win);
 710
 711 #ifdef CONFIG_TCP_MD5SIG
 712         if (key) {
 713                 int offset = (ts) ? 3 : 0;
 714
 715                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 716                                           (TCPOPT_NOP << 16) |
 717                                           (TCPOPT_MD5SIG << 8) |
 718                                           TCPOLEN_MD5SIG);
 719                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 720                 rep.th.doff = arg.iov[0].iov_len/4;
 721
 722                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 723                                     key, ip_hdr(skb)->saddr,
 724                                     ip_hdr(skb)->daddr, &rep.th);
 725         }
 726 #endif
 727         arg.flags = reply_flags;
 728         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 729                                       ip_hdr(skb)->saddr, /* XXX */
 730                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 731         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 732         if (oif)
 733                 arg.bound_dev_if = oif;
 734
 735         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
 736                       &arg, arg.iov[0].iov_len);
 737
 738         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 739 }
 740
 741 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 742 {
 743         struct inet_timewait_sock *tw = inet_twsk(sk);
 744         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 745
 746         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 747                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 748                         tcptw->tw_ts_recent,
 749                         tw->tw_bound_dev_if,
 750                         tcp_twsk_md5_key(tcptw),
 751                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
 752                         );
 753
 754         inet_twsk_put(tw);
 755 }
 756
 757 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 758                                   struct request_sock *req)
 759 {
 760         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 761                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 762                         req->ts_recent,
 763                         0,
 764                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
 765                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
 766 }
 767
 768 /*
 769  *      Send a SYN-ACK after having received a SYN.
 770  *      This still operates on a request_sock only, not on a big
 771  *      socket.
 772  */
 773 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 774                               struct request_sock *req,
 775                               struct request_values *rvp)
 776 {
 777         const struct inet_request_sock *ireq = inet_rsk(req);
 778         struct flowi4 fl4;
 779         int err = -1;
 780         struct sk_buff * skb;
 781
 782         /* First, grab a route. */
 783         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 784                 return -1;
 785
 786         skb = tcp_make_synack(sk, dst, req, rvp);
 787
 788         if (skb) {
 789                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 790
 791                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 792                                             ireq->rmt_addr,
 793                                             ireq->opt);
 794                 err = net_xmit_eval(err);
 795         }
 796
 797         dst_release(dst);
 798         return err;
 799 }
 800
 801 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 802                               struct request_values *rvp)
 803 {
 804         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 805         return tcp_v4_send_synack(sk, NULL, req, rvp);
 806 }
 807
 808 /*
 809  *      IPv4 request_sock destructor.
 810  */
 811 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 812 {
 813         kfree(inet_rsk(req)->opt);
 814 }
 815
 816 static void syn_flood_warning(const struct sk_buff *skb)
 817 {
 818         const char *msg;
 819
 820 #ifdef CONFIG_SYN_COOKIES
 821         if (sysctl_tcp_syncookies)
 822                 msg = "Sending cookies";
 823         else
 824 #endif
 825                 msg = "Dropping request";
 826
 827         pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
 828                                 ntohs(tcp_hdr(skb)->dest), msg);
 829 }
 830
 831 /*
 832  * Save and compile IPv4 options into the request_sock if needed.
 833  */
 834 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
 835                                                   struct sk_buff *skb)
 836 {
 837         const struct ip_options *opt = &(IPCB(skb)->opt);
 838         struct ip_options_rcu *dopt = NULL;
 839
 840         if (opt && opt->optlen) {
 841                 int opt_size = sizeof(*dopt) + opt->optlen;
 842
 843                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 844                 if (dopt) {
 845                         if (ip_options_echo(&dopt->opt, skb)) {
 846                                 kfree(dopt);
 847                                 dopt = NULL;
 848                         }
 849                 }
 850         }
 851         return dopt;
 852 }
 853
 854 #ifdef CONFIG_TCP_MD5SIG
 855 /*
 856  * RFC2385 MD5 checksumming requires a mapping of
 857  * IP address->MD5 Key.
 858  * We need to maintain these in the sk structure.
 859  */
 860
 861 /* Find the Key structure for an address.  */
 862 static struct tcp_md5sig_key *
 863                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 864 {
 865         struct tcp_sock *tp = tcp_sk(sk);
 866         int i;
 867
 868         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 869                 return NULL;
 870         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 871                 if (tp->md5sig_info->keys4[i].addr == addr)
 872                         return &tp->md5sig_info->keys4[i].base;
 873         }
 874         return NULL;
 875 }
 876
 877 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 878                                          struct sock *addr_sk)
 879 {
 880         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
 881 }
 882 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 883
 884 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 885                                                       struct request_sock *req)
 886 {
 887         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 888 }
 889
 890 /* This can be called on a newly created socket, from other files */
 891 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 892                       u8 *newkey, u8 newkeylen)
 893 {
 894         /* Add Key to the list */
 895         struct tcp_md5sig_key *key;
 896         struct tcp_sock *tp = tcp_sk(sk);
 897         struct tcp4_md5sig_key *keys;
 898
 899         key = tcp_v4_md5_do_lookup(sk, addr);
 900         if (key) {
 901                 /* Pre-existing entry - just update that one. */
 902                 kfree(key->key);
 903                 key->key = newkey;
 904                 key->keylen = newkeylen;
 905         } else {
 906                 struct tcp_md5sig_info *md5sig;
 907
 908                 if (!tp->md5sig_info) {
 909                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 910                                                   GFP_ATOMIC);
 911                         if (!tp->md5sig_info) {
 912                                 kfree(newkey);
 913                                 return -ENOMEM;
 914                         }
 915                         sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 916                 }
 917
 918                 md5sig = tp->md5sig_info;
 919                 if (md5sig->entries4 == 0 &&
 920                     tcp_alloc_md5sig_pool(sk) == NULL) {
 921                         kfree(newkey);
 922                         return -ENOMEM;
 923                 }
 924
 925                 if (md5sig->alloced4 == md5sig->entries4) {
 926                         keys = kmalloc((sizeof(*keys) *
 927                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
 928                         if (!keys) {
 929                                 kfree(newkey);
 930                                 if (md5sig->entries4 == 0)
 931                                         tcp_free_md5sig_pool();
 932                                 return -ENOMEM;
 933                         }
 934
 935                         if (md5sig->entries4)
 936                                 memcpy(keys, md5sig->keys4,
 937                                        sizeof(*keys) * md5sig->entries4);
 938
 939                         /* Free old key list, and reference new one */
 940                         kfree(md5sig->keys4);
 941                         md5sig->keys4 = keys;
 942                         md5sig->alloced4++;
 943                 }
 944                 md5sig->entries4++;
 945                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 946                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 947                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 948         }
 949         return 0;
 950 }
 951 EXPORT_SYMBOL(tcp_v4_md5_do_add);
 952
 953 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 954                                u8 *newkey, u8 newkeylen)
 955 {
 956         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
 957                                  newkey, newkeylen);
 958 }
 959
 960 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 961 {
 962         struct tcp_sock *tp = tcp_sk(sk);
 963         int i;
 964
 965         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 966                 if (tp->md5sig_info->keys4[i].addr == addr) {
 967                         /* Free the key */
 968                         kfree(tp->md5sig_info->keys4[i].base.key);
 969                         tp->md5sig_info->entries4--;
 970
 971                         if (tp->md5sig_info->entries4 == 0) {
 972                                 kfree(tp->md5sig_info->keys4);
 973                                 tp->md5sig_info->keys4 = NULL;
 974                                 tp->md5sig_info->alloced4 = 0;
 975                                 tcp_free_md5sig_pool();
 976                         } else if (tp->md5sig_info->entries4 != i) {
 977                                 /* Need to do some manipulation */
 978                                 memmove(&tp->md5sig_info->keys4[i],
 979                                         &tp->md5sig_info->keys4[i+1],
 980                                         (tp->md5sig_info->entries4 - i) *
 981                                          sizeof(struct tcp4_md5sig_key));
 982                         }
 983                         return 0;
 984                 }
 985         }
 986         return -ENOENT;
 987 }
 988 EXPORT_SYMBOL(tcp_v4_md5_do_del);
 989
 990 static void tcp_v4_clear_md5_list(struct sock *sk)
 991 {
 992         struct tcp_sock *tp = tcp_sk(sk);
 993
 994         /* Free each key, then the set of key keys,
 995          * the crypto element, and then decrement our
 996          * hold on the last resort crypto.
 997          */
 998         if (tp->md5sig_info->entries4) {
 999                 int i;
1000                 for (i = 0; i < tp->md5sig_info->entries4; i++)
1001                         kfree(tp->md5sig_info->keys4[i].base.key);
1002                 tp->md5sig_info->entries4 = 0;
1003                 tcp_free_md5sig_pool();
1004         }
1005         if (tp->md5sig_info->keys4) {
1006                 kfree(tp->md5sig_info->keys4);
1007                 tp->md5sig_info->keys4 = NULL;
1008                 tp->md5sig_info->alloced4  = 0;
1009         }
1010 }
1011
1012 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1013                                  int optlen)
1014 {
1015         struct tcp_md5sig cmd;
1016         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1017         u8 *newkey;
1018
1019         if (optlen < sizeof(cmd))
1020                 return -EINVAL;
1021
1022         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1023                 return -EFAULT;
1024
1025         if (sin->sin_family != AF_INET)
1026                 return -EINVAL;
1027
1028         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1029                 if (!tcp_sk(sk)->md5sig_info)
1030                         return -ENOENT;
1031                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1032         }
1033
1034         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1035                 return -EINVAL;
1036
1037         if (!tcp_sk(sk)->md5sig_info) {
1038                 struct tcp_sock *tp = tcp_sk(sk);
1039                 struct tcp_md5sig_info *p;
1040
1041                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1042                 if (!p)
1043                         return -EINVAL;
1044
1045                 tp->md5sig_info = p;
1046                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1047         }
1048
1049         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1050         if (!newkey)
1051                 return -ENOMEM;
1052         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1053                                  newkey, cmd.tcpm_keylen);
1054 }
1055
1056 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1057                                         __be32 daddr, __be32 saddr, int nbytes)
1058 {
1059         struct tcp4_pseudohdr *bp;
1060         struct scatterlist sg;
1061
1062         bp = &hp->md5_blk.ip4;
1063
1064         /*
1065          * 1. the TCP pseudo-header (in the order: source IP address,
1066          * destination IP address, zero-padded protocol number, and
1067          * segment length)
1068          */
1069         bp->saddr = saddr;
1070         bp->daddr = daddr;
1071         bp->pad = 0;
1072         bp->protocol = IPPROTO_TCP;
1073         bp->len = cpu_to_be16(nbytes);
1074
1075         sg_init_one(&sg, bp, sizeof(*bp));
1076         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1077 }
1078
1079 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1080                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1081 {
1082         struct tcp_md5sig_pool *hp;
1083         struct hash_desc *desc;
1084
1085         hp = tcp_get_md5sig_pool();
1086         if (!hp)
1087                 goto clear_hash_noput;
1088         desc = &hp->md5_desc;
1089
1090         if (crypto_hash_init(desc))
1091                 goto clear_hash;
1092         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1093                 goto clear_hash;
1094         if (tcp_md5_hash_header(hp, th))
1095                 goto clear_hash;
1096         if (tcp_md5_hash_key(hp, key))
1097                 goto clear_hash;
1098         if (crypto_hash_final(desc, md5_hash))
1099                 goto clear_hash;
1100
1101         tcp_put_md5sig_pool();
1102         return 0;
1103
1104 clear_hash:
1105         tcp_put_md5sig_pool();
1106 clear_hash_noput:
1107         memset(md5_hash, 0, 16);
1108         return 1;
1109 }
1110
1111 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1112                         struct sock *sk, struct request_sock *req,
1113                         struct sk_buff *skb)
1114 {
1115         struct tcp_md5sig_pool *hp;
1116         struct hash_desc *desc;
1117         struct tcphdr *th = tcp_hdr(skb);
1118         __be32 saddr, daddr;
1119
1120         if (sk) {
1121                 saddr = inet_sk(sk)->inet_saddr;
1122                 daddr = inet_sk(sk)->inet_daddr;
1123         } else if (req) {
1124                 saddr = inet_rsk(req)->loc_addr;
1125                 daddr = inet_rsk(req)->rmt_addr;
1126         } else {
1127                 const struct iphdr *iph = ip_hdr(skb);
1128                 saddr = iph->saddr;
1129                 daddr = iph->daddr;
1130         }
1131
1132         hp = tcp_get_md5sig_pool();
1133         if (!hp)
1134                 goto clear_hash_noput;
1135         desc = &hp->md5_desc;
1136
1137         if (crypto_hash_init(desc))
1138                 goto clear_hash;
1139
1140         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1141                 goto clear_hash;
1142         if (tcp_md5_hash_header(hp, th))
1143                 goto clear_hash;
1144         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1145                 goto clear_hash;
1146         if (tcp_md5_hash_key(hp, key))
1147                 goto clear_hash;
1148         if (crypto_hash_final(desc, md5_hash))
1149                 goto clear_hash;
1150
1151         tcp_put_md5sig_pool();
1152         return 0;
1153
1154 clear_hash:
1155         tcp_put_md5sig_pool();
1156 clear_hash_noput:
1157         memset(md5_hash, 0, 16);
1158         return 1;
1159 }
1160 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1161
1162 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1163 {
1164         /*
1165          * This gets called for each TCP segment that arrives
1166          * so we want to be efficient.
1167          * We have 3 drop cases:
1168          * o No MD5 hash and one expected.
1169          * o MD5 hash and we're not expecting one.
1170          * o MD5 hash and its wrong.
1171          */
1172         __u8 *hash_location = NULL;
1173         struct tcp_md5sig_key *hash_expected;
1174         const struct iphdr *iph = ip_hdr(skb);
1175         struct tcphdr *th = tcp_hdr(skb);
1176         int genhash;
1177         unsigned char newhash[16];
1178
1179         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1180         hash_location = tcp_parse_md5sig_option(th);
1181
1182         /* We've parsed the options - do we have a hash? */
1183         if (!hash_expected && !hash_location)
1184                 return 0;
1185
1186         if (hash_expected && !hash_location) {
1187                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1188                 return 1;
1189         }
1190
1191         if (!hash_expected && hash_location) {
1192                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1193                 return 1;
1194         }
1195
1196         /* Okay, so this is hash_expected and hash_location -
1197          * so we need to calculate the checksum.
1198          */
1199         genhash = tcp_v4_md5_hash_skb(newhash,
1200                                       hash_expected,
1201                                       NULL, NULL, skb);
1202
1203         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1204                 if (net_ratelimit()) {
1205                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1206                                &iph->saddr, ntohs(th->source),
1207                                &iph->daddr, ntohs(th->dest),
1208                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1209                 }
1210                 return 1;
1211         }
1212         return 0;
1213 }
1214
1215 #endif
1216
1217 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1218         .family         =       PF_INET,
1219         .obj_size       =       sizeof(struct tcp_request_sock),
1220         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1221         .send_ack       =       tcp_v4_reqsk_send_ack,
1222         .destructor     =       tcp_v4_reqsk_destructor,
1223         .send_reset     =       tcp_v4_send_reset,
1224         .syn_ack_timeout =      tcp_syn_ack_timeout,
1225 };
1226
1227 #ifdef CONFIG_TCP_MD5SIG
1228 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1229         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1230         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1231 };
1232 #endif
1233
1234 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1235 {
1236         struct tcp_extend_values tmp_ext;
1237         struct tcp_options_received tmp_opt;
1238         u8 *hash_location;
1239         struct request_sock *req;
1240         struct inet_request_sock *ireq;
1241         struct tcp_sock *tp = tcp_sk(sk);
1242         struct dst_entry *dst = NULL;
1243         __be32 saddr = ip_hdr(skb)->saddr;
1244         __be32 daddr = ip_hdr(skb)->daddr;
1245         __u32 isn = TCP_SKB_CB(skb)->when;
1246 #ifdef CONFIG_SYN_COOKIES
1247         int want_cookie = 0;
1248 #else
1249 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1250 #endif
1251
1252         /* Never answer to SYNs send to broadcast or multicast */
1253         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1254                 goto drop;
1255
1256         /* TW buckets are converted to open requests without
1257          * limitations, they conserve resources and peer is
1258          * evidently real one.
1259          */
1260         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1261                 if (net_ratelimit())
1262                         syn_flood_warning(skb);
1263 #ifdef CONFIG_SYN_COOKIES
1264                 if (sysctl_tcp_syncookies) {
1265                         want_cookie = 1;
1266                 } else
1267 #endif
1268                 goto drop;
1269         }
1270
1271         /* Accept backlog is full. If we have already queued enough
1272          * of warm entries in syn queue, drop request. It is better than
1273          * clogging syn queue with openreqs with exponentially increasing
1274          * timeout.
1275          */
1276         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1277                 goto drop;
1278
1279         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1280         if (!req)
1281                 goto drop;
1282
1283 #ifdef CONFIG_TCP_MD5SIG
1284         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1285 #endif
1286
1287         tcp_clear_options(&tmp_opt);
1288         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1289         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1290         tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1291
1292         if (tmp_opt.cookie_plus > 0 &&
1293             tmp_opt.saw_tstamp &&
1294             !tp->rx_opt.cookie_out_never &&
1295             (sysctl_tcp_cookie_size > 0 ||
1296              (tp->cookie_values != NULL &&
1297               tp->cookie_values->cookie_desired > 0))) {
1298                 u8 *c;
1299                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1300                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1301
1302                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1303                         goto drop_and_release;
1304
1305                 /* Secret recipe starts with IP addresses */
1306                 *mess++ ^= (__force u32)daddr;
1307                 *mess++ ^= (__force u32)saddr;
1308
1309                 /* plus variable length Initiator Cookie */
1310                 c = (u8 *)mess;
1311                 while (l-- > 0)
1312                         *c++ ^= *hash_location++;
1313
1314 #ifdef CONFIG_SYN_COOKIES
1315                 want_cookie = 0;        /* not our kind of cookie */
1316 #endif
1317                 tmp_ext.cookie_out_never = 0; /* false */
1318                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1319         } else if (!tp->rx_opt.cookie_in_always) {
1320                 /* redundant indications, but ensure initialization. */
1321                 tmp_ext.cookie_out_never = 1; /* true */
1322                 tmp_ext.cookie_plus = 0;
1323         } else {
1324                 goto drop_and_release;
1325         }
1326         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1327
1328         if (want_cookie && !tmp_opt.saw_tstamp)
1329                 tcp_clear_options(&tmp_opt);
1330
1331         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1332         tcp_openreq_init(req, &tmp_opt, skb);
1333
1334         ireq = inet_rsk(req);
1335         ireq->loc_addr = daddr;
1336         ireq->rmt_addr = saddr;
1337         ireq->no_srccheck = inet_sk(sk)->transparent;
1338         ireq->opt = tcp_v4_save_options(sk, skb);
1339
1340         if (security_inet_conn_request(sk, skb, req))
1341                 goto drop_and_free;
1342
1343         if (!want_cookie || tmp_opt.tstamp_ok)
1344                 TCP_ECN_create_request(req, tcp_hdr(skb));
1345
1346         if (want_cookie) {
1347                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1348                 req->cookie_ts = tmp_opt.tstamp_ok;
1349         } else if (!isn) {
1350                 struct inet_peer *peer = NULL;
1351                 struct flowi4 fl4;
1352
1353                 /* VJ's idea. We save last timestamp seen
1354                  * from the destination in peer table, when entering
1355                  * state TIME-WAIT, and check against it before
1356                  * accepting new connection request.
1357                  *
1358                  * If "isn" is not zero, this request hit alive
1359                  * timewait bucket, so that all the necessary checks
1360                  * are made in the function processing timewait state.
1361                  */
1362                 if (tmp_opt.saw_tstamp &&
1363                     tcp_death_row.sysctl_tw_recycle &&
1364                     (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1365                     fl4.daddr == saddr &&
1366                     (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1367                         inet_peer_refcheck(peer);
1368                         if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1369                             (s32)(peer->tcp_ts - req->ts_recent) >
1370                                                         TCP_PAWS_WINDOW) {
1371                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1372                                 goto drop_and_release;
1373                         }
1374                 }
1375                 /* Kill the following clause, if you dislike this way. */
1376                 else if (!sysctl_tcp_syncookies &&
1377                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1378                           (sysctl_max_syn_backlog >> 2)) &&
1379                          (!peer || !peer->tcp_ts_stamp) &&
1380                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1381                         /* Without syncookies last quarter of
1382                          * backlog is filled with destinations,
1383                          * proven to be alive.
1384                          * It means that we continue to communicate
1385                          * to destinations, already remembered
1386                          * to the moment of synflood.
1387                          */
1388                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1389                                        &saddr, ntohs(tcp_hdr(skb)->source));
1390                         goto drop_and_release;
1391                 }
1392
1393                 isn = tcp_v4_init_sequence(skb);
1394         }
1395         tcp_rsk(req)->snt_isn = isn;
1396
1397         if (tcp_v4_send_synack(sk, dst, req,
1398                                (struct request_values *)&tmp_ext) ||
1399             want_cookie)
1400                 goto drop_and_free;
1401
1402         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1403         return 0;
1404
1405 drop_and_release:
1406         dst_release(dst);
1407 drop_and_free:
1408         reqsk_free(req);
1409 drop:
1410         return 0;
1411 }
1412 EXPORT_SYMBOL(tcp_v4_conn_request);
1413
1414
1415 /*
1416  * The three way handshake has completed - we got a valid synack -
1417  * now create the new socket.
1418  */
1419 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1420                                   struct request_sock *req,
1421                                   struct dst_entry *dst)
1422 {
1423         struct inet_request_sock *ireq;
1424         struct inet_sock *newinet;
1425         struct tcp_sock *newtp;
1426         struct sock *newsk;
1427 #ifdef CONFIG_TCP_MD5SIG
1428         struct tcp_md5sig_key *key;
1429 #endif
1430         struct ip_options_rcu *inet_opt;
1431
1432         if (sk_acceptq_is_full(sk))
1433                 goto exit_overflow;
1434
1435         newsk = tcp_create_openreq_child(sk, req, skb);
1436         if (!newsk)
1437                 goto exit_nonewsk;
1438
1439         newsk->sk_gso_type = SKB_GSO_TCPV4;
1440
1441         newtp                 = tcp_sk(newsk);
1442         newinet               = inet_sk(newsk);
1443         ireq                  = inet_rsk(req);
1444         newinet->inet_daddr   = ireq->rmt_addr;
1445         newinet->inet_rcv_saddr = ireq->loc_addr;
1446         newinet->inet_saddr           = ireq->loc_addr;
1447         inet_opt              = ireq->opt;
1448         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1449         ireq->opt             = NULL;
1450         newinet->mc_index     = inet_iif(skb);
1451         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1452         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1453         if (inet_opt)
1454                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1455         newinet->inet_id = newtp->write_seq ^ jiffies;
1456
1457         if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1458                 goto put_and_exit;
1459
1460         sk_setup_caps(newsk, dst);
1461
1462         tcp_mtup_init(newsk);
1463         tcp_sync_mss(newsk, dst_mtu(dst));
1464         newtp->advmss = dst_metric_advmss(dst);
1465         if (tcp_sk(sk)->rx_opt.user_mss &&
1466             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1467                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1468
1469         tcp_initialize_rcv_mss(newsk);
1470
1471 #ifdef CONFIG_TCP_MD5SIG
1472         /* Copy over the MD5 key from the original socket */
1473         key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1474         if (key != NULL) {
1475                 /*
1476                  * We're using one, so create a matching key
1477                  * on the newsk structure. If we fail to get
1478                  * memory, then we end up not copying the key
1479                  * across. Shucks.
1480                  */
1481                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1482                 if (newkey != NULL)
1483                         tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1484                                           newkey, key->keylen);
1485                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1486         }
1487 #endif
1488
1489         if (__inet_inherit_port(sk, newsk) < 0)
1490                 goto put_and_exit;
1491         __inet_hash_nolisten(newsk, NULL);
1492
1493         return newsk;
1494
1495 exit_overflow:
1496         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1497 exit_nonewsk:
1498         dst_release(dst);
1499 exit:
1500         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1501         return NULL;
1502 put_and_exit:
1503         sock_put(newsk);
1504         goto exit;
1505 }
1506 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1507
1508 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1509 {
1510         struct tcphdr *th = tcp_hdr(skb);
1511         const struct iphdr *iph = ip_hdr(skb);
1512         struct sock *nsk;
1513         struct request_sock **prev;
1514         /* Find possible connection requests. */
1515         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1516                                                        iph->saddr, iph->daddr);
1517         if (req)
1518                 return tcp_check_req(sk, skb, req, prev);
1519
1520         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1521                         th->source, iph->daddr, th->dest, inet_iif(skb));
1522
1523         if (nsk) {
1524                 if (nsk->sk_state != TCP_TIME_WAIT) {
1525                         bh_lock_sock(nsk);
1526                         return nsk;
1527                 }
1528                 inet_twsk_put(inet_twsk(nsk));
1529                 return NULL;
1530         }
1531
1532 #ifdef CONFIG_SYN_COOKIES
1533         if (!th->syn)
1534                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1535 #endif
1536         return sk;
1537 }
1538
1539 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1540 {
1541         const struct iphdr *iph = ip_hdr(skb);
1542
1543         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1544                 if (!tcp_v4_check(skb->len, iph->saddr,
1545                                   iph->daddr, skb->csum)) {
1546                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1547                         return 0;
1548                 }
1549         }
1550
1551         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1552                                        skb->len, IPPROTO_TCP, 0);
1553
1554         if (skb->len <= 76) {
1555                 return __skb_checksum_complete(skb);
1556         }
1557         return 0;
1558 }
1559
1560
1561 /* The socket must have it's spinlock held when we get
1562  * here.
1563  *
1564  * We have a potential double-lock case here, so even when
1565  * doing backlog processing we use the BH locking scheme.
1566  * This is because we cannot sleep with the original spinlock
1567  * held.
1568  */
1569 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1570 {
1571         struct sock *rsk;
1572 #ifdef CONFIG_TCP_MD5SIG
1573         /*
1574          * We really want to reject the packet as early as possible
1575          * if:
1576          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1577          *  o There is an MD5 option and we're not expecting one
1578          */
1579         if (tcp_v4_inbound_md5_hash(sk, skb))
1580                 goto discard;
1581 #endif
1582
1583         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1584                 sock_rps_save_rxhash(sk, skb->rxhash);
1585                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1586                         rsk = sk;
1587                         goto reset;
1588                 }
1589                 return 0;
1590         }
1591
1592         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1593                 goto csum_err;
1594
1595         if (sk->sk_state == TCP_LISTEN) {
1596                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1597                 if (!nsk)
1598                         goto discard;
1599
1600                 if (nsk != sk) {
1601                         sock_rps_save_rxhash(nsk, skb->rxhash);
1602                         if (tcp_child_process(sk, nsk, skb)) {
1603                                 rsk = nsk;
1604                                 goto reset;
1605                         }
1606                         return 0;
1607                 }
1608         } else
1609                 sock_rps_save_rxhash(sk, skb->rxhash);
1610
1611         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1612                 rsk = sk;
1613                 goto reset;
1614         }
1615         return 0;
1616
1617 reset:
1618         tcp_v4_send_reset(rsk, skb);
1619 discard:
1620         kfree_skb(skb);
1621         /* Be careful here. If this function gets more complicated and
1622          * gcc suffers from register pressure on the x86, sk (in %ebx)
1623          * might be destroyed here. This current version compiles correctly,
1624          * but you have been warned.
1625          */
1626         return 0;
1627
1628 csum_err:
1629         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1630         goto discard;
1631 }
1632 EXPORT_SYMBOL(tcp_v4_do_rcv);
1633
1634 /*
1635  *      From tcp_input.c
1636  */
1637
1638 int tcp_v4_rcv(struct sk_buff *skb)
1639 {
1640         const struct iphdr *iph;
1641         struct tcphdr *th;
1642         struct sock *sk;
1643         int ret;
1644         struct net *net = dev_net(skb->dev);
1645
1646         if (skb->pkt_type != PACKET_HOST)
1647                 goto discard_it;
1648
1649         /* Count it even if it's bad */
1650         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1651
1652         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1653                 goto discard_it;
1654
1655         th = tcp_hdr(skb);
1656
1657         if (th->doff < sizeof(struct tcphdr) / 4)
1658                 goto bad_packet;
1659         if (!pskb_may_pull(skb, th->doff * 4))
1660                 goto discard_it;
1661
1662         /* An explanation is required here, I think.
1663          * Packet length and doff are validated by header prediction,
1664          * provided case of th->doff==0 is eliminated.
1665          * So, we defer the checks. */
1666         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1667                 goto bad_packet;
1668
1669         th = tcp_hdr(skb);
1670         iph = ip_hdr(skb);
1671         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1672         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1673                                     skb->len - th->doff * 4);
1674         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1675         TCP_SKB_CB(skb)->when    = 0;
1676         TCP_SKB_CB(skb)->flags   = iph->tos;
1677         TCP_SKB_CB(skb)->sacked  = 0;
1678
1679         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1680         if (!sk)
1681                 goto no_tcp_socket;
1682
1683 process:
1684         if (sk->sk_state == TCP_TIME_WAIT)
1685                 goto do_time_wait;
1686
1687         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1688                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1689                 goto discard_and_relse;
1690         }
1691
1692         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1693                 goto discard_and_relse;
1694         nf_reset(skb);
1695
1696         if (sk_filter(sk, skb))
1697                 goto discard_and_relse;
1698
1699         skb->dev = NULL;
1700
1701         bh_lock_sock_nested(sk);
1702         ret = 0;
1703         if (!sock_owned_by_user(sk)) {
1704 #ifdef CONFIG_NET_DMA
1705                 struct tcp_sock *tp = tcp_sk(sk);
1706                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1707                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1708                 if (tp->ucopy.dma_chan)
1709                         ret = tcp_v4_do_rcv(sk, skb);
1710                 else
1711 #endif
1712                 {
1713                         if (!tcp_prequeue(sk, skb))
1714                                 ret = tcp_v4_do_rcv(sk, skb);
1715                 }
1716         } else if (unlikely(sk_add_backlog(sk, skb))) {
1717                 bh_unlock_sock(sk);
1718                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1719                 goto discard_and_relse;
1720         }
1721         bh_unlock_sock(sk);
1722
1723         sock_put(sk);
1724
1725         return ret;
1726
1727 no_tcp_socket:
1728         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1729                 goto discard_it;
1730
1731         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1732 bad_packet:
1733                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1734         } else {
1735                 tcp_v4_send_reset(NULL, skb);
1736         }
1737
1738 discard_it:
1739         /* Discard frame. */
1740         kfree_skb(skb);
1741         return 0;
1742
1743 discard_and_relse:
1744         sock_put(sk);
1745         goto discard_it;
1746
1747 do_time_wait:
1748         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1749                 inet_twsk_put(inet_twsk(sk));
1750                 goto discard_it;
1751         }
1752
1753         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1754                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1755                 inet_twsk_put(inet_twsk(sk));
1756                 goto discard_it;
1757         }
1758         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1759         case TCP_TW_SYN: {
1760                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1761                                                         &tcp_hashinfo,
1762                                                         iph->daddr, th->dest,
1763                                                         inet_iif(skb));
1764                 if (sk2) {
1765                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1766                         inet_twsk_put(inet_twsk(sk));
1767                         sk = sk2;
1768                         goto process;
1769                 }
1770                 /* Fall through to ACK */
1771         }
1772         case TCP_TW_ACK:
1773                 tcp_v4_timewait_ack(sk, skb);
1774                 break;
1775         case TCP_TW_RST:
1776                 goto no_tcp_socket;
1777         case TCP_TW_SUCCESS:;
1778         }
1779         goto discard_it;
1780 }
1781
1782 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1783 {
1784         struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1785         struct inet_sock *inet = inet_sk(sk);
1786         struct inet_peer *peer;
1787
1788         if (!rt ||
1789             inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1790                 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1791                 *release_it = true;
1792         } else {
1793                 if (!rt->peer)
1794                         rt_bind_peer(rt, inet->inet_daddr, 1);
1795                 peer = rt->peer;
1796                 *release_it = false;
1797         }
1798
1799         return peer;
1800 }
1801 EXPORT_SYMBOL(tcp_v4_get_peer);
1802
1803 void *tcp_v4_tw_get_peer(struct sock *sk)
1804 {
1805         struct inet_timewait_sock *tw = inet_twsk(sk);
1806
1807         return inet_getpeer_v4(tw->tw_daddr, 1);
1808 }
1809 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1810
1811 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1812         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1813         .twsk_unique    = tcp_twsk_unique,
1814         .twsk_destructor= tcp_twsk_destructor,
1815         .twsk_getpeer   = tcp_v4_tw_get_peer,
1816 };
1817
1818 const struct inet_connection_sock_af_ops ipv4_specific = {
1819         .queue_xmit        = ip_queue_xmit,
1820         .send_check        = tcp_v4_send_check,
1821         .rebuild_header    = inet_sk_rebuild_header,
1822         .conn_request      = tcp_v4_conn_request,
1823         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1824         .get_peer          = tcp_v4_get_peer,
1825         .net_header_len    = sizeof(struct iphdr),
1826         .setsockopt        = ip_setsockopt,
1827         .getsockopt        = ip_getsockopt,
1828         .addr2sockaddr     = inet_csk_addr2sockaddr,
1829         .sockaddr_len      = sizeof(struct sockaddr_in),
1830         .bind_conflict     = inet_csk_bind_conflict,
1831 #ifdef CONFIG_COMPAT
1832         .compat_setsockopt = compat_ip_setsockopt,
1833         .compat_getsockopt = compat_ip_getsockopt,
1834 #endif
1835 };
1836 EXPORT_SYMBOL(ipv4_specific);
1837
1838 #ifdef CONFIG_TCP_MD5SIG
1839 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1840         .md5_lookup             = tcp_v4_md5_lookup,
1841         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1842         .md5_add                = tcp_v4_md5_add_func,
1843         .md5_parse              = tcp_v4_parse_md5_keys,
1844 };
1845 #endif
1846
1847 /* NOTE: A lot of things set to zero explicitly by call to
1848  *       sk_alloc() so need not be done here.
1849  */
1850 static int tcp_v4_init_sock(struct sock *sk)
1851 {
1852         struct inet_connection_sock *icsk = inet_csk(sk);
1853         struct tcp_sock *tp = tcp_sk(sk);
1854
1855         skb_queue_head_init(&tp->out_of_order_queue);
1856         tcp_init_xmit_timers(sk);
1857         tcp_prequeue_init(tp);
1858
1859         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1860         tp->mdev = TCP_TIMEOUT_INIT;
1861
1862         /* So many TCP implementations out there (incorrectly) count the
1863          * initial SYN frame in their delayed-ACK and congestion control
1864          * algorithms that we must have the following bandaid to talk
1865          * efficiently to them.  -DaveM
1866          */
1867         tp->snd_cwnd = 2;
1868
1869         /* See draft-stevens-tcpca-spec-01 for discussion of the
1870          * initialization of these values.
1871          */
1872         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1873         tp->snd_cwnd_clamp = ~0;
1874         tp->mss_cache = TCP_MSS_DEFAULT;
1875
1876         tp->reordering = sysctl_tcp_reordering;
1877         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1878
1879         sk->sk_state = TCP_CLOSE;
1880
1881         sk->sk_write_space = sk_stream_write_space;
1882         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1883
1884         icsk->icsk_af_ops = &ipv4_specific;
1885         icsk->icsk_sync_mss = tcp_sync_mss;
1886 #ifdef CONFIG_TCP_MD5SIG
1887         tp->af_specific = &tcp_sock_ipv4_specific;
1888 #endif
1889
1890         /* TCP Cookie Transactions */
1891         if (sysctl_tcp_cookie_size > 0) {
1892                 /* Default, cookies without s_data_payload. */
1893                 tp->cookie_values =
1894                         kzalloc(sizeof(*tp->cookie_values),
1895                                 sk->sk_allocation);
1896                 if (tp->cookie_values != NULL)
1897                         kref_init(&tp->cookie_values->kref);
1898         }
1899         /* Presumed zeroed, in order of appearance:
1900          *      cookie_in_always, cookie_out_never,
1901          *      s_data_constant, s_data_in, s_data_out
1902          */
1903         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1904         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1905
1906         local_bh_disable();
1907         percpu_counter_inc(&tcp_sockets_allocated);
1908         local_bh_enable();
1909
1910         return 0;
1911 }
1912
1913 void tcp_v4_destroy_sock(struct sock *sk)
1914 {
1915         struct tcp_sock *tp = tcp_sk(sk);
1916
1917         tcp_clear_xmit_timers(sk);
1918
1919         tcp_cleanup_congestion_control(sk);
1920
1921         /* Cleanup up the write buffer. */
1922         tcp_write_queue_purge(sk);
1923
1924         /* Cleans up our, hopefully empty, out_of_order_queue. */
1925         __skb_queue_purge(&tp->out_of_order_queue);
1926
1927 #ifdef CONFIG_TCP_MD5SIG
1928         /* Clean up the MD5 key list, if any */
1929         if (tp->md5sig_info) {
1930                 tcp_v4_clear_md5_list(sk);
1931                 kfree(tp->md5sig_info);
1932                 tp->md5sig_info = NULL;
1933         }
1934 #endif
1935
1936 #ifdef CONFIG_NET_DMA
1937         /* Cleans up our sk_async_wait_queue */
1938         __skb_queue_purge(&sk->sk_async_wait_queue);
1939 #endif
1940
1941         /* Clean prequeue, it must be empty really */
1942         __skb_queue_purge(&tp->ucopy.prequeue);
1943
1944         /* Clean up a referenced TCP bind bucket. */
1945         if (inet_csk(sk)->icsk_bind_hash)
1946                 inet_put_port(sk);
1947
1948         /*
1949          * If sendmsg cached page exists, toss it.
1950          */
1951         if (sk->sk_sndmsg_page) {
1952                 __free_page(sk->sk_sndmsg_page);
1953                 sk->sk_sndmsg_page = NULL;
1954         }
1955
1956         /* TCP Cookie Transactions */
1957         if (tp->cookie_values != NULL) {
1958                 kref_put(&tp->cookie_values->kref,
1959                          tcp_cookie_values_release);
1960                 tp->cookie_values = NULL;
1961         }
1962
1963         percpu_counter_dec(&tcp_sockets_allocated);
1964 }
1965 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1966
1967 #ifdef CONFIG_PROC_FS
1968 /* Proc filesystem TCP sock list dumping. */
1969
1970 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1971 {
1972         return hlist_nulls_empty(head) ? NULL :
1973                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1974 }
1975
1976 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1977 {
1978         return !is_a_nulls(tw->tw_node.next) ?
1979                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1980 }
1981
1982 /*
1983  * Get next listener socket follow cur.  If cur is NULL, get first socket
1984  * starting from bucket given in st->bucket; when st->bucket is zero the
1985  * very first socket in the hash table is returned.
1986  */
1987 static void *listening_get_next(struct seq_file *seq, void *cur)
1988 {
1989         struct inet_connection_sock *icsk;
1990         struct hlist_nulls_node *node;
1991         struct sock *sk = cur;
1992         struct inet_listen_hashbucket *ilb;
1993         struct tcp_iter_state *st = seq->private;
1994         struct net *net = seq_file_net(seq);
1995
1996         if (!sk) {
1997                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1998                 spin_lock_bh(&ilb->lock);
1999                 sk = sk_nulls_head(&ilb->head);
2000                 st->offset = 0;
2001                 goto get_sk;
2002         }
2003         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2004         ++st->num;
2005         ++st->offset;
2006
2007         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2008                 struct request_sock *req = cur;
2009
2010                 icsk = inet_csk(st->syn_wait_sk);
2011                 req = req->dl_next;
2012                 while (1) {
2013                         while (req) {
2014                                 if (req->rsk_ops->family == st->family) {
2015                                         cur = req;
2016                                         goto out;
2017                                 }
2018                                 req = req->dl_next;
2019                         }
2020                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2021                                 break;
2022 get_req:
2023                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2024                 }
2025                 sk        = sk_nulls_next(st->syn_wait_sk);
2026                 st->state = TCP_SEQ_STATE_LISTENING;
2027                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2028         } else {
2029                 icsk = inet_csk(sk);
2030                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2031                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2032                         goto start_req;
2033                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2034                 sk = sk_nulls_next(sk);
2035         }
2036 get_sk:
2037         sk_nulls_for_each_from(sk, node) {
2038                 if (!net_eq(sock_net(sk), net))
2039                         continue;
2040                 if (sk->sk_family == st->family) {
2041                         cur = sk;
2042                         goto out;
2043                 }
2044                 icsk = inet_csk(sk);
2045                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2046                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2047 start_req:
2048                         st->uid         = sock_i_uid(sk);
2049                         st->syn_wait_sk = sk;
2050                         st->state       = TCP_SEQ_STATE_OPENREQ;
2051                         st->sbucket     = 0;
2052                         goto get_req;
2053                 }
2054                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2055         }
2056         spin_unlock_bh(&ilb->lock);
2057         st->offset = 0;
2058         if (++st->bucket < INET_LHTABLE_SIZE) {
2059                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2060                 spin_lock_bh(&ilb->lock);
2061                 sk = sk_nulls_head(&ilb->head);
2062                 goto get_sk;
2063         }
2064         cur = NULL;
2065 out:
2066         return cur;
2067 }
2068
2069 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2070 {
2071         struct tcp_iter_state *st = seq->private;
2072         void *rc;
2073
2074         st->bucket = 0;
2075         st->offset = 0;
2076         rc = listening_get_next(seq, NULL);
2077
2078         while (rc && *pos) {
2079                 rc = listening_get_next(seq, rc);
2080                 --*pos;
2081         }
2082         return rc;
2083 }
2084
2085 static inline int empty_bucket(struct tcp_iter_state *st)
2086 {
2087         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2088                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2089 }
2090
2091 /*
2092  * Get first established socket starting from bucket given in st->bucket.
2093  * If st->bucket is zero, the very first socket in the hash is returned.
2094  */
2095 static void *established_get_first(struct seq_file *seq)
2096 {
2097         struct tcp_iter_state *st = seq->private;
2098         struct net *net = seq_file_net(seq);
2099         void *rc = NULL;
2100
2101         st->offset = 0;
2102         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2103                 struct sock *sk;
2104                 struct hlist_nulls_node *node;
2105                 struct inet_timewait_sock *tw;
2106                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2107
2108                 /* Lockless fast path for the common case of empty buckets */
2109                 if (empty_bucket(st))
2110                         continue;
2111
2112                 spin_lock_bh(lock);
2113                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2114                         if (sk->sk_family != st->family ||
2115                             !net_eq(sock_net(sk), net)) {
2116                                 continue;
2117                         }
2118                         rc = sk;
2119                         goto out;
2120                 }
2121                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2122                 inet_twsk_for_each(tw, node,
2123                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2124                         if (tw->tw_family != st->family ||
2125                             !net_eq(twsk_net(tw), net)) {
2126                                 continue;
2127                         }
2128                         rc = tw;
2129                         goto out;
2130                 }
2131                 spin_unlock_bh(lock);
2132                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2133         }
2134 out:
2135         return rc;
2136 }
2137
2138 static void *established_get_next(struct seq_file *seq, void *cur)
2139 {
2140         struct sock *sk = cur;
2141         struct inet_timewait_sock *tw;
2142         struct hlist_nulls_node *node;
2143         struct tcp_iter_state *st = seq->private;
2144         struct net *net = seq_file_net(seq);
2145
2146         ++st->num;
2147         ++st->offset;
2148
2149         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2150                 tw = cur;
2151                 tw = tw_next(tw);
2152 get_tw:
2153                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2154                         tw = tw_next(tw);
2155                 }
2156                 if (tw) {
2157                         cur = tw;
2158                         goto out;
2159                 }
2160                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2161                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2162
2163                 /* Look for next non empty bucket */
2164                 st->offset = 0;
2165                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2166                                 empty_bucket(st))
2167                         ;
2168                 if (st->bucket > tcp_hashinfo.ehash_mask)
2169                         return NULL;
2170
2171                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2172                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2173         } else
2174                 sk = sk_nulls_next(sk);
2175
2176         sk_nulls_for_each_from(sk, node) {
2177                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2178                         goto found;
2179         }
2180
2181         st->state = TCP_SEQ_STATE_TIME_WAIT;
2182         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2183         goto get_tw;
2184 found:
2185         cur = sk;
2186 out:
2187         return cur;
2188 }
2189
2190 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2191 {
2192         struct tcp_iter_state *st = seq->private;
2193         void *rc;
2194
2195         st->bucket = 0;
2196         rc = established_get_first(seq);
2197
2198         while (rc && pos) {
2199                 rc = established_get_next(seq, rc);
2200                 --pos;
2201         }
2202         return rc;
2203 }
2204
2205 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2206 {
2207         void *rc;
2208         struct tcp_iter_state *st = seq->private;
2209
2210         st->state = TCP_SEQ_STATE_LISTENING;
2211         rc        = listening_get_idx(seq, &pos);
2212
2213         if (!rc) {
2214                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2215                 rc        = established_get_idx(seq, pos);
2216         }
2217
2218         return rc;
2219 }
2220
2221 static void *tcp_seek_last_pos(struct seq_file *seq)
2222 {
2223         struct tcp_iter_state *st = seq->private;
2224         int offset = st->offset;
2225         int orig_num = st->num;
2226         void *rc = NULL;
2227
2228         switch (st->state) {
2229         case TCP_SEQ_STATE_OPENREQ:
2230         case TCP_SEQ_STATE_LISTENING:
2231                 if (st->bucket >= INET_LHTABLE_SIZE)
2232                         break;
2233                 st->state = TCP_SEQ_STATE_LISTENING;
2234                 rc = listening_get_next(seq, NULL);
2235                 while (offset-- && rc)
2236                         rc = listening_get_next(seq, rc);
2237                 if (rc)
2238                         break;
2239                 st->bucket = 0;
2240                 /* Fallthrough */
2241         case TCP_SEQ_STATE_ESTABLISHED:
2242         case TCP_SEQ_STATE_TIME_WAIT:
2243                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2244                 if (st->bucket > tcp_hashinfo.ehash_mask)
2245                         break;
2246                 rc = established_get_first(seq);
2247                 while (offset-- && rc)
2248                         rc = established_get_next(seq, rc);
2249         }
2250
2251         st->num = orig_num;
2252
2253         return rc;
2254 }
2255
2256 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2257 {
2258         struct tcp_iter_state *st = seq->private;
2259         void *rc;
2260
2261         if (*pos && *pos == st->last_pos) {
2262                 rc = tcp_seek_last_pos(seq);
2263                 if (rc)
2264                         goto out;
2265         }
2266
2267         st->state = TCP_SEQ_STATE_LISTENING;
2268         st->num = 0;
2269         st->bucket = 0;
2270         st->offset = 0;
2271         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2272
2273 out:
2274         st->last_pos = *pos;
2275         return rc;
2276 }
2277
2278 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2279 {
2280         struct tcp_iter_state *st = seq->private;
2281         void *rc = NULL;
2282
2283         if (v == SEQ_START_TOKEN) {
2284                 rc = tcp_get_idx(seq, 0);
2285                 goto out;
2286         }
2287
2288         switch (st->state) {
2289         case TCP_SEQ_STATE_OPENREQ:
2290         case TCP_SEQ_STATE_LISTENING:
2291                 rc = listening_get_next(seq, v);
2292                 if (!rc) {
2293                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2294                         st->bucket = 0;
2295                         st->offset = 0;
2296                         rc        = established_get_first(seq);
2297                 }
2298                 break;
2299         case TCP_SEQ_STATE_ESTABLISHED:
2300         case TCP_SEQ_STATE_TIME_WAIT:
2301                 rc = established_get_next(seq, v);
2302                 break;
2303         }
2304 out:
2305         ++*pos;
2306         st->last_pos = *pos;
2307         return rc;
2308 }
2309
2310 static void tcp_seq_stop(struct seq_file *seq, void *v)
2311 {
2312         struct tcp_iter_state *st = seq->private;
2313
2314         switch (st->state) {
2315         case TCP_SEQ_STATE_OPENREQ:
2316                 if (v) {
2317                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2318                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2319                 }
2320         case TCP_SEQ_STATE_LISTENING:
2321                 if (v != SEQ_START_TOKEN)
2322                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2323                 break;
2324         case TCP_SEQ_STATE_TIME_WAIT:
2325         case TCP_SEQ_STATE_ESTABLISHED:
2326                 if (v)
2327                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2328                 break;
2329         }
2330 }
2331
2332 static int tcp_seq_open(struct inode *inode, struct file *file)
2333 {
2334         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2335         struct tcp_iter_state *s;
2336         int err;
2337
2338         err = seq_open_net(inode, file, &afinfo->seq_ops,
2339                           sizeof(struct tcp_iter_state));
2340         if (err < 0)
2341                 return err;
2342
2343         s = ((struct seq_file *)file->private_data)->private;
2344         s->family               = afinfo->family;
2345         s->last_pos             = 0;
2346         return 0;
2347 }
2348
2349 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2350 {
2351         int rc = 0;
2352         struct proc_dir_entry *p;
2353
2354         afinfo->seq_fops.open           = tcp_seq_open;
2355         afinfo->seq_fops.read           = seq_read;
2356         afinfo->seq_fops.llseek         = seq_lseek;
2357         afinfo->seq_fops.release        = seq_release_net;
2358
2359         afinfo->seq_ops.start           = tcp_seq_start;
2360         afinfo->seq_ops.next            = tcp_seq_next;
2361         afinfo->seq_ops.stop            = tcp_seq_stop;
2362
2363         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2364                              &afinfo->seq_fops, afinfo);
2365         if (!p)
2366                 rc = -ENOMEM;
2367         return rc;
2368 }
2369 EXPORT_SYMBOL(tcp_proc_register);
2370
2371 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2372 {
2373         proc_net_remove(net, afinfo->name);
2374 }
2375 EXPORT_SYMBOL(tcp_proc_unregister);
2376
2377 static void get_openreq4(struct sock *sk, struct request_sock *req,
2378                          struct seq_file *f, int i, int uid, int *len)
2379 {
2380         const struct inet_request_sock *ireq = inet_rsk(req);
2381         int ttd = req->expires - jiffies;
2382
2383         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2384                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2385                 i,
2386                 ireq->loc_addr,
2387                 ntohs(inet_sk(sk)->inet_sport),
2388                 ireq->rmt_addr,
2389                 ntohs(ireq->rmt_port),
2390                 TCP_SYN_RECV,
2391                 0, 0, /* could print option size, but that is af dependent. */
2392                 1,    /* timers active (only the expire timer) */
2393                 jiffies_to_clock_t(ttd),
2394                 req->retrans,
2395                 uid,
2396                 0,  /* non standard timer */
2397                 0, /* open_requests have no inode */
2398                 atomic_read(&sk->sk_refcnt),
2399                 req,
2400                 len);
2401 }
2402
2403 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2404 {
2405         int timer_active;
2406         unsigned long timer_expires;
2407         struct tcp_sock *tp = tcp_sk(sk);
2408         const struct inet_connection_sock *icsk = inet_csk(sk);
2409         struct inet_sock *inet = inet_sk(sk);
2410         __be32 dest = inet->inet_daddr;
2411         __be32 src = inet->inet_rcv_saddr;
2412         __u16 destp = ntohs(inet->inet_dport);
2413         __u16 srcp = ntohs(inet->inet_sport);
2414         int rx_queue;
2415
2416         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2417                 timer_active    = 1;
2418                 timer_expires   = icsk->icsk_timeout;
2419         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2420                 timer_active    = 4;
2421                 timer_expires   = icsk->icsk_timeout;
2422         } else if (timer_pending(&sk->sk_timer)) {
2423                 timer_active    = 2;
2424                 timer_expires   = sk->sk_timer.expires;
2425         } else {
2426                 timer_active    = 0;
2427                 timer_expires = jiffies;
2428         }
2429
2430         if (sk->sk_state == TCP_LISTEN)
2431                 rx_queue = sk->sk_ack_backlog;
2432         else
2433                 /*
2434                  * because we dont lock socket, we might find a transient negative value
2435                  */
2436                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2437
2438         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2439                         "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2440                 i, src, srcp, dest, destp, sk->sk_state,
2441                 tp->write_seq - tp->snd_una,
2442                 rx_queue,
2443                 timer_active,
2444                 jiffies_to_clock_t(timer_expires - jiffies),
2445                 icsk->icsk_retransmits,
2446                 sock_i_uid(sk),
2447                 icsk->icsk_probes_out,
2448                 sock_i_ino(sk),
2449                 atomic_read(&sk->sk_refcnt), sk,
2450                 jiffies_to_clock_t(icsk->icsk_rto),
2451                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2452                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2453                 tp->snd_cwnd,
2454                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2455                 len);
2456 }
2457
2458 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2459                                struct seq_file *f, int i, int *len)
2460 {
2461         __be32 dest, src;
2462         __u16 destp, srcp;
2463         int ttd = tw->tw_ttd - jiffies;
2464
2465         if (ttd < 0)
2466                 ttd = 0;
2467
2468         dest  = tw->tw_daddr;
2469         src   = tw->tw_rcv_saddr;
2470         destp = ntohs(tw->tw_dport);
2471         srcp  = ntohs(tw->tw_sport);
2472
2473         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2474                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2475                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2476                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2477                 atomic_read(&tw->tw_refcnt), tw, len);
2478 }
2479
2480 #define TMPSZ 150
2481
2482 static int tcp4_seq_show(struct seq_file *seq, void *v)
2483 {
2484         struct tcp_iter_state *st;
2485         int len;
2486
2487         if (v == SEQ_START_TOKEN) {
2488                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2489                            "  sl  local_address rem_address   st tx_queue "
2490                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2491                            "inode");
2492                 goto out;
2493         }
2494         st = seq->private;
2495
2496         switch (st->state) {
2497         case TCP_SEQ_STATE_LISTENING:
2498         case TCP_SEQ_STATE_ESTABLISHED:
2499                 get_tcp4_sock(v, seq, st->num, &len);
2500                 break;
2501         case TCP_SEQ_STATE_OPENREQ:
2502                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2503                 break;
2504         case TCP_SEQ_STATE_TIME_WAIT:
2505                 get_timewait4_sock(v, seq, st->num, &len);
2506                 break;
2507         }
2508         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2509 out:
2510         return 0;
2511 }
2512
2513 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2514         .name           = "tcp",
2515         .family         = AF_INET,
2516         .seq_fops       = {
2517                 .owner          = THIS_MODULE,
2518         },
2519         .seq_ops        = {
2520                 .show           = tcp4_seq_show,
2521         },
2522 };
2523
2524 static int __net_init tcp4_proc_init_net(struct net *net)
2525 {
2526         return tcp_proc_register(net, &tcp4_seq_afinfo);
2527 }
2528
2529 static void __net_exit tcp4_proc_exit_net(struct net *net)
2530 {
2531         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2532 }
2533
2534 static struct pernet_operations tcp4_net_ops = {
2535         .init = tcp4_proc_init_net,
2536         .exit = tcp4_proc_exit_net,
2537 };
2538
2539 int __init tcp4_proc_init(void)
2540 {
2541         return register_pernet_subsys(&tcp4_net_ops);
2542 }
2543
2544 void tcp4_proc_exit(void)
2545 {
2546         unregister_pernet_subsys(&tcp4_net_ops);
2547 }
2548 #endif /* CONFIG_PROC_FS */
2549
2550 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2551 {
2552         const struct iphdr *iph = skb_gro_network_header(skb);
2553
2554         switch (skb->ip_summed) {
2555         case CHECKSUM_COMPLETE:
2556                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2557                                   skb->csum)) {
2558                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2559                         break;
2560                 }
2561
2562                 /* fall through */
2563         case CHECKSUM_NONE:
2564                 NAPI_GRO_CB(skb)->flush = 1;
2565                 return NULL;
2566         }
2567
2568         return tcp_gro_receive(head, skb);
2569 }
2570
2571 int tcp4_gro_complete(struct sk_buff *skb)
2572 {
2573         const struct iphdr *iph = ip_hdr(skb);
2574         struct tcphdr *th = tcp_hdr(skb);
2575
2576         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2577                                   iph->saddr, iph->daddr, 0);
2578         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2579
2580         return tcp_gro_complete(skb);
2581 }
2582
2583 struct proto tcp_prot = {
2584         .name                   = "TCP",
2585         .owner                  = THIS_MODULE,
2586         .close                  = tcp_close,
2587         .connect                = tcp_v4_connect,
2588         .disconnect             = tcp_disconnect,
2589         .accept                 = inet_csk_accept,
2590         .ioctl                  = tcp_ioctl,
2591         .init                   = tcp_v4_init_sock,
2592         .destroy                = tcp_v4_destroy_sock,
2593         .shutdown               = tcp_shutdown,
2594         .setsockopt             = tcp_setsockopt,
2595         .getsockopt             = tcp_getsockopt,
2596         .recvmsg                = tcp_recvmsg,
2597         .sendmsg                = tcp_sendmsg,
2598         .sendpage               = tcp_sendpage,
2599         .backlog_rcv            = tcp_v4_do_rcv,
2600         .hash                   = inet_hash,
2601         .unhash                 = inet_unhash,
2602         .get_port               = inet_csk_get_port,
2603         .enter_memory_pressure  = tcp_enter_memory_pressure,
2604         .sockets_allocated      = &tcp_sockets_allocated,
2605         .orphan_count           = &tcp_orphan_count,
2606         .memory_allocated       = &tcp_memory_allocated,
2607         .memory_pressure        = &tcp_memory_pressure,
2608         .sysctl_mem             = sysctl_tcp_mem,
2609         .sysctl_wmem            = sysctl_tcp_wmem,
2610         .sysctl_rmem            = sysctl_tcp_rmem,
2611         .max_header             = MAX_TCP_HEADER,
2612         .obj_size               = sizeof(struct tcp_sock),
2613         .slab_flags             = SLAB_DESTROY_BY_RCU,
2614         .twsk_prot              = &tcp_timewait_sock_ops,
2615         .rsk_prot               = &tcp_request_sock_ops,
2616         .h.hashinfo             = &tcp_hashinfo,
2617         .no_autobind            = true,
2618 #ifdef CONFIG_COMPAT
2619         .compat_setsockopt      = compat_tcp_setsockopt,
2620         .compat_getsockopt      = compat_tcp_getsockopt,
2621 #endif
2622 };
2623 EXPORT_SYMBOL(tcp_prot);
2624
2625
2626 static int __net_init tcp_sk_init(struct net *net)
2627 {
2628         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2629                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2630 }
2631
2632 static void __net_exit tcp_sk_exit(struct net *net)
2633 {
2634         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2635 }
2636
2637 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2638 {
2639         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2640 }
2641
2642 static struct pernet_operations __net_initdata tcp_sk_ops = {
2643        .init       = tcp_sk_init,
2644        .exit       = tcp_sk_exit,
2645        .exit_batch = tcp_sk_exit_batch,
2646 };
2647
2648 void __init tcp_v4_init(void)
2649 {
2650         inet_hashinfo_init(&tcp_hashinfo);
2651         if (register_pernet_subsys(&tcp_sk_ops))
2652                 panic("Failed to create the TCP control socket.\n");
2653 }