/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 *		IPv4 specific functions
 *
 *		Code split from:
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller	:	Change semantics of established hash;
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen	:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen	:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *		Juan Jose Ciarlante:	ip_dynaddr bits
 *		Andi Kleen	:	various fixes.
 *		Vitaly E. Lavrov:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Fix new listen.
 *		Andi Kleen	:	Fix accept error reporting.
 *		YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option,
 *		Alexey Kuznetsov		which allows both IPv4 and IPv6
 *						sockets to bind a single port at
 *						the same time.
 */
#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/inet_hashtables.h>
#include <net/transp_v6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/netdma.h>
#include <net/secure_seq.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>
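
/* Both knobs below are exposed as /proc/sys/net/ipv4/tcp_tw_reuse and
 * /proc/sys/net/ipv4/tcp_low_latency (see sysctl_net_ipv4.c). */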
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);
#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
						   __be32 addr);
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, struct tcphdr *th);
#else
static inline
struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	return NULL;
}
#endif
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint of data integrity.
	   Even without PAWS it is safe provided sequence spaces do not
	   overlap, i.e. at data rates <= 80 Mbit/sec.

	   Actually, the idea is close to VJ's: only the timestamp cache
	   is held not per host but per port pair, and the TW bucket is
	   used as the state holder.

	   If the TW bucket has already been destroyed we fall back to
	   VJ's scheme and use the initial timestamp retrieved from the
	   peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
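		/* Explanatory note: the "+ 65535 + 2" starts the new
		 * incarnation's sequence space past the largest receive
		 * window the old connection could have advertised without
		 * window scaling (64K), plus a little slack, so old and
		 * new sequence spaces cannot overlap. */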
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk, true);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state. */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
		struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
		/*
		 * VJ's idea. We save the last timestamp seen from
		 * the destination in the peer table when entering
		 * TIME-WAIT state, and initialize rx_opt.ts_recent
		 * from it when trying a new connection.
		 */
		if (peer) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
				tp->rx_opt.ts_recent = peer->tcp_ts;
			}
		}
	}

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However, we set the state to SYN-SENT and, without releasing
	 * the socket lock, select a source port, enter ourselves into
	 * the hash tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
/*
 * This routine does path MTU discovery as defined in RFC 1191.
 */
static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests
	 * (SYN-ACKs sent out by Linux are always < 576 bytes, so they
	 * should go through unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the dst entry whether PMTU discovery is
	 * forbidden on this route. We just assume that no packet-too-big
	 * packets are sent back when PMTU discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to go wrong... Remember the soft error
	 * for the case that this connection fails to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);
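	/* Explanatory note: icsk_pmtu_cookie caches the path MTU the MSS
	 * was last synced against, so the check below only shrinks the
	 * MSS when the route's MTU has actually become smaller. */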
	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment
 * header points to the first 8 bytes of the tcp header. We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
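
/* Worked example: ICMP type ICMP_DEST_UNREACH (3) with code
 * ICMP_PORT_UNREACH (3) encodes as (3 << 8) | 3 = 0x303; in the
 * handler below icmp_err_convert[] then maps the code to an errno
 * (ECONNREFUSED for a port unreachable). */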
void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	__u32 seq;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC 1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* Check whether this ICMP allows reverting the backoff
		 * (see draft-zimmermann-tcp-lcd). */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
					 icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now. */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		 * an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:	/* Cannot happen.
				 * It can, f.e., if SYNs crossed.
				 */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
	 * to be considered hard errors (well, FRAG_FAILED too, but it
	 * is obsoleted by PMTU discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages have
	 * finally lost their original sense (even Linux sends invalid
	 * PORT_UNREACHs).
	 *
	 * Now we are in compliance with the RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
static void __tcp_v4_send_check(struct sk_buff *skb,
				__be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}
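
/* Explanatory note: in the CHECKSUM_PARTIAL branch above, only the
 * pseudo-header sum is written into th->check; csum_start/csum_offset
 * tell the device (or skb_checksum_help()) where to fold in the
 * one's-complement sum of the TCP header and payload on transmit. */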
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	skb->ip_summed = CHECKSUM_PARTIAL;
	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
	return 0;
}
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset?
 *	Answer: if a packet caused an RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP. So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}
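	/* Worked example: for an unacceptable bare SYN (syn = 1, fin = 0,
	 * no payload, so skb->len == th->doff * 4), the expression above
	 * reduces to seq + 1, i.e. the RST acknowledges exactly the one
	 * sequence number the SYN consumed. */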
	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->saddr) : NULL;
	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When the socket is gone, all binding information is lost.
	 * Routing might fail in this case. There is no choice here: if
	 * we choose to force the input interface, we will misroute in
	 * case of an asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	net = dev_net(skb_dst(skb)->dev);
	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
}
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside socket context, is certainly ugly. What can I do?
 */
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;

	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
			);

	inet_twsk_put(tw);
}
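
/* Explanatory note on the "+ 1"s below: a SYN occupies one unit of
 * sequence space, so an ACK sent on behalf of a SYN-RECV request
 * acknowledges rcv_isn + 1 and uses snt_isn + 1 (the sequence number
 * following our SYN-ACK's SYN) as its own sequence. */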
static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent,
			0,
			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      struct request_values *rvp)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, rvp);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	dst_release(dst);
	return err;
}

static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
			     struct request_values *rvp)
{
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
	return tcp_v4_send_synack(sk, NULL, req, rvp);
}
/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

static void syn_flood_warning(const struct sk_buff *skb)
{
	const char *msg;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies)
		msg = "Sending cookies";
	else
#endif
		msg = "Dropping request";

	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
		ntohs(tcp_hdr(skb)->dest), msg);
}
/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
						  struct sk_buff *skb)
{
	const struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options_rcu *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = sizeof(*dopt) + opt->optlen;

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(&dopt->opt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC 2385 MD5 checksumming requires a mapping of
 * IP address -> MD5 key.
 * We need to maintain these in the sk structure.
 */

/* Find the key structure for an address. */
static struct tcp_md5sig_key *
			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
		return NULL;
	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr)
			return &tp->md5sig_info->keys4[i].base;
	}
	return NULL;
}
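
/* Explanatory note: the lookup above walks keys4[] linearly for every
 * signed segment; with the small number of peers typically configured
 * (e.g. a router's BGP neighbors), that stays cheap in practice. */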
struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}
/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
		      u8 *newkey, u8 newkeylen)
{
	/* Add key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp4_md5sig_key *keys;

	key = tcp_v4_md5_do_lookup(sk, addr);
	if (key) {
		/* Pre-existing entry - just update that one. */
		kfree(key->key);
		key->key = newkey;
		key->keylen = newkeylen;
	} else {
		struct tcp_md5sig_info *md5sig;

		if (!tp->md5sig_info) {
			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
						  GFP_ATOMIC);
			if (!tp->md5sig_info) {
				kfree(newkey);
				return -ENOMEM;
			}
			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		}

		md5sig = tp->md5sig_info;
		if (md5sig->entries4 == 0 &&
		    tcp_alloc_md5sig_pool(sk) == NULL) {
			kfree(newkey);
			return -ENOMEM;
		}

		if (md5sig->alloced4 == md5sig->entries4) {
			keys = kmalloc((sizeof(*keys) *
					(md5sig->entries4 + 1)), GFP_ATOMIC);
			if (!keys) {
				kfree(newkey);
				if (md5sig->entries4 == 0)
					tcp_free_md5sig_pool();
				return -ENOMEM;
			}

			if (md5sig->entries4)
				memcpy(keys, md5sig->keys4,
				       sizeof(*keys) * md5sig->entries4);

			/* Free old key list, and reference new one */
			kfree(md5sig->keys4);
			md5sig->keys4 = keys;
			md5sig->alloced4++;
		}
		md5sig->entries4++;
		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
	}
	return 0;
}
EXPORT_SYMBOL(tcp_v4_md5_do_add);
static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
			       u8 *newkey, u8 newkeylen)
{
	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
				 newkey, newkeylen);
}
int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr) {
			/* Free the key */
			kfree(tp->md5sig_info->keys4[i].base.key);
			tp->md5sig_info->entries4--;

			if (tp->md5sig_info->entries4 == 0) {
				kfree(tp->md5sig_info->keys4);
				tp->md5sig_info->keys4 = NULL;
				tp->md5sig_info->alloced4 = 0;
				tcp_free_md5sig_pool();
			} else if (tp->md5sig_info->entries4 != i) {
				/* Hole in the middle: close it by
				 * shifting the tail entries down. */
				memmove(&tp->md5sig_info->keys4[i],
					&tp->md5sig_info->keys4[i+1],
					(tp->md5sig_info->entries4 - i) *
					 sizeof(struct tcp4_md5sig_key));
			}
			return 0;
		}
	}
	return -ENOENT;
}
EXPORT_SYMBOL(tcp_v4_md5_do_del);
static void tcp_v4_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Free each key, then the key array itself, and then drop
	 * our hold on the last-resort crypto pool.
	 */
	if (tp->md5sig_info->entries4) {
		int i;
		for (i = 0; i < tp->md5sig_info->entries4; i++)
			kfree(tp->md5sig_info->keys4[i].base.key);
		tp->md5sig_info->entries4 = 0;
		tcp_free_md5sig_pool();
	}
	if (tp->md5sig_info->keys4) {
		kfree(tp->md5sig_info->keys4);
		tp->md5sig_info->keys4 = NULL;
		tp->md5sig_info->alloced4 = 0;
	}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 *newkey;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
		if (!tcp_sk(sk)->md5sig_info)
			return -ENOENT;
		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
	}

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	if (!tcp_sk(sk)->md5sig_info) {
		struct tcp_sock *tp = tcp_sk(sk);
		struct tcp_md5sig_info *p;

		p = kzalloc(sizeof(*p), sk->sk_allocation);
		if (!p)
			return -ENOMEM;

		tp->md5sig_info = p;
		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
	}

	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
	if (!newkey)
		return -ENOMEM;
	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
				 newkey, cmd.tcpm_keylen);
}
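
/* For reference, a minimal userspace sketch of driving this setsockopt
 * path (illustrative only; error handling elided, the peer address
 * "192.0.2.1" and the key are made-up values):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 for an address deletes its key via
 * tcp_v4_md5_do_del() above.
 */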
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			struct sock *sk, struct request_sock *req,
			struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->loc_addr;
		daddr = inet_rsk(req)->rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives,
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	__u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return 0;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return 1;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return 1;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		if (net_ratelimit()) {
			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
			       &iph->saddr, ntohs(th->source),
			       &iph->daddr, ntohs(th->dest),
			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
		}
		return 1;
	}
	return 0;
}
#endif
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
};
#endif
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_extend_values tmp_ext;
	struct tcp_options_received tmp_opt;
	u8 *hash_location;
	struct request_sock *req;
	struct inet_request_sock *ireq;
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = NULL;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer SYNs sent to broadcast or multicast. */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations: they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
		if (net_ratelimit())
			syn_flood_warning(skb);
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * warm entries in the syn queue, drop the request. It is better
	 * than clogging the syn queue with openreqs with exponentially
	 * increasing timeouts.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

	if (tmp_opt.cookie_plus > 0 &&
	    tmp_opt.saw_tstamp &&
	    !tp->rx_opt.cookie_out_never &&
	    (sysctl_tcp_cookie_size > 0 ||
	     (tp->cookie_values != NULL &&
	      tp->cookie_values->cookie_desired > 0))) {
		u8 *c;
		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
			goto drop_and_release;

		/* Secret recipe starts with IP addresses */
		*mess++ ^= (__force u32)daddr;
		*mess++ ^= (__force u32)saddr;

		/* plus variable length Initiator Cookie */
		c = (u8 *)mess;
		while (l-- > 0)
			*c++ ^= *hash_location++;

#ifdef CONFIG_SYN_COOKIES
		want_cookie = 0;	/* not our kind of cookie */
#endif
		tmp_ext.cookie_out_never = 0; /* false */
		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
	} else if (!tp->rx_opt.cookie_in_always) {
		/* redundant indications, but ensure initialization. */
		tmp_ext.cookie_out_never = 1; /* true */
		tmp_ext.cookie_plus = 0;
	} else {
		goto drop_and_release;
	}
	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie || tmp_opt.tstamp_ok)
		TCP_ECN_create_request(req, tcp_hdr(skb));

	if (want_cookie) {
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
	} else if (!isn) {
		struct inet_peer *peer = NULL;
		struct flowi4 fl4;

		/* VJ's idea. We save the last timestamp seen
		 * from the destination in the peer table when entering
		 * TIME-WAIT state, and check against it before
		 * accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
		    fl4.daddr == saddr &&
		    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies, the last quarter of the
			 * backlog is filled with destinations proven
			 * to be alive. It means that we continue to
			 * communicate with destinations already
			 * remembered at the moment of the synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
				       &saddr, ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (tcp_v4_send_synack(sk, dst, req,
			       (struct request_values *)&tmp_ext) ||
	    want_cookie)
		goto drop_and_free;

	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three-way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->rmt_addr;
	newinet->inet_rcv_saddr = ireq->loc_addr;
	newinet->inet_saddr   = ireq->loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_mtup_init(newsk);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
		if (newkey != NULL)
			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
					  newkey, key->keylen);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	sock_put(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}
static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr,
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, IPPROTO_TCP, 0);
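	/* Explanatory note: for short packets it is cheaper to verify the
	 * checksum right away; longer packets are left for the
	 * copy-and-checksum receive path to fold while copying to user
	 * space. The 76-byte cutoff is an empirical break-even point. */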
	if (skb->len <= 76) {
		return __skb_checksum_complete(skb);
	}
	return 0;
}


/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		sock_rps_save_rxhash(sk, skb->rxhash);
		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
			rsk = sk;
			goto reset;
		}
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb->rxhash);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb->rxhash);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto bad_packet;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;
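	/* Worked example: for a bare SYN (syn = 1, fin = 0, no payload,
	 * skb->len == th->doff * 4), end_seq comes out as seq + 1; SYN
	 * and FIN each occupy one unit of sequence space. */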
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else if (unlikely(sk_add_backlog(sk, skb))) {
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
{
	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
	struct inet_sock *inet = inet_sk(sk);
	struct inet_peer *peer;

	if (!rt ||
	    inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
		peer = inet_getpeer_v4(inet->inet_daddr, 1);
		*release_it = true;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, inet->inet_daddr, 1);
		peer = rt->peer;
		*release_it = false;
	}

	return peer;
}
EXPORT_SYMBOL(tcp_v4_get_peer);

void *tcp_v4_tw_get_peer(struct sock *sk)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);

	return inet_getpeer_v4(tw->tw_daddr, 1);
}
EXPORT_SYMBOL(tcp_v4_tw_get_peer);
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
	.twsk_getpeer	= tcp_v4_tw_get_peer,
};

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.get_peer	   = tcp_v4_get_peer,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
};
EXPORT_SYMBOL(ipv4_specific);
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_add	= tcp_v4_md5_add_func,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
/* NOTE: A lot of things are set to zero explicitly by the call to
 *       sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sysctl_tcp_reordering;
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_af_ops = &ipv4_specific;
	icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
	tp->af_specific = &tcp_sock_ipv4_specific;
#endif

	/* TCP Cookie Transactions */
	if (sysctl_tcp_cookie_size > 0) {
		/* Default, cookies without s_data_payload. */
		tp->cookie_values =
			kzalloc(sizeof(*tp->cookie_values),
				sk->sk_allocation);
		if (tp->cookie_values != NULL)
			kref_init(&tp->cookie_values->kref);
	}
	/* Presumed zeroed, in order of appearance:
	 *	cookie_in_always, cookie_out_never,
	 *	s_data_constant, s_data_in, s_data_out
	 */
	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	local_bh_disable();
	percpu_counter_inc(&tcp_sockets_allocated);
	local_bh_enable();

	return 0;
}
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_v4_clear_md5_list(sk);
		kfree(tp->md5sig_info);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Cleans up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/*
	 * If the sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	/* TCP Cookie Transactions */
	if (tp->cookie_values != NULL) {
		kref_put(&tp->cookie_values->kref,
			 tcp_cookie_values_release);
		tp->cookie_values = NULL;
	}

	percpu_counter_dec(&tcp_sockets_allocated);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
{
	return hlist_nulls_empty(head) ? NULL :
		list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
	return !is_a_nulls(tw->tw_node.next) ?
		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}

/*
 * Get the next listener socket following cur. If cur is NULL, get the
 * first socket starting from the bucket given in st->bucket; when
 * st->bucket is zero, the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_nulls_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_nulls_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline int empty_bucket(struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
}
/*
 * Get the first established socket starting from the bucket given in
 * st->bucket. If st->bucket is zero, the very first socket in the hash
 * is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		struct inet_timewait_sock *tw;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket].twchain) {
			if (tw->tw_family != st->family ||
			    !net_eq(twsk_net(tw), net)) {
				continue;
			}
			rc = tw;
			goto out;
		}
		spin_unlock_bh(lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* Look for the next non-empty bucket */
		st->offset = 0;
		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
				empty_bucket(st))
			;
		if (st->bucket > tcp_hashinfo.ehash_mask)
			return NULL;

		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
	} else
		sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family	= afinfo->family;
	s->last_pos	= 0;
	return 0;
}
int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_fops.open		= tcp_seq_open;
	afinfo->seq_fops.read		= seq_read;
	afinfo->seq_fops.llseek		= seq_lseek;
	afinfo->seq_fops.release	= seq_release_net;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     &afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);
void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	proc_net_remove(net, afinfo->name);
}
EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(struct sock *sk, struct request_sock *req,
			 struct seq_file *f, int i, int uid, int *len)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->inet_sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req,
		len);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	if (sk->sk_state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/*
		 * Because we don't lock the socket, we might find a
		 * transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		sock_i_uid(sk),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
		len);
}
static void get_timewait4_sock(struct inet_timewait_sock *tw,
			       struct seq_file *f, int i, int *len)
{
	__be32 dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw, len);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	int len;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, seq, st->num, &len);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, seq, st->num, &len);
		break;
	}
	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
out:
	return 0;
}
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= {
		.owner		= THIS_MODULE,
	},
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};
static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	const struct iphdr *iph = skb_gro_network_header(skb);

	switch (skb->ip_summed) {
	case CHECKSUM_COMPLETE:
		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
				  skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			break;
		}

		/* fall through */
	case CHECKSUM_NONE:
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}

	return tcp_gro_receive(head, skb);
}

int tcp4_gro_complete(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
				  iph->saddr, iph->daddr, 0);
	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;

	return tcp_gro_complete(skb);
}
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
};
EXPORT_SYMBOL(tcp_prot);
static int __net_init tcp_sk_init(struct net *net)
{
	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
}

static void __net_exit tcp_sk_exit(struct net *net)
{
	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}