tcp: TCP Small Queues
net/ipv4/tcp_ipv4.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol (TCP).
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after a
 *                                      year-long coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support the IPV6_V6ONLY socket option,
 *      Alexey Kuznetsov                which allows IPv4 and IPv6 sockets to
 *                                      bind to a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
                                          ip_hdr(skb)->saddr,
                                          tcp_hdr(skb)->dest,
                                          tcp_hdr(skb)->source);
}
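
/* Note: the initial sequence number comes from secure_tcp_sequence_number(),
 * a keyed hash of the connection 4-tuple plus a clock component, in the
 * spirit of RFC 6528, so ISNs are hard to predict for an off-path attacker.
 */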

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's, only the timestamp cache is
           held not per host, but per port pair, and the TW bucket is used
           as the state holder.

           If the TW bucket has already been destroyed we fall back to VJ's
           scheme and use the initial timestamp retrieved from the peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (twp == NULL || (sysctl_tcp_tw_reuse &&
                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                if (tp->write_seq == 0)
                        tp->write_seq = 1;
                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                sock_hold(sktw);
                return 1;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
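
/* Illustration (an assumption for exposition, not from this file): with
 *
 *     sysctl -w net.ipv4.tcp_tw_reuse=1
 *
 * an outgoing connect() may take over a TIME-WAIT bucket for the same port
 * pair once the bucket's last timestamp is more than one second old, and
 * the new write_seq starts safely above tw_snd_nxt (+ 65535 + 2) so the
 * sequence spaces of the old and new incarnations do not overlap.
 */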

static int tcp_repair_connect(struct sock *sk)
{
        tcp_connect_init(sk);
        tcp_finish_connect(sk, NULL);

        return 0;
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __be16 orig_sport, orig_dport;
        __be32 daddr, nexthop;
        struct flowi4 *fl4;
        struct rtable *rt;
        int err;
        struct ip_options_rcu *inet_opt;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        inet_opt = rcu_dereference_protected(inet->inet_opt,
                                             sock_owned_by_user(sk));
        if (inet_opt && inet_opt->opt.srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet_opt->opt.faddr;
        }

        orig_sport = inet->inet_sport;
        orig_dport = usin->sin_port;
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                              IPPROTO_TCP,
                              orig_sport, orig_dport, sk, true);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                if (err == -ENETUNREACH)
                        IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
                return err;
        }

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet_opt || !inet_opt->opt.srr)
                daddr = fl4->daddr;

        if (!inet->inet_saddr)
                inet->inet_saddr = fl4->saddr;
        inet->inet_rcv_saddr = inet->inet_saddr;

        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                if (likely(!tp->repair))
                        tp->write_seq      = 0;
        }

        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
                tcp_fetch_timewait_stamp(sk, &rt->dst);

        inet->inet_dport = usin->sin_port;
        inet->inet_daddr = daddr;

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

        /* Socket identity is still unknown (sport may be zero).
         * However, we set the state to SYN-SENT and, without releasing the
         * socket lock, select a source port, enter ourselves into the hash
         * tables and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
                goto failure;

        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                               inet->inet_sport, inet->inet_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                rt = NULL;
                goto failure;
        }
        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->dst);

        if (!tp->write_seq && likely(!tp->repair))
                tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
                                                           inet->inet_daddr,
                                                           inet->inet_sport,
                                                           usin->sin_port);

        inet->inet_id = tp->write_seq ^ jiffies;

        if (likely(!tp->repair))
                err = tcp_connect(sk);
        else
                err = tcp_repair_connect(sk);

        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
        return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
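
/* Illustrative userspace sketch (standard sockets API, not part of this
 * file; the address and port are made up): a blocking connect() on an
 * AF_INET stream socket is what ultimately reaches tcp_v4_connect() above.
 *
 *     int fd = socket(AF_INET, SOCK_STREAM, 0);
 *     struct sockaddr_in dst = {
 *             .sin_family = AF_INET,
 *             .sin_port   = htons(80),
 *     };
 *     inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *     if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
 *             perror("connect");  // e.g. ENETUNREACH from the route lookup
 */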

/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
{
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);

        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
         * sent out by Linux are always < 576 bytes, so they should go through
         * unfragmented).
         */
        if (sk->sk_state == TCP_LISTEN)
                return;

        /* We don't check in the dst entry whether pmtu discovery is forbidden
         * on this route. We just assume that no "packet too big" messages
         * are sent back when pmtu discovery is not active.
         * There is a small race when the user changes this flag in the
         * route, but I think that's acceptable.
         */
        if ((dst = __sk_dst_check(sk, 0)) == NULL)
                return;

        dst->ops->update_pmtu(dst, mtu);

        /* Something is about to go wrong... Remember the soft error
         * for the case that this connection will not be able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
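
/* Worked example (assumed numbers, for illustration only): suppose
 * icsk_pmtu_cookie is 1500 and an ICMP_FRAG_NEEDED arrives with mtu = 1400.
 * update_pmtu() lowers the cached route MTU, tcp_sync_mss() shrinks the MSS
 * to fit a 1400-byte packet, and tcp_simple_retransmit() resends the
 * too-big segments immediately instead of waiting for the retransmit
 * timer -- the "fast" PMTU discovery path mentioned above.
 */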

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
        struct inet_connection_sock *icsk;
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(icmp_skb)->type;
        const int code = icmp_hdr(icmp_skb)->code;
        struct sock *sk;
        struct sk_buff *skb;
        __u32 seq;
        __u32 remaining;
        int err;
        struct net *net = dev_net(icmp_skb->dev);

        if (icmp_skb->len < (iph->ihl << 2) + 8) {
                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
                return;
        }

        sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
                        iph->saddr, th->source, inet_iif(icmp_skb));
        if (!sk) {
                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sock_owned_by_user(sk))
                NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
                goto out;
        }

        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
        seq = ntohl(th->seq);
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (!sock_owned_by_user(sk))
                                do_pmtu_discovery(sk, iph, info);
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                /* check if icmp_skb allows revert of backoff
                 * (see draft-zimmermann-tcp-lcd) */
                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
                        break;
                if (seq != tp->snd_una || !icsk->icsk_retransmits ||
                    !icsk->icsk_backoff)
                        break;

                if (sock_owned_by_user(sk))
                        break;

                icsk->icsk_backoff--;
                inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
                        TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
                tcp_bound_rto(sk);

                skb = tcp_write_queue_head(sk);
                BUG_ON(!skb);

                remaining = icsk->icsk_rto - min(icsk->icsk_rto,
                                tcp_time_stamp - TCP_SKB_CB(skb)->when);

                if (remaining) {
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  remaining, TCP_RTO_MAX);
                } else {
                        /* RTO revert clocked out the retransmission;
                         * retransmit now. */
                        tcp_retransmit_timer(sk);
                }

                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
                struct request_sock *req, **prev;
        case TCP_LISTEN:
                if (sock_owned_by_user(sk))
                        goto out;

                req = inet_csk_search_req(sk, &prev, th->dest,
                                          iph->daddr, iph->saddr);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get
                   an established socket here.
                 */
                WARN_ON(req->sk);

                if (seq != tcp_rsk(req)->snt_isn) {
                        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen.
                               It could, e.g., if SYNs crossed.
                             */
                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
         * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
         * obsoleted by pmtu discovery).
         *
         * Note that on the modern internet, where routing is unreliable
         * and broken firewalls sit in every dark corner sending random
         * errors at their masters' bidding, even these two messages have
         * finally lost their original sense (even Linux sends invalid
         * PORT_UNREACHs).
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}
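
/* Worked example of the backoff revert above (assumed numbers, for
 * illustration only): say srtt gives a base RTO of 200 ms and icsk_backoff
 * was 3, i.e. the current RTO is 200 << 3 = 1600 ms. A host-unreachable
 * ICMP for the oldest unacked segment decrements the backoff to 2, so the
 * new RTO is 200 << 2 = 800 ms. If 500 ms have already elapsed since the
 * segment was stamped, the timer is re-armed for the remaining 300 ms; had
 * 800 ms or more elapsed, tcp_retransmit_timer() would fire immediately.
 */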

static void __tcp_v4_send_check(struct sk_buff *skb,
                                __be32 saddr, __be32 daddr)
{
        struct tcphdr *th = tcp_hdr(skb);

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(skb->len, saddr, daddr,
                                         csum_partial(th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}
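
/* Note (illustration of the offload split above): in the CHECKSUM_PARTIAL
 * branch, the checksum field is seeded with only the complemented
 * pseudo-header sum; the device (or skb_checksum_help() as a fallback)
 * finishes the one's-complement sum over the TCP header and payload,
 * starting at csum_start and storing the result at csum_offset. The else
 * branch computes the full checksum in software with csum_partial().
 */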

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
        const struct inet_sock *inet = inet_sk(sk);

        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
        const struct iphdr *iph;
        struct tcphdr *th;

        if (!pskb_may_pull(skb, sizeof(*th)))
                return -EINVAL;

        iph = ip_hdr(skb);
        th = tcp_hdr(skb);

        th->check = 0;
        skb->ip_summed = CHECKSUM_PARTIAL;
        __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
        return 0;
}

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *                    for reset.
 *      Answer: if a packet caused an RST, it is not for a socket
 *              existing in our system; if it is matched to a socket,
 *              it is just a duplicate segment or a bug in the other
 *              side's TCP. So we build the reply based only on the
 *              parameters that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
        const __u8 *hash_location = NULL;
        unsigned char newhash[16];
        int genhash;
        struct sock *sk1 = NULL;
#endif
        struct net *net;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        if (skb_rtable(skb)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

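        /* Per RFC 793 reset generation (explanatory note): if the offending
         * segment carried an ACK, the RST takes its sequence number from
         * that ACK field and needs no ACK of its own; otherwise the RST
         * has SEQ=0, sets the ACK bit, and acknowledges everything the
         * segment occupied in sequence space (payload length plus one each
         * for SYN and FIN), as computed below.
         */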
        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
        hash_location = tcp_parse_md5sig_option(th);
        if (!sk && hash_location) {
                /*
                 * The active side is lost. Try to find the listening socket
                 * through the source port, and then find the md5 key through
                 * the listening socket. We do not lose security here:
                 * the incoming packet is checked against the md5 hash of the
                 * key we find, and no RST is generated if the hash doesn't
                 * match.
                 */
                sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
                                             &tcp_hashinfo, ip_hdr(skb)->daddr,
                                             ntohs(th->source), inet_iif(skb));
                /* don't send an RST if we can't find a key */
                if (!sk1)
                        return;
                rcu_read_lock();
                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
                if (!key)
                        goto release_sk1;

                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
                if (genhash || memcmp(hash_location, newhash, 16) != 0)
                        goto release_sk1;
        } else {
                key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
                                             &ip_hdr(skb)->saddr,
                                             AF_INET) : NULL;
        }

        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
                                     key, ip_hdr(skb)->saddr,
                                     ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
        /* When the socket is gone, all binding information is lost and
         * routing might fail in this case. Use iif for oif to
         * make sure we can deliver it.
         */
        arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);

        net = dev_net(skb_dst(skb)->dev);
        arg.tos = ip_hdr(skb)->tos;
        ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
                              ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
        TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
        if (sk1) {
                rcu_read_unlock();
                sock_put(sk1);
        }
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 ts, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags, u8 tos)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct ip_reply_arg arg;
        struct net *net = dev_net(skb_dst(skb)->dev);

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (ts) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tcp_time_stamp);
                rep.opt[2] = htonl(ts);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        if (key) {
                int offset = (ts) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len/4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.flags = reply_flags;
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (oif)
                arg.bound_dev_if = oif;
        arg.tos = tos;
        ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
                              ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcptw->tw_ts_recent,
                        tw->tw_bound_dev_if,
                        tcp_twsk_md5_key(tcptw),
                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        tw->tw_tos
                        );

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
{
        tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
                        tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
                        req->ts_recent,
                        0,
                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
                                          AF_INET),
                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        ip_hdr(skb)->tos);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
                              struct request_sock *req,
                              struct request_values *rvp,
                              u16 queue_mapping,
                              bool nocache)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct flowi4 fl4;
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, nocache)) == NULL)
                return -1;

        skb = tcp_make_synack(sk, dst, req, rvp);

        if (skb) {
                __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);

                skb_set_queue_mapping(skb, queue_mapping);
                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
                                            ireq->rmt_addr,
                                            ireq->opt);
                err = net_xmit_eval(err);
        }

        return err;
}

static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
                              struct request_values *rvp)
{
        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
        return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(inet_rsk(req)->opt);
}

/*
 * Return true if a syncookie should be sent
 */
bool tcp_syn_flood_action(struct sock *sk,
                         const struct sk_buff *skb,
                         const char *proto)
{
        const char *msg = "Dropping request";
        bool want_cookie = false;
        struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
        if (sysctl_tcp_syncookies) {
                msg = "Sending cookies";
                want_cookie = true;
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
        } else
#endif
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

        lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
        if (!lopt->synflood_warned) {
                lopt->synflood_warned = 1;
                pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
                        proto, ntohs(tcp_hdr(skb)->dest), msg);
        }
        return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);
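
/* Operational note (illustration, not from this file): whether cookies are
 * sent is controlled by the syncookies sysctl, e.g.
 *
 *     sysctl -w net.ipv4.tcp_syncookies=1
 *
 * With it on, a full SYN queue degrades to stateless SYN-ACKs instead of
 * silently dropping connection attempts; the SNMP counters incremented
 * above (TCPReqQFullDoCookies / TCPReqQFullDrop) tell the two cases apart
 * in /proc/net/netstat.
 */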

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
                                                  struct sk_buff *skb)
{
        const struct ip_options *opt = &(IPCB(skb)->opt);
        struct ip_options_rcu *dopt = NULL;

        if (opt && opt->optlen) {
                int opt_size = sizeof(*dopt) + opt->optlen;

                dopt = kmalloc(opt_size, GFP_ATOMIC);
                if (dopt) {
                        if (ip_options_echo(&dopt->opt, skb)) {
                                kfree(dopt);
                                dopt = NULL;
                        }
                }
        }
        return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
                                         const union tcp_md5_addr *addr,
                                         int family)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        struct hlist_node *pos;
        unsigned int size = sizeof(struct in_addr);
        struct tcp_md5sig_info *md5sig;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       sock_owned_by_user(sk) ||
                                       lockdep_is_held(&sk->sk_lock.slock));
        if (!md5sig)
                return NULL;
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                size = sizeof(struct in6_addr);
#endif
        hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
                if (key->family != family)
                        continue;
                if (!memcmp(&key->addr, addr, size))
                        return key;
        }
        return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
                                         struct sock *addr_sk)
{
        union tcp_md5_addr *addr;

        addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
        return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
                                                      struct request_sock *req)
{
        union tcp_md5_addr *addr;

        addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
        return tcp_md5_do_lookup(sk, addr, AF_INET);
}

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_info *md5sig;

        key = tcp_md5_do_lookup(sk, addr, family);
        if (key) {
                /* Pre-existing entry - just update that one. */
                memcpy(key->key, newkey, newkeylen);
                key->keylen = newkeylen;
                return 0;
        }

        md5sig = rcu_dereference_protected(tp->md5sig_info,
                                           sock_owned_by_user(sk));
        if (!md5sig) {
                md5sig = kmalloc(sizeof(*md5sig), gfp);
                if (!md5sig)
                        return -ENOMEM;

                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                INIT_HLIST_HEAD(&md5sig->head);
                rcu_assign_pointer(tp->md5sig_info, md5sig);
        }

        key = sock_kmalloc(sk, sizeof(*key), gfp);
        if (!key)
                return -ENOMEM;
        if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
                sock_kfree_s(sk, key, sizeof(*key));
                return -ENOMEM;
        }

        memcpy(key->key, newkey, newkeylen);
        key->keylen = newkeylen;
        key->family = family;
        memcpy(&key->addr, addr,
               (family == AF_INET6) ? sizeof(struct in6_addr) :
                                      sizeof(struct in_addr));
        hlist_add_head_rcu(&key->node, &md5sig->head);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        struct tcp_md5sig_info *md5sig;

        key = tcp_md5_do_lookup(sk, addr, family);
        if (!key)
                return -ENOENT;
        hlist_del_rcu(&key->node);
        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
        kfree_rcu(key, rcu);
        md5sig = rcu_dereference_protected(tp->md5sig_info,
                                           sock_owned_by_user(sk));
        if (hlist_empty(&md5sig->head))
                tcp_free_md5sig_pool();
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

void tcp_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        struct hlist_node *pos, *n;
        struct tcp_md5sig_info *md5sig;

        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

        if (!hlist_empty(&md5sig->head))
                tcp_free_md5sig_pool();
        hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
                hlist_del_rcu(&key->node);
                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
                kfree_rcu(key, rcu);
        }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
                                 int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (!cmd.tcpm_key || !cmd.tcpm_keylen)
                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                                      AF_INET);

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                              AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
                              GFP_KERNEL);
}
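
/* Illustrative userspace sketch (TCP_MD5SIG is the real socket option;
 * the address and key here are made up): installing an RFC 2385 key for
 * a peer lands in tcp_v4_parse_md5_keys() above.
 *
 *     struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *     struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *     peer->sin_family = AF_INET;
 *     inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *     memcpy(md5.tcpm_key, "secret", 6);
 *     setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 deletes the key for that address instead.
 */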

static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
                                        __be32 daddr, __be32 saddr, int nbytes)
{
        struct tcp4_pseudohdr *bp;
        struct scatterlist sg;

        bp = &hp->md5_blk.ip4;

        /*
         * 1. the TCP pseudo-header (in the order: source IP address,
         * destination IP address, zero-padded protocol number, and
         * segment length)
         */
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = IPPROTO_TCP;
        bp->len = cpu_to_be16(nbytes);

        sg_init_one(&sg, bp, sizeof(*bp));
        return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}
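
/* Note (explanatory): RFC 2385 defines the MD5 digest input as, in order,
 * the pseudo-header above, then the TCP header with its checksum field
 * zeroed, then the segment payload, and finally the key itself. The two
 * hashing helpers below feed exactly those pieces to the md5 transform:
 * tcp_v4_md5_hash_hdr() for header-only replies (the RSTs and ACKs built
 * in this file) and tcp_v4_md5_hash_skb() for full segments.
 */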

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
        struct tcp_md5sig_pool *hp;
        struct hash_desc *desc;

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        desc = &hp->md5_desc;

        if (crypto_hash_init(desc))
                goto clear_hash;
        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_header(hp, th))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        if (crypto_hash_final(desc, md5_hash))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
                        const struct sock *sk, const struct request_sock *req,
                        const struct sk_buff *skb)
{
        struct tcp_md5sig_pool *hp;
        struct hash_desc *desc;
        const struct tcphdr *th = tcp_hdr(skb);
        __be32 saddr, daddr;

        if (sk) {
                saddr = inet_sk(sk)->inet_saddr;
                daddr = inet_sk(sk)->inet_daddr;
        } else if (req) {
                saddr = inet_rsk(req)->loc_addr;
                daddr = inet_rsk(req)->rmt_addr;
        } else {
                const struct iphdr *iph = ip_hdr(skb);
                saddr = iph->saddr;
                daddr = iph->daddr;
        }

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        desc = &hp->md5_desc;

        if (crypto_hash_init(desc))
                goto clear_hash;

        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
                goto clear_hash;
        if (tcp_md5_hash_header(hp, th))
                goto clear_hash;
        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        if (crypto_hash_final(desc, md5_hash))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
        /*
         * This gets called for each TCP segment that arrives
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        const __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        const struct iphdr *iph = ip_hdr(skb);
        const struct tcphdr *th = tcp_hdr(skb);
        int genhash;
        unsigned char newhash[16];

        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
                                          AF_INET);
        hash_location = tcp_parse_md5sig_option(th);

        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return false;

        if (hash_expected && !hash_location) {
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
                return true;
        }

        if (!hash_expected && hash_location) {
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
                return true;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_md5_hash_skb(newhash,
                                      hash_expected,
                                      NULL, NULL, skb);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
                                     &iph->saddr, ntohs(th->source),
                                     &iph->daddr, ntohs(th->dest),
                                     genhash ? " tcp_v4_calc_md5_hash failed"
                                     : "");
                return true;
        }
        return false;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_v4_rtx_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
        .syn_ack_timeout =      tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
};
#endif

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_extend_values tmp_ext;
        struct tcp_options_received tmp_opt;
        const u8 *hash_location;
        struct request_sock *req;
        struct inet_request_sock *ireq;
        struct tcp_sock *tp = tcp_sk(sk);
        struct dst_entry *dst = NULL;
        __be32 saddr = ip_hdr(skb)->saddr;
        __be32 daddr = ip_hdr(skb)->daddr;
        __u32 isn = TCP_SKB_CB(skb)->when;
        bool want_cookie = false;

        /* Never answer SYNs sent to broadcast or multicast addresses */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        /* TW buckets are converted to open requests without
         * limitations; they conserve resources and the peer is
         * evidently a real one.
         */
        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
                want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
                if (!want_cookie)
                        goto drop;
        }

        /* The accept backlog is full. If we have already queued enough
         * warm entries in the syn queue, drop the request. That is better
         * than clogging the syn queue with openreqs with exponentially
         * increasing timeouts.
         */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
                goto drop;

        req = inet_reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
                goto drop;

#ifdef CONFIG_TCP_MD5SIG
        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
        tmp_opt.user_mss  = tp->rx_opt.user_mss;
        tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

        if (tmp_opt.cookie_plus > 0 &&
            tmp_opt.saw_tstamp &&
            !tp->rx_opt.cookie_out_never &&
            (sysctl_tcp_cookie_size > 0 ||
             (tp->cookie_values != NULL &&
              tp->cookie_values->cookie_desired > 0))) {
                u8 *c;
                u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
                int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

                if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
                        goto drop_and_release;

                /* Secret recipe starts with IP addresses */
                *mess++ ^= (__force u32)daddr;
                *mess++ ^= (__force u32)saddr;

                /* plus variable length Initiator Cookie */
                c = (u8 *)mess;
                while (l-- > 0)
                        *c++ ^= *hash_location++;

                want_cookie = false;    /* not our kind of cookie */
                tmp_ext.cookie_out_never = 0; /* false */
                tmp_ext.cookie_plus = tmp_opt.cookie_plus;
        } else if (!tp->rx_opt.cookie_in_always) {
                /* redundant indications, but ensure initialization. */
                tmp_ext.cookie_out_never = 1; /* true */
                tmp_ext.cookie_plus = 0;
        } else {
                goto drop_and_release;
        }
        tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

        if (want_cookie && !tmp_opt.saw_tstamp)
                tcp_clear_options(&tmp_opt);

        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
        tcp_openreq_init(req, &tmp_opt, skb);

        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->no_srccheck = inet_sk(sk)->transparent;
        ireq->opt = tcp_v4_save_options(sk, skb);

        if (security_inet_conn_request(sk, skb, req))
                goto drop_and_free;

        if (!want_cookie || tmp_opt.tstamp_ok)
                TCP_ECN_create_request(req, skb);

        if (want_cookie) {
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
                req->cookie_ts = tmp_opt.tstamp_ok;
        } else if (!isn) {
                struct flowi4 fl4;

                /* VJ's idea. We save last timestamp seen
                 * from the destination in peer table, when entering
                 * state TIME-WAIT, and check against it before
                 * accepting new connection request.
                 *
                 * If "isn" is not zero, this request hit alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                if (tmp_opt.saw_tstamp &&
                    tcp_death_row.sysctl_tw_recycle &&
                    (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL &&
                    fl4.daddr == saddr) {
                        if (!tcp_peer_is_proven(req, dst, true)) {
                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
                                goto drop_and_release;
                        }
                }
                /* Kill the following clause, if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
                         !tcp_peer_is_proven(req, dst, false)) {
                        /* Without syncookies, the last quarter of the
                         * backlog is reserved for destinations proven
                         * to be alive. It means that we continue to
                         * communicate with destinations we already
                         * remembered before the moment of the synflood.
                         */
                        LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
                                       &saddr, ntohs(tcp_hdr(skb)->source));
                        goto drop_and_release;
                }

                isn = tcp_v4_init_sequence(skb);
        }
        tcp_rsk(req)->snt_isn = isn;
        tcp_rsk(req)->snt_synack = tcp_time_stamp;

        if (tcp_v4_send_synack(sk, dst, req,
                               (struct request_values *)&tmp_ext,
                               skb_get_queue_mapping(skb),
                               want_cookie) ||
            want_cookie)
                goto drop_and_free;

        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        return 0;

drop_and_release:
        dst_release(dst);
drop_and_free:
        reqsk_free(req);
drop:
        return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
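
/* Worked example of the "last quarter" heuristic above (assumed numbers,
 * for illustration only): if net.ipv4.tcp_max_syn_backlog is 1024 and
 * syncookies are off, new requests from unproven peers are dropped once
 * fewer than 1024 >> 2 = 256 request slots remain free, so a synflood
 * cannot evict destinations we have recently talked to.
 */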
1424
1425
1426 /*
1427  * The three way handshake has completed - we got a valid synack -
1428  * now create the new socket.
1429  */
1430 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1431                                   struct request_sock *req,
1432                                   struct dst_entry *dst)
1433 {
1434         struct inet_request_sock *ireq;
1435         struct inet_sock *newinet;
1436         struct tcp_sock *newtp;
1437         struct sock *newsk;
1438 #ifdef CONFIG_TCP_MD5SIG
1439         struct tcp_md5sig_key *key;
1440 #endif
1441         struct ip_options_rcu *inet_opt;
1442
1443         if (sk_acceptq_is_full(sk))
1444                 goto exit_overflow;
1445
1446         newsk = tcp_create_openreq_child(sk, req, skb);
1447         if (!newsk)
1448                 goto exit_nonewsk;
1449
1450         newsk->sk_gso_type = SKB_GSO_TCPV4;
1451
1452         newtp                 = tcp_sk(newsk);
1453         newinet               = inet_sk(newsk);
1454         ireq                  = inet_rsk(req);
1455         newinet->inet_daddr   = ireq->rmt_addr;
1456         newinet->inet_rcv_saddr = ireq->loc_addr;
1457         newinet->inet_saddr           = ireq->loc_addr;
1458         inet_opt              = ireq->opt;
1459         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1460         ireq->opt             = NULL;
1461         newinet->mc_index     = inet_iif(skb);
1462         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1463         newinet->rcv_tos      = ip_hdr(skb)->tos;
1464         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1465         if (inet_opt)
1466                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1467         newinet->inet_id = newtp->write_seq ^ jiffies;
1468
1469         if (!dst) {
1470                 dst = inet_csk_route_child_sock(sk, newsk, req);
1471                 if (!dst)
1472                         goto put_and_exit;
1473         } else {
1474                 /* syncookie case: see end of cookie_v4_check() */
1475         }
1476         sk_setup_caps(newsk, dst);
1477
1478         tcp_mtup_init(newsk);
1479         tcp_sync_mss(newsk, dst_mtu(dst));
1480         newtp->advmss = dst_metric_advmss(dst);
1481         if (tcp_sk(sk)->rx_opt.user_mss &&
1482             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1483                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1484
1485         tcp_initialize_rcv_mss(newsk);
1486         if (tcp_rsk(req)->snt_synack)
1487                 tcp_valid_rtt_meas(newsk,
1488                     tcp_time_stamp - tcp_rsk(req)->snt_synack);
1489         newtp->total_retrans = req->retrans;
1490
1491 #ifdef CONFIG_TCP_MD5SIG
1492         /* Copy over the MD5 key from the original socket */
1493         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1494                                 AF_INET);
1495         if (key != NULL) {
1496                 /*
1497                  * We're using one, so create a matching key
1498                  * on the newsk structure. If we fail to get
1499                  * memory, then we end up not copying the key
1500                  * across. Shucks.
1501                  */
1502                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1503                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1504                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1505         }
1506 #endif
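
        /* Illustrative user-space counterpart (assumes the standard
         * TCP_MD5SIG socket option; not part of this file): the key copied
         * onto newsk above is typically installed on the listener by a
         * daemon such as BGP, e.g.:
         *
         *	struct tcp_md5sig md5;
         *
         *	memset(&md5, 0, sizeof(md5));
         *	memcpy(&md5.tcpm_addr, &peer, sizeof(peer));	// peer sockaddr
         *	md5.tcpm_keylen = strlen(secret);
         *	memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
         *	setsockopt(lfd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
         */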
1507
1508         if (__inet_inherit_port(sk, newsk) < 0)
1509                 goto put_and_exit;
1510         __inet_hash_nolisten(newsk, NULL);
1511
1512         return newsk;
1513
1514 exit_overflow:
1515         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1516 exit_nonewsk:
1517         dst_release(dst);
1518 exit:
1519         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1520         return NULL;
1521 put_and_exit:
1522         tcp_clear_xmit_timers(newsk);
1523         tcp_cleanup_congestion_control(newsk);
1524         bh_unlock_sock(newsk);
1525         sock_put(newsk);
1526         goto exit;
1527 }
1528 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
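/* Illustrative sketch (standard POSIX sockets; not part of this file): the
 * newsk created above sits on the listener's accept queue until user space
 * picks it up:
 *
 *	struct sockaddr_in peer;
 *	socklen_t plen = sizeof(peer);
 *	int conn = accept(fd, (struct sockaddr *)&peer, &plen);
 *
 *	// conn now refers to the child socket created by
 *	// tcp_v4_syn_recv_sock() once the handshake completed
 */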
1529
1530 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1531 {
1532         struct tcphdr *th = tcp_hdr(skb);
1533         const struct iphdr *iph = ip_hdr(skb);
1534         struct sock *nsk;
1535         struct request_sock **prev;
1536         /* Find possible connection requests. */
1537         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1538                                                        iph->saddr, iph->daddr);
1539         if (req)
1540                 return tcp_check_req(sk, skb, req, prev);
1541
1542         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1543                         th->source, iph->daddr, th->dest, inet_iif(skb));
1544
1545         if (nsk) {
1546                 if (nsk->sk_state != TCP_TIME_WAIT) {
1547                         bh_lock_sock(nsk);
1548                         return nsk;
1549                 }
1550                 inet_twsk_put(inet_twsk(nsk));
1551                 return NULL;
1552         }
1553
1554 #ifdef CONFIG_SYN_COOKIES
1555         if (!th->syn)
1556                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1557 #endif
1558         return sk;
1559 }
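/* Summary of the dispatch order in tcp_v4_hnd_req() (restating the code
 * above, not new behaviour):
 *
 *	1. pending request_sock (SYN_RECV)  -> tcp_check_req()
 *	2. established hash hit             -> return the locked nsk
 *	   (TIME_WAIT hit                   -> drop the twsk, return NULL)
 *	3. otherwise a non-SYN segment may complete a SYN-cookie
 *	   handshake                        -> cookie_v4_check()
 */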
1560
1561 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1562 {
1563         const struct iphdr *iph = ip_hdr(skb);
1564
1565         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1566                 if (!tcp_v4_check(skb->len, iph->saddr,
1567                                   iph->daddr, skb->csum)) {
1568                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1569                         return 0;
1570                 }
1571         }
1572
1573         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1574                                        skb->len, IPPROTO_TCP, 0);
1575
1576         if (skb->len <= 76) {
1577                 return __skb_checksum_complete(skb);
1578         }
1579         return 0;
1580 }
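/* Illustrative sketch (simplified, not part of this file): the pseudo-header
 * seeded above covers saddr, daddr, protocol and TCP length, per RFC 793.
 * A minimal standalone version of the 16-bit one's-complement sum that
 * __skb_checksum_complete() ultimately verifies looks like:
 *
 *	static unsigned short csum16(const unsigned char *p, int len,
 *				     unsigned int sum)
 *	{
 *		for (; len > 1; p += 2, len -= 2)
 *			sum += (p[0] << 8) | p[1];
 *		if (len)
 *			sum += p[0] << 8;
 *		while (sum >> 16)
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return ~sum & 0xffff;
 *	}
 */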
1581
1582
1583 /* The socket must have its spinlock held when we get
1584  * here.
1585  *
1586  * We have a potential double-lock case here, so even when
1587  * doing backlog processing we use the BH locking scheme.
1588  * This is because we cannot sleep with the original spinlock
1589  * held.
1590  */
1591 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1592 {
1593         struct sock *rsk;
1594 #ifdef CONFIG_TCP_MD5SIG
1595         /*
1596          * We really want to reject the packet as early as possible
1597          * if:
1598          *  o We're expecting an MD5-signed packet and there is no MD5 TCP option
1599          *  o There is an MD5 option and we're not expecting one
1600          */
1601         if (tcp_v4_inbound_md5_hash(sk, skb))
1602                 goto discard;
1603 #endif
1604
1605         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1606                 sock_rps_save_rxhash(sk, skb);
1607                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1608                         rsk = sk;
1609                         goto reset;
1610                 }
1611                 return 0;
1612         }
1613
1614         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1615                 goto csum_err;
1616
1617         if (sk->sk_state == TCP_LISTEN) {
1618                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1619                 if (!nsk)
1620                         goto discard;
1621
1622                 if (nsk != sk) {
1623                         sock_rps_save_rxhash(nsk, skb);
1624                         if (tcp_child_process(sk, nsk, skb)) {
1625                                 rsk = nsk;
1626                                 goto reset;
1627                         }
1628                         return 0;
1629                 }
1630         } else
1631                 sock_rps_save_rxhash(sk, skb);
1632
1633         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1634                 rsk = sk;
1635                 goto reset;
1636         }
1637         return 0;
1638
1639 reset:
1640         tcp_v4_send_reset(rsk, skb);
1641 discard:
1642         kfree_skb(skb);
1643         /* Be careful here. If this function gets more complicated and
1644          * gcc suffers from register pressure on the x86, sk (in %ebx)
1645          * might be destroyed here. This current version compiles correctly,
1646          * but you have been warned.
1647          */
1648         return 0;
1649
1650 csum_err:
1651         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1652         goto discard;
1653 }
1654 EXPORT_SYMBOL(tcp_v4_do_rcv);
1655
1656 void tcp_v4_early_demux(struct sk_buff *skb)
1657 {
1658         struct net *net = dev_net(skb->dev);
1659         const struct iphdr *iph;
1660         const struct tcphdr *th;
1661         struct net_device *dev;
1662         struct sock *sk;
1663
1664         if (skb->pkt_type != PACKET_HOST)
1665                 return;
1666
1667         if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
1668                 return;
1669
1670         iph = ip_hdr(skb);
1671         th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
1672
1673         if (th->doff < sizeof(struct tcphdr) / 4)
1674                 return;
1675
1676         if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
1677                 return;
1678
1679         dev = skb->dev;
1680         sk = __inet_lookup_established(net, &tcp_hashinfo,
1681                                        iph->saddr, th->source,
1682                                        iph->daddr, ntohs(th->dest),
1683                                        dev->ifindex);
1684         if (sk) {
1685                 skb->sk = sk;
1686                 skb->destructor = sock_edemux;
1687                 if (sk->sk_state != TCP_TIME_WAIT) {
1688                         struct dst_entry *dst = sk->sk_rx_dst;
1689                         if (dst)
1690                                 dst = dst_check(dst, 0);
1691                         if (dst) {
1692                                 struct rtable *rt = (struct rtable *) dst;
1693
1694                                 if (rt->rt_iif == dev->ifindex)
1695                                         skb_dst_set_noref(skb, dst);
1696                         }
1697                 }
1698         }
1699 }
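/* Usage note (the sysctl name is an assumption for this tree): the
 * early-demux fast path above is typically gated by
 * /proc/sys/net/ipv4/ip_early_demux.  When enabled, the established-socket
 * lookup runs before routing, so a hit can reuse the cached sk->sk_rx_dst
 * and skip the per-packet route lookup for long-lived connections.
 */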
1700
1701 /*
1702  *      From tcp_input.c
1703  */
1704
1705 int tcp_v4_rcv(struct sk_buff *skb)
1706 {
1707         const struct iphdr *iph;
1708         const struct tcphdr *th;
1709         struct sock *sk;
1710         int ret;
1711         struct net *net = dev_net(skb->dev);
1712
1713         if (skb->pkt_type != PACKET_HOST)
1714                 goto discard_it;
1715
1716         /* Count it even if it's bad */
1717         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1718
1719         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1720                 goto discard_it;
1721
1722         th = tcp_hdr(skb);
1723
1724         if (th->doff < sizeof(struct tcphdr) / 4)
1725                 goto bad_packet;
1726         if (!pskb_may_pull(skb, th->doff * 4))
1727                 goto discard_it;
1728
1729         /* An explanation is required here, I think.
1730          * Packet length and doff are validated by header prediction,
1731          * provided the case of th->doff == 0 is eliminated.
1732          * So, we defer the checks. */
1733         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1734                 goto bad_packet;
1735
1736         th = tcp_hdr(skb);
1737         iph = ip_hdr(skb);
1738         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1739         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1740                                     skb->len - th->doff * 4);
1741         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1742         TCP_SKB_CB(skb)->when    = 0;
1743         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1744         TCP_SKB_CB(skb)->sacked  = 0;
1745
1746         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1747         if (!sk)
1748                 goto no_tcp_socket;
1749
1750 process:
1751         if (sk->sk_state == TCP_TIME_WAIT)
1752                 goto do_time_wait;
1753
1754         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1755                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1756                 goto discard_and_relse;
1757         }
1758
1759         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1760                 goto discard_and_relse;
1761         nf_reset(skb);
1762
1763         if (sk_filter(sk, skb))
1764                 goto discard_and_relse;
1765
1766         skb->dev = NULL;
1767
1768         bh_lock_sock_nested(sk);
1769         ret = 0;
1770         if (!sock_owned_by_user(sk)) {
1771 #ifdef CONFIG_NET_DMA
1772                 struct tcp_sock *tp = tcp_sk(sk);
1773                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1774                         tp->ucopy.dma_chan = net_dma_find_channel();
1775                 if (tp->ucopy.dma_chan)
1776                         ret = tcp_v4_do_rcv(sk, skb);
1777                 else
1778 #endif
1779                 {
1780                         if (!tcp_prequeue(sk, skb))
1781                                 ret = tcp_v4_do_rcv(sk, skb);
1782                 }
1783         } else if (unlikely(sk_add_backlog(sk, skb,
1784                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1785                 bh_unlock_sock(sk);
1786                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1787                 goto discard_and_relse;
1788         }
1789         bh_unlock_sock(sk);
1790
1791         sock_put(sk);
1792
1793         return ret;
1794
1795 no_tcp_socket:
1796         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1797                 goto discard_it;
1798
1799         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1800 bad_packet:
1801                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1802         } else {
1803                 tcp_v4_send_reset(NULL, skb);
1804         }
1805
1806 discard_it:
1807         /* Discard frame. */
1808         kfree_skb(skb);
1809         return 0;
1810
1811 discard_and_relse:
1812         sock_put(sk);
1813         goto discard_it;
1814
1815 do_time_wait:
1816         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1817                 inet_twsk_put(inet_twsk(sk));
1818                 goto discard_it;
1819         }
1820
1821         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1822                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1823                 inet_twsk_put(inet_twsk(sk));
1824                 goto discard_it;
1825         }
1826         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1827         case TCP_TW_SYN: {
1828                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1829                                                         &tcp_hashinfo,
1830                                                         iph->daddr, th->dest,
1831                                                         inet_iif(skb));
1832                 if (sk2) {
1833                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1834                         inet_twsk_put(inet_twsk(sk));
1835                         sk = sk2;
1836                         goto process;
1837                 }
1838                 /* Fall through to ACK */
1839         }
1840         case TCP_TW_ACK:
1841                 tcp_v4_timewait_ack(sk, skb);
1842                 break;
1843         case TCP_TW_RST:
1844                 goto no_tcp_socket;
1845         case TCP_TW_SUCCESS:;
1846         }
1847         goto discard_it;
1848 }
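/* Illustrative summary of the delivery decision in tcp_v4_rcv() (restating
 * the code above, not new behaviour):
 *
 *	if (!sock_owned_by_user(sk))
 *		// softirq context: prequeue for a sleeping reader if
 *		// possible, otherwise process via tcp_v4_do_rcv()
 *	else if (sk_add_backlog(sk, skb, limit) == 0)
 *		// socket busy: the owner drains skb later via backlog_rcv
 *	else
 *		// backlog full: drop, counted as LINUX_MIB_TCPBACKLOGDROP
 */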
1849
1850 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1851         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1852         .twsk_unique    = tcp_twsk_unique,
1853         .twsk_destructor= tcp_twsk_destructor,
1854 };
1855
1856 const struct inet_connection_sock_af_ops ipv4_specific = {
1857         .queue_xmit        = ip_queue_xmit,
1858         .send_check        = tcp_v4_send_check,
1859         .rebuild_header    = inet_sk_rebuild_header,
1860         .conn_request      = tcp_v4_conn_request,
1861         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1862         .net_header_len    = sizeof(struct iphdr),
1863         .setsockopt        = ip_setsockopt,
1864         .getsockopt        = ip_getsockopt,
1865         .addr2sockaddr     = inet_csk_addr2sockaddr,
1866         .sockaddr_len      = sizeof(struct sockaddr_in),
1867         .bind_conflict     = inet_csk_bind_conflict,
1868 #ifdef CONFIG_COMPAT
1869         .compat_setsockopt = compat_ip_setsockopt,
1870         .compat_getsockopt = compat_ip_getsockopt,
1871 #endif
1872 };
1873 EXPORT_SYMBOL(ipv4_specific);
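/* Illustrative sketch: the TCP core reaches this table indirectly, which is
 * how the same state machine drives both IPv4 and IPv6.  Roughly, on the
 * output path:
 *
 *	struct inet_connection_sock *icsk = inet_csk(sk);
 *
 *	err = icsk->icsk_af_ops->queue_xmit(skb, &fl);	// ip_queue_xmit here
 */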
1874
1875 #ifdef CONFIG_TCP_MD5SIG
1876 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1877         .md5_lookup             = tcp_v4_md5_lookup,
1878         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1879         .md5_parse              = tcp_v4_parse_md5_keys,
1880 };
1881 #endif
1882
1883 /* NOTE: A lot of things are set to zero explicitly by the call to
1884  *       sk_alloc(), so they need not be done here.
1885  */
1886 static int tcp_v4_init_sock(struct sock *sk)
1887 {
1888         struct inet_connection_sock *icsk = inet_csk(sk);
1889
1890         tcp_init_sock(sk);
1891
1892         icsk->icsk_af_ops = &ipv4_specific;
1893
1894 #ifdef CONFIG_TCP_MD5SIG
1895         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1896 #endif
1897
1898         return 0;
1899 }
1900
1901 void tcp_v4_destroy_sock(struct sock *sk)
1902 {
1903         struct tcp_sock *tp = tcp_sk(sk);
1904
1905         tcp_clear_xmit_timers(sk);
1906
1907         tcp_cleanup_congestion_control(sk);
1908
1909         /* Clean up the write buffer. */
1910         tcp_write_queue_purge(sk);
1911
1912         /* Cleans up our, hopefully empty, out_of_order_queue. */
1913         __skb_queue_purge(&tp->out_of_order_queue);
1914
1915 #ifdef CONFIG_TCP_MD5SIG
1916         /* Clean up the MD5 key list, if any */
1917         if (tp->md5sig_info) {
1918                 tcp_clear_md5_list(sk);
1919                 kfree_rcu(tp->md5sig_info, rcu);
1920                 tp->md5sig_info = NULL;
1921         }
1922 #endif
1923
1924 #ifdef CONFIG_NET_DMA
1925         /* Cleans up our sk_async_wait_queue */
1926         __skb_queue_purge(&sk->sk_async_wait_queue);
1927 #endif
1928
1929         /* Clean up the prequeue; it really should be empty. */
1930         __skb_queue_purge(&tp->ucopy.prequeue);
1931
1932         /* Clean up a referenced TCP bind bucket. */
1933         if (inet_csk(sk)->icsk_bind_hash)
1934                 inet_put_port(sk);
1935
1936         /*
1937          * If a sendmsg cached page exists, toss it.
1938          */
1939         if (sk->sk_sndmsg_page) {
1940                 __free_page(sk->sk_sndmsg_page);
1941                 sk->sk_sndmsg_page = NULL;
1942         }
1943
1944         /* TCP Cookie Transactions */
1945         if (tp->cookie_values != NULL) {
1946                 kref_put(&tp->cookie_values->kref,
1947                          tcp_cookie_values_release);
1948                 tp->cookie_values = NULL;
1949         }
1950
1951         sk_sockets_allocated_dec(sk);
1952         sock_release_memcg(sk);
1953 }
1954 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1955
1956 #ifdef CONFIG_PROC_FS
1957 /* Proc filesystem TCP sock list dumping. */
1958
1959 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1960 {
1961         return hlist_nulls_empty(head) ? NULL :
1962                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1963 }
1964
1965 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1966 {
1967         return !is_a_nulls(tw->tw_node.next) ?
1968                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1969 }
1970
1971 /*
1972  * Get the next listener socket following cur.  If cur is NULL, get the first
1973  * socket starting from the bucket given in st->bucket; when st->bucket is
1974  * zero, the very first socket in the hash table is returned.
1975  */
1976 static void *listening_get_next(struct seq_file *seq, void *cur)
1977 {
1978         struct inet_connection_sock *icsk;
1979         struct hlist_nulls_node *node;
1980         struct sock *sk = cur;
1981         struct inet_listen_hashbucket *ilb;
1982         struct tcp_iter_state *st = seq->private;
1983         struct net *net = seq_file_net(seq);
1984
1985         if (!sk) {
1986                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1987                 spin_lock_bh(&ilb->lock);
1988                 sk = sk_nulls_head(&ilb->head);
1989                 st->offset = 0;
1990                 goto get_sk;
1991         }
1992         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1993         ++st->num;
1994         ++st->offset;
1995
1996         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1997                 struct request_sock *req = cur;
1998
1999                 icsk = inet_csk(st->syn_wait_sk);
2000                 req = req->dl_next;
2001                 while (1) {
2002                         while (req) {
2003                                 if (req->rsk_ops->family == st->family) {
2004                                         cur = req;
2005                                         goto out;
2006                                 }
2007                                 req = req->dl_next;
2008                         }
2009                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2010                                 break;
2011 get_req:
2012                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2013                 }
2014                 sk        = sk_nulls_next(st->syn_wait_sk);
2015                 st->state = TCP_SEQ_STATE_LISTENING;
2016                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2017         } else {
2018                 icsk = inet_csk(sk);
2019                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2020                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2021                         goto start_req;
2022                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2023                 sk = sk_nulls_next(sk);
2024         }
2025 get_sk:
2026         sk_nulls_for_each_from(sk, node) {
2027                 if (!net_eq(sock_net(sk), net))
2028                         continue;
2029                 if (sk->sk_family == st->family) {
2030                         cur = sk;
2031                         goto out;
2032                 }
2033                 icsk = inet_csk(sk);
2034                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2035                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2036 start_req:
2037                         st->uid         = sock_i_uid(sk);
2038                         st->syn_wait_sk = sk;
2039                         st->state       = TCP_SEQ_STATE_OPENREQ;
2040                         st->sbucket     = 0;
2041                         goto get_req;
2042                 }
2043                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2044         }
2045         spin_unlock_bh(&ilb->lock);
2046         st->offset = 0;
2047         if (++st->bucket < INET_LHTABLE_SIZE) {
2048                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2049                 spin_lock_bh(&ilb->lock);
2050                 sk = sk_nulls_head(&ilb->head);
2051                 goto get_sk;
2052         }
2053         cur = NULL;
2054 out:
2055         return cur;
2056 }
2057
2058 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2059 {
2060         struct tcp_iter_state *st = seq->private;
2061         void *rc;
2062
2063         st->bucket = 0;
2064         st->offset = 0;
2065         rc = listening_get_next(seq, NULL);
2066
2067         while (rc && *pos) {
2068                 rc = listening_get_next(seq, rc);
2069                 --*pos;
2070         }
2071         return rc;
2072 }
2073
2074 static inline bool empty_bucket(struct tcp_iter_state *st)
2075 {
2076         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2077                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2078 }
2079
2080 /*
2081  * Get first established socket starting from bucket given in st->bucket.
2082  * If st->bucket is zero, the very first socket in the hash is returned.
2083  */
2084 static void *established_get_first(struct seq_file *seq)
2085 {
2086         struct tcp_iter_state *st = seq->private;
2087         struct net *net = seq_file_net(seq);
2088         void *rc = NULL;
2089
2090         st->offset = 0;
2091         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2092                 struct sock *sk;
2093                 struct hlist_nulls_node *node;
2094                 struct inet_timewait_sock *tw;
2095                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2096
2097                 /* Lockless fast path for the common case of empty buckets */
2098                 if (empty_bucket(st))
2099                         continue;
2100
2101                 spin_lock_bh(lock);
2102                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2103                         if (sk->sk_family != st->family ||
2104                             !net_eq(sock_net(sk), net)) {
2105                                 continue;
2106                         }
2107                         rc = sk;
2108                         goto out;
2109                 }
2110                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2111                 inet_twsk_for_each(tw, node,
2112                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2113                         if (tw->tw_family != st->family ||
2114                             !net_eq(twsk_net(tw), net)) {
2115                                 continue;
2116                         }
2117                         rc = tw;
2118                         goto out;
2119                 }
2120                 spin_unlock_bh(lock);
2121                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2122         }
2123 out:
2124         return rc;
2125 }
2126
2127 static void *established_get_next(struct seq_file *seq, void *cur)
2128 {
2129         struct sock *sk = cur;
2130         struct inet_timewait_sock *tw;
2131         struct hlist_nulls_node *node;
2132         struct tcp_iter_state *st = seq->private;
2133         struct net *net = seq_file_net(seq);
2134
2135         ++st->num;
2136         ++st->offset;
2137
2138         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2139                 tw = cur;
2140                 tw = tw_next(tw);
2141 get_tw:
2142                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2143                         tw = tw_next(tw);
2144                 }
2145                 if (tw) {
2146                         cur = tw;
2147                         goto out;
2148                 }
2149                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2150                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2151
2152                 /* Look for the next non-empty bucket */
2153                 st->offset = 0;
2154                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2155                                 empty_bucket(st))
2156                         ;
2157                 if (st->bucket > tcp_hashinfo.ehash_mask)
2158                         return NULL;
2159
2160                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2161                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2162         } else
2163                 sk = sk_nulls_next(sk);
2164
2165         sk_nulls_for_each_from(sk, node) {
2166                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2167                         goto found;
2168         }
2169
2170         st->state = TCP_SEQ_STATE_TIME_WAIT;
2171         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2172         goto get_tw;
2173 found:
2174         cur = sk;
2175 out:
2176         return cur;
2177 }
2178
2179 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2180 {
2181         struct tcp_iter_state *st = seq->private;
2182         void *rc;
2183
2184         st->bucket = 0;
2185         rc = established_get_first(seq);
2186
2187         while (rc && pos) {
2188                 rc = established_get_next(seq, rc);
2189                 --pos;
2190         }
2191         return rc;
2192 }
2193
2194 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2195 {
2196         void *rc;
2197         struct tcp_iter_state *st = seq->private;
2198
2199         st->state = TCP_SEQ_STATE_LISTENING;
2200         rc        = listening_get_idx(seq, &pos);
2201
2202         if (!rc) {
2203                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2204                 rc        = established_get_idx(seq, pos);
2205         }
2206
2207         return rc;
2208 }
2209
2210 static void *tcp_seek_last_pos(struct seq_file *seq)
2211 {
2212         struct tcp_iter_state *st = seq->private;
2213         int offset = st->offset;
2214         int orig_num = st->num;
2215         void *rc = NULL;
2216
2217         switch (st->state) {
2218         case TCP_SEQ_STATE_OPENREQ:
2219         case TCP_SEQ_STATE_LISTENING:
2220                 if (st->bucket >= INET_LHTABLE_SIZE)
2221                         break;
2222                 st->state = TCP_SEQ_STATE_LISTENING;
2223                 rc = listening_get_next(seq, NULL);
2224                 while (offset-- && rc)
2225                         rc = listening_get_next(seq, rc);
2226                 if (rc)
2227                         break;
2228                 st->bucket = 0;
2229                 /* Fallthrough */
2230         case TCP_SEQ_STATE_ESTABLISHED:
2231         case TCP_SEQ_STATE_TIME_WAIT:
2232                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2233                 if (st->bucket > tcp_hashinfo.ehash_mask)
2234                         break;
2235                 rc = established_get_first(seq);
2236                 while (offset-- && rc)
2237                         rc = established_get_next(seq, rc);
2238         }
2239
2240         st->num = orig_num;
2241
2242         return rc;
2243 }
2244
2245 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2246 {
2247         struct tcp_iter_state *st = seq->private;
2248         void *rc;
2249
2250         if (*pos && *pos == st->last_pos) {
2251                 rc = tcp_seek_last_pos(seq);
2252                 if (rc)
2253                         goto out;
2254         }
2255
2256         st->state = TCP_SEQ_STATE_LISTENING;
2257         st->num = 0;
2258         st->bucket = 0;
2259         st->offset = 0;
2260         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2261
2262 out:
2263         st->last_pos = *pos;
2264         return rc;
2265 }
2266
2267 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2268 {
2269         struct tcp_iter_state *st = seq->private;
2270         void *rc = NULL;
2271
2272         if (v == SEQ_START_TOKEN) {
2273                 rc = tcp_get_idx(seq, 0);
2274                 goto out;
2275         }
2276
2277         switch (st->state) {
2278         case TCP_SEQ_STATE_OPENREQ:
2279         case TCP_SEQ_STATE_LISTENING:
2280                 rc = listening_get_next(seq, v);
2281                 if (!rc) {
2282                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2283                         st->bucket = 0;
2284                         st->offset = 0;
2285                         rc        = established_get_first(seq);
2286                 }
2287                 break;
2288         case TCP_SEQ_STATE_ESTABLISHED:
2289         case TCP_SEQ_STATE_TIME_WAIT:
2290                 rc = established_get_next(seq, v);
2291                 break;
2292         }
2293 out:
2294         ++*pos;
2295         st->last_pos = *pos;
2296         return rc;
2297 }
2298
2299 static void tcp_seq_stop(struct seq_file *seq, void *v)
2300 {
2301         struct tcp_iter_state *st = seq->private;
2302
2303         switch (st->state) {
2304         case TCP_SEQ_STATE_OPENREQ:
2305                 if (v) {
2306                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2307                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2308                 }
2309         case TCP_SEQ_STATE_LISTENING:
2310                 if (v != SEQ_START_TOKEN)
2311                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2312                 break;
2313         case TCP_SEQ_STATE_TIME_WAIT:
2314         case TCP_SEQ_STATE_ESTABLISHED:
2315                 if (v)
2316                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2317                 break;
2318         }
2319 }
2320
2321 int tcp_seq_open(struct inode *inode, struct file *file)
2322 {
2323         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2324         struct tcp_iter_state *s;
2325         int err;
2326
2327         err = seq_open_net(inode, file, &afinfo->seq_ops,
2328                           sizeof(struct tcp_iter_state));
2329         if (err < 0)
2330                 return err;
2331
2332         s = ((struct seq_file *)file->private_data)->private;
2333         s->family               = afinfo->family;
2334         s->last_pos             = 0;
2335         return 0;
2336 }
2337 EXPORT_SYMBOL(tcp_seq_open);
2338
2339 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2340 {
2341         int rc = 0;
2342         struct proc_dir_entry *p;
2343
2344         afinfo->seq_ops.start           = tcp_seq_start;
2345         afinfo->seq_ops.next            = tcp_seq_next;
2346         afinfo->seq_ops.stop            = tcp_seq_stop;
2347
2348         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2349                              afinfo->seq_fops, afinfo);
2350         if (!p)
2351                 rc = -ENOMEM;
2352         return rc;
2353 }
2354 EXPORT_SYMBOL(tcp_proc_register);
2355
2356 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2357 {
2358         proc_net_remove(net, afinfo->name);
2359 }
2360 EXPORT_SYMBOL(tcp_proc_unregister);
2361
2362 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2363                          struct seq_file *f, int i, int uid, int *len)
2364 {
2365         const struct inet_request_sock *ireq = inet_rsk(req);
2366         int ttd = req->expires - jiffies;
2367
2368         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2369                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2370                 i,
2371                 ireq->loc_addr,
2372                 ntohs(inet_sk(sk)->inet_sport),
2373                 ireq->rmt_addr,
2374                 ntohs(ireq->rmt_port),
2375                 TCP_SYN_RECV,
2376                 0, 0, /* could print option size, but that is af dependent. */
2377                 1,    /* timers active (only the expire timer) */
2378                 jiffies_to_clock_t(ttd),
2379                 req->retrans,
2380                 uid,
2381                 0,  /* non standard timer */
2382                 0, /* open_requests have no inode */
2383                 atomic_read(&sk->sk_refcnt),
2384                 req,
2385                 len);
2386 }
2387
2388 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2389 {
2390         int timer_active;
2391         unsigned long timer_expires;
2392         const struct tcp_sock *tp = tcp_sk(sk);
2393         const struct inet_connection_sock *icsk = inet_csk(sk);
2394         const struct inet_sock *inet = inet_sk(sk);
2395         __be32 dest = inet->inet_daddr;
2396         __be32 src = inet->inet_rcv_saddr;
2397         __u16 destp = ntohs(inet->inet_dport);
2398         __u16 srcp = ntohs(inet->inet_sport);
2399         int rx_queue;
2400
2401         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2402                 timer_active    = 1;
2403                 timer_expires   = icsk->icsk_timeout;
2404         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2405                 timer_active    = 4;
2406                 timer_expires   = icsk->icsk_timeout;
2407         } else if (timer_pending(&sk->sk_timer)) {
2408                 timer_active    = 2;
2409                 timer_expires   = sk->sk_timer.expires;
2410         } else {
2411                 timer_active    = 0;
2412                 timer_expires = jiffies;
2413         }
2414
2415         if (sk->sk_state == TCP_LISTEN)
2416                 rx_queue = sk->sk_ack_backlog;
2417         else
2418                 /*
2419                  * Because we don't lock the socket, we might find a transient negative value.
2420                  */
2421                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2422
2423         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2424                         "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2425                 i, src, srcp, dest, destp, sk->sk_state,
2426                 tp->write_seq - tp->snd_una,
2427                 rx_queue,
2428                 timer_active,
2429                 jiffies_to_clock_t(timer_expires - jiffies),
2430                 icsk->icsk_retransmits,
2431                 sock_i_uid(sk),
2432                 icsk->icsk_probes_out,
2433                 sock_i_ino(sk),
2434                 atomic_read(&sk->sk_refcnt), sk,
2435                 jiffies_to_clock_t(icsk->icsk_rto),
2436                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2437                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2438                 tp->snd_cwnd,
2439                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2440                 len);
2441 }
2442
2443 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2444                                struct seq_file *f, int i, int *len)
2445 {
2446         __be32 dest, src;
2447         __u16 destp, srcp;
2448         int ttd = tw->tw_ttd - jiffies;
2449
2450         if (ttd < 0)
2451                 ttd = 0;
2452
2453         dest  = tw->tw_daddr;
2454         src   = tw->tw_rcv_saddr;
2455         destp = ntohs(tw->tw_dport);
2456         srcp  = ntohs(tw->tw_sport);
2457
2458         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2459                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2460                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2461                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2462                 atomic_read(&tw->tw_refcnt), tw, len);
2463 }
2464
2465 #define TMPSZ 150
2466
2467 static int tcp4_seq_show(struct seq_file *seq, void *v)
2468 {
2469         struct tcp_iter_state *st;
2470         int len;
2471
2472         if (v == SEQ_START_TOKEN) {
2473                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2474                            "  sl  local_address rem_address   st tx_queue "
2475                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2476                            "inode");
2477                 goto out;
2478         }
2479         st = seq->private;
2480
2481         switch (st->state) {
2482         case TCP_SEQ_STATE_LISTENING:
2483         case TCP_SEQ_STATE_ESTABLISHED:
2484                 get_tcp4_sock(v, seq, st->num, &len);
2485                 break;
2486         case TCP_SEQ_STATE_OPENREQ:
2487                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2488                 break;
2489         case TCP_SEQ_STATE_TIME_WAIT:
2490                 get_timewait4_sock(v, seq, st->num, &len);
2491                 break;
2492         }
2493         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2494 out:
2495         return 0;
2496 }
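/* Illustrative user-space sketch (not part of this file): the fixed-width
 * records emitted above are what tools like netstat parse out of
 * /proc/net/tcp.  The address is the raw __be32 printed in hex (so its byte
 * order depends on the host); ports and state are host-order hex:
 *
 *	FILE *f = fopen("/proc/net/tcp", "r");
 *	char line[256];
 *	unsigned sl, lip, lport, rip, rport, state;
 *
 *	fgets(line, sizeof(line), f);	// skip the header row
 *	while (fgets(line, sizeof(line), f))
 *		if (sscanf(line, "%u: %x:%x %x:%x %x",
 *			   &sl, &lip, &lport, &rip, &rport, &state) == 6)
 *			printf("st %02X local port %u\n", state, lport);
 *	fclose(f);
 */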
2497
2498 static const struct file_operations tcp_afinfo_seq_fops = {
2499         .owner   = THIS_MODULE,
2500         .open    = tcp_seq_open,
2501         .read    = seq_read,
2502         .llseek  = seq_lseek,
2503         .release = seq_release_net
2504 };
2505
2506 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2507         .name           = "tcp",
2508         .family         = AF_INET,
2509         .seq_fops       = &tcp_afinfo_seq_fops,
2510         .seq_ops        = {
2511                 .show           = tcp4_seq_show,
2512         },
2513 };
2514
2515 static int __net_init tcp4_proc_init_net(struct net *net)
2516 {
2517         return tcp_proc_register(net, &tcp4_seq_afinfo);
2518 }
2519
2520 static void __net_exit tcp4_proc_exit_net(struct net *net)
2521 {
2522         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2523 }
2524
2525 static struct pernet_operations tcp4_net_ops = {
2526         .init = tcp4_proc_init_net,
2527         .exit = tcp4_proc_exit_net,
2528 };
2529
2530 int __init tcp4_proc_init(void)
2531 {
2532         return register_pernet_subsys(&tcp4_net_ops);
2533 }
2534
2535 void tcp4_proc_exit(void)
2536 {
2537         unregister_pernet_subsys(&tcp4_net_ops);
2538 }
2539 #endif /* CONFIG_PROC_FS */
2540
2541 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2542 {
2543         const struct iphdr *iph = skb_gro_network_header(skb);
2544
2545         switch (skb->ip_summed) {
2546         case CHECKSUM_COMPLETE:
2547                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2548                                   skb->csum)) {
2549                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2550                         break;
2551                 }
2552
2553                 /* fall through */
2554         case CHECKSUM_NONE:
2555                 NAPI_GRO_CB(skb)->flush = 1;
2556                 return NULL;
2557         }
2558
2559         return tcp_gro_receive(head, skb);
2560 }
2561
2562 int tcp4_gro_complete(struct sk_buff *skb)
2563 {
2564         const struct iphdr *iph = ip_hdr(skb);
2565         struct tcphdr *th = tcp_hdr(skb);
2566
2567         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2568                                   iph->saddr, iph->daddr, 0);
2569         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2570
2571         return tcp_gro_complete(skb);
2572 }
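/* Note on the ~tcp_v4_check(..., 0) above: after GRO merging, the TCP header
 * is left holding just the folded pseudo-header sum and gso_type is set, so
 * the stack (or the device, for hardware checksumming) can fill in correct
 * per-segment checksums when the packet is later re-segmented, mirroring
 * what the transmit path does for CHECKSUM_PARTIAL skbs.
 */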
2573
2574 struct proto tcp_prot = {
2575         .name                   = "TCP",
2576         .owner                  = THIS_MODULE,
2577         .close                  = tcp_close,
2578         .connect                = tcp_v4_connect,
2579         .disconnect             = tcp_disconnect,
2580         .accept                 = inet_csk_accept,
2581         .ioctl                  = tcp_ioctl,
2582         .init                   = tcp_v4_init_sock,
2583         .destroy                = tcp_v4_destroy_sock,
2584         .shutdown               = tcp_shutdown,
2585         .setsockopt             = tcp_setsockopt,
2586         .getsockopt             = tcp_getsockopt,
2587         .recvmsg                = tcp_recvmsg,
2588         .sendmsg                = tcp_sendmsg,
2589         .sendpage               = tcp_sendpage,
2590         .backlog_rcv            = tcp_v4_do_rcv,
2591         .release_cb             = tcp_release_cb,
2592         .hash                   = inet_hash,
2593         .unhash                 = inet_unhash,
2594         .get_port               = inet_csk_get_port,
2595         .enter_memory_pressure  = tcp_enter_memory_pressure,
2596         .sockets_allocated      = &tcp_sockets_allocated,
2597         .orphan_count           = &tcp_orphan_count,
2598         .memory_allocated       = &tcp_memory_allocated,
2599         .memory_pressure        = &tcp_memory_pressure,
2600         .sysctl_wmem            = sysctl_tcp_wmem,
2601         .sysctl_rmem            = sysctl_tcp_rmem,
2602         .max_header             = MAX_TCP_HEADER,
2603         .obj_size               = sizeof(struct tcp_sock),
2604         .slab_flags             = SLAB_DESTROY_BY_RCU,
2605         .twsk_prot              = &tcp_timewait_sock_ops,
2606         .rsk_prot               = &tcp_request_sock_ops,
2607         .h.hashinfo             = &tcp_hashinfo,
2608         .no_autobind            = true,
2609 #ifdef CONFIG_COMPAT
2610         .compat_setsockopt      = compat_tcp_setsockopt,
2611         .compat_getsockopt      = compat_tcp_getsockopt,
2612 #endif
2613 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
2614         .init_cgroup            = tcp_init_cgroup,
2615         .destroy_cgroup         = tcp_destroy_cgroup,
2616         .proto_cgroup           = tcp_proto_cgroup,
2617 #endif
2618 };
2619 EXPORT_SYMBOL(tcp_prot);
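/* Illustrative sketch: the socket layer dispatches through this table via
 * sk->sk_prot, e.g. the AF_INET stream glue does, roughly:
 *
 *	err = sk->sk_prot->connect(sk, uaddr, addr_len);	// tcp_v4_connect
 *	...
 *	err = sk->sk_prot->sendmsg(iocb, sk, msg, size);	// tcp_sendmsg
 */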
2620
2621 static int __net_init tcp_sk_init(struct net *net)
2622 {
2623         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2624                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2625 }
2626
2627 static void __net_exit tcp_sk_exit(struct net *net)
2628 {
2629         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2630 }
2631
2632 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2633 {
2634         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2635 }
2636
2637 static struct pernet_operations __net_initdata tcp_sk_ops = {
2638        .init       = tcp_sk_init,
2639        .exit       = tcp_sk_exit,
2640        .exit_batch = tcp_sk_exit_batch,
2641 };
2642
2643 void __init tcp_v4_init(void)
2644 {
2645         inet_hashinfo_init(&tcp_hashinfo);
2646         if (register_pernet_subsys(&tcp_sk_ops))
2647                 panic("Failed to create the TCP control socket.\n");
2648 }