net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *              IPv4 specific functions
  11  *
  12  *
  13  *              code split from:
  14  *              linux/ipv4/tcp.c
  15  *              linux/ipv4/tcp_input.c
  16  *              linux/ipv4/tcp_output.c
  17  *
  18  *              See tcp.c for author information
  19  *
  20  *      This program is free software; you can redistribute it and/or
  21  *      modify it under the terms of the GNU General Public License
  22  *      as published by the Free Software Foundation; either version
  23  *      2 of the License, or (at your option) any later version.
  24  */
  25
  26 /*
  27  * Changes:
  28  *              David S. Miller :       New socket lookup architecture.
  29  *                                      This code is dedicated to John Dyson.
  30  *              David S. Miller :       Change semantics of established hash,
  31  *                                      half is devoted to TIME_WAIT sockets
  32  *                                      and the rest go in the other half.
  33  *              Andi Kleen :            Add support for syncookies and fixed
  34  *                                      some bugs: ip options weren't passed to
  35  *                                      the TCP layer, missed a check for an
  36  *                                      ACK bit.
  37  *              Andi Kleen :            Implemented fast path mtu discovery.
  38  *                                      Fixed many serious bugs in the
  39  *                                      request_sock handling and moved
  40  *                                      most of it into the af independent code.
  41  *                                      Added tail drop and some other bugfixes.
  42  *                                      Added new listen semantics.
  43  *              Mike McLagan    :       Routing by source
  44  *      Juan Jose Ciarlante:            ip_dynaddr bits
  45  *              Andi Kleen:             various fixes.
  46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  47  *                                      coma.
  48  *      Andi Kleen              :       Fix new listen.
  49  *      Andi Kleen              :       Fix accept error reporting.
  50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  52  *                                      a single port at the same time.
  53  */
  54
  55 #include <linux/config.h>
  56
  57 #include <linux/types.h>
  58 #include <linux/fcntl.h>
  59 #include <linux/module.h>
  60 #include <linux/random.h>
  61 #include <linux/cache.h>
  62 #include <linux/jhash.h>
  63 #include <linux/init.h>
  64 #include <linux/times.h>
  65
  66 #include <net/icmp.h>
  67 #include <net/tcp.h>
  68 #include <net/ipv6.h>
  69 #include <net/inet_common.h>
  70 #include <net/xfrm.h>
  71
  72 #include <linux/inet.h>
  73 #include <linux/ipv6.h>
  74 #include <linux/stddef.h>
  75 #include <linux/proc_fs.h>
  76 #include <linux/seq_file.h>
  77
     /* sysctl knobs.  sysctl_tcp_tw_reuse is consulted by
      * __tcp_v4_check_established() when an outgoing connection collides
      * with a TIME-WAIT bucket.
      */
  78 extern int sysctl_ip_dynaddr;
  79 int sysctl_tcp_tw_reuse;
  80 int sysctl_tcp_low_latency;
  81
  82 /* Check TCP sequence numbers in ICMP packets. */
  83 #define ICMP_MIN_LENGTH 8
  84
  85 /* Socket used for sending RSTs */
  86 static struct socket *tcp_socket;
  87
     /* Forward declaration. */
  88 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
  89                        struct sk_buff *skb);
  90
     /* Global TCP hash state.  Only the listening-hash lock, its user
      * counter and wait queue, and the port allocation lock are given
      * static initialisers here.
      */
  91 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
  92         .__tcp_lhash_lock       =       RW_LOCK_UNLOCKED,
  93         .__tcp_lhash_users      =       ATOMIC_INIT(0),
  94         .__tcp_lhash_wait
  95           = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
  96         .__tcp_portalloc_lock   =       SPIN_LOCK_UNLOCKED
  97 };
  98
  99 /*
 100  * This array holds the first and last local port number.
 101  * For high-usage systems, use sysctl to change this to
 102  * 32768-61000
 103  */
 104 int sysctl_local_port_range[2] = { 1024, 4999 };
     /* Last port tried by the automatic search in tcp_v4_get_port(), which
      * reads and updates it under tcp_portalloc_lock.  Initialised to one
      * below the default lower bound of the range.
      */
 105 int tcp_port_rover = 1024 - 1;
 106
 107 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
 108                                  __u32 faddr, __u16 fport)
 109 {
 110         int h = (laddr ^ lport) ^ (faddr ^ fport);
 111         h ^= h >> 16;
 112         h ^= h >> 8;
 113         return h & (tcp_ehash_size - 1);
 114 }
 115
 116 static __inline__ int tcp_sk_hashfn(struct sock *sk)
 117 {
 118         struct inet_sock *inet = inet_sk(sk);
 119         __u32 laddr = inet->rcv_saddr;
 120         __u16 lport = inet->num;
 121         __u32 faddr = inet->daddr;
 122         __u16 fport = inet->dport;
 123
 124         return tcp_hashfn(laddr, lport, faddr, fport);
 125 }
 126
 127 /* Allocate and initialize a new TCP local port bind bucket.
 128  * The bindhash mutex for snum's hash chain must be held here.
 129  */
 130 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
 131                                           unsigned short snum)
 132 {
 133         struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
 134                                                       SLAB_ATOMIC);
 135         if (tb) {
 136                 tb->port = snum;
 137                 tb->fastreuse = 0;
 138                 INIT_HLIST_HEAD(&tb->owners);
 139                 hlist_add_head(&tb->node, &head->chain);
 140         }
 141         return tb;
 142 }
 143
 144 /* Caller must hold hashbucket lock for this tb with local BH disabled */
 145 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
 146 {
 147         if (hlist_empty(&tb->owners)) {
 148                 __hlist_del(&tb->node);
 149                 kmem_cache_free(tcp_bucket_cachep, tb);
 150         }
 151 }
 152
 153 /* Caller must disable local BH processing. */
 154 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
 155 {
 156         struct tcp_bind_hashbucket *head =
 157                                 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
 158         struct tcp_bind_bucket *tb;
 159
 160         spin_lock(&head->lock);
 161         tb = tcp_sk(sk)->bind_hash;
 162         sk_add_bind_node(child, &tb->owners);
 163         tcp_sk(child)->bind_hash = tb;
 164         spin_unlock(&head->lock);
 165 }
 166
 167 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
 168 {
 169         local_bh_disable();
 170         __tcp_inherit_port(sk, child);
 171         local_bh_enable();
 172 }
 173
 174 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
 175                    unsigned short snum)
 176 {
 177         inet_sk(sk)->num = snum;
 178         sk_add_bind_node(sk, &tb->owners);
 179         tcp_sk(sk)->bind_hash = tb;
 180 }
 181
 182 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
 183 {
 184         const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
 185         struct sock *sk2;
 186         struct hlist_node *node;
 187         int reuse = sk->sk_reuse;
 188
 189         sk_for_each_bound(sk2, node, &tb->owners) {
 190                 if (sk != sk2 &&
 191                     !tcp_v6_ipv6only(sk2) &&
 192                     (!sk->sk_bound_dev_if ||
 193                      !sk2->sk_bound_dev_if ||
 194                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
 195                         if (!reuse || !sk2->sk_reuse ||
 196                             sk2->sk_state == TCP_LISTEN) {
 197                                 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
 198                                 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
 199                                     sk2_rcv_saddr == sk_rcv_saddr)
 200                                         break;
 201                         }
 202                 }
 203         }
 204         return node != NULL;
 205 }
 206
 207 /* Obtain a reference to a local port for the given sock,
 208  * if snum is zero it means select any available local port.
      *
      * Returns 0 when the socket ends up bound to a port, and 1 when the
      * port range is exhausted, the requested port conflicts with another
      * owner, or no bind bucket could be allocated.
      * BH is disabled for the duration of the call.
 209  */
 210 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
 211 {
 212         struct tcp_bind_hashbucket *head;
 213         struct hlist_node *node;
 214         struct tcp_bind_bucket *tb;
 215         int ret;
 216
 217         local_bh_disable();
 218         if (!snum) {
 219                 int low = sysctl_local_port_range[0];
 220                 int high = sysctl_local_port_range[1];
 221                 int remaining = (high - low) + 1;
 222                 int rover;
 223
             /* The rover is shared state, serialised by
              * tcp_portalloc_lock.
              */
 224                 spin_lock(&tcp_portalloc_lock);
 225                 if (tcp_port_rover < low)
 226                         rover = low;
 227                 else
 228                         rover = tcp_port_rover;
             /* Try at most (high - low) + 1 ports, starting after the
              * rover and wrapping from high back to low.
              */
 229                 do {
 230                         rover++;
 231                         if (rover > high)
 232                                 rover = low;
 233                         head = &tcp_bhash[tcp_bhashfn(rover)];
 234                         spin_lock(&head->lock);
 235                         tb_for_each(tb, node, &head->chain)
 236                                 if (tb->port == rover)
 237                                         goto next;
                     /* No bucket exists for this port: stop searching
                      * and keep head->lock held.
                      */
 238                         break;
 239                 next:
 240                         spin_unlock(&head->lock);
 241                 } while (--remaining > 0);
 242                 tcp_port_rover = rover;
 243                 spin_unlock(&tcp_portalloc_lock);
 244
 245                 /* Exhausted local port range during search? */
 246                 ret = 1;
             /* On exhaustion every chain lock was dropped at "next",
              * so only BH has to be re-enabled.
              */
 247                 if (remaining <= 0)
 248                         goto fail;
 249
 250                 /* OK, here is the one we will use.  HEAD is
 251                  * non-NULL and we hold its lock.
 252                  */
 253                 snum = rover;
 254         } else {
 255                 head = &tcp_bhash[tcp_bhashfn(snum)];
 256                 spin_lock(&head->lock);
 257                 tb_for_each(tb, node, &head->chain)
 258                         if (tb->port == snum)
 259                                 goto tb_found;
 260         }
 261         tb = NULL;
 262         goto tb_not_found;
 263 tb_found:
     /* A bucket for the requested port exists; head->lock is held. */
 264         if (!hlist_empty(&tb->owners)) {
             /* sk_reuse values above 1 skip every conflict check. */
 265                 if (sk->sk_reuse > 1)
 266                         goto success;
             /* Bucket marked fast-reusable and this socket asks for
              * reuse without listening: no need to walk the owners.
              */
 267                 if (tb->fastreuse > 0 &&
 268                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
 269                         goto success;
 270                 } else {
 271                         ret = 1;
 272                         if (tcp_bind_conflict(sk, tb))
 273                                 goto fail_unlock;
 274                 }
 275         }
 276 tb_not_found:
 277         ret = 1;
 278         if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
 279                 goto fail_unlock;
     /* Maintain the fastreuse flag: an empty bucket takes it from this
      * socket, a populated one can only lose it.
      */
 280         if (hlist_empty(&tb->owners)) {
 281                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
 282                         tb->fastreuse = 1;
 283                 else
 284                         tb->fastreuse = 0;
 285         } else if (tb->fastreuse &&
 286                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
 287                 tb->fastreuse = 0;
 288 success:
     /* Bind only if the socket is not already attached to a bucket. */
 289         if (!tcp_sk(sk)->bind_hash)
 290                 tcp_bind_hash(sk, tb, snum);
 291         BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
 292         ret = 0;
 293
 294 fail_unlock:
 295         spin_unlock(&head->lock);
 296 fail:
 297         local_bh_enable();
 298         return ret;
 299 }
 300
 301 /* Get rid of any references to a local port held by the
 302  * given sock.
 303  */
 304 static void __tcp_put_port(struct sock *sk)
 305 {
 306         struct inet_sock *inet = inet_sk(sk);
 307         struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
 308         struct tcp_bind_bucket *tb;
 309
 310         spin_lock(&head->lock);
 311         tb = tcp_sk(sk)->bind_hash;
 312         __sk_del_bind_node(sk);
 313         tcp_sk(sk)->bind_hash = NULL;
 314         inet->num = 0;
 315         tcp_bucket_destroy(tb);
 316         spin_unlock(&head->lock);
 317 }
 318
 319 void tcp_put_port(struct sock *sk)
 320 {
 321         local_bh_disable();
 322         __tcp_put_port(sk);
 323         local_bh_enable();
 324 }
 325
 326 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 327  * Look, when several writers sleep and reader wakes them up, all but one
 328  * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 329  * this, _but_ remember, it adds useless work on UP machines (wake up each
 330  * exclusive lock release). It should be ifdefed really.
      *
      * tcp_listen_wlock() returns with tcp_lhash_lock write-held after it
      * has observed tcp_lhash_users == 0, sleeping on tcp_lhash_wait while
      * the counter is non-zero.  The first acquisition is a plain
      * write_lock(); both callers in this file disable BH before calling.
 331  */
 332
 333 void tcp_listen_wlock(void)
 334 {
 335         write_lock(&tcp_lhash_lock);
 336
 337         if (atomic_read(&tcp_lhash_users)) {
 338                 DEFINE_WAIT(wait);
 339
 340                 for (;;) {
                     /* Queue ourselves before re-checking the counter. */
 341                         prepare_to_wait_exclusive(&tcp_lhash_wait,
 342                                                 &wait, TASK_UNINTERRUPTIBLE);
 343                         if (!atomic_read(&tcp_lhash_users))
 344                                 break;
                     /* Drop the lock with the _bh variant before
                      * sleeping and retake it the same way afterwards.
                      */
 345                         write_unlock_bh(&tcp_lhash_lock);
 346                         schedule();
 347                         write_lock_bh(&tcp_lhash_lock);
 348                 }
 349
 350                 finish_wait(&tcp_lhash_wait, &wait);
 351         }
 352 }
 353
     /* Insert an unhashed socket into the listening hash (only when
      * @listen_possible is set and the socket is in TCP_LISTEN) or into the
      * established hash otherwise, and bump the protocol's use count.
      * Uses non-BH lock variants; the callers in this file disable BH first.
      */
 354 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
 355 {
 356         struct hlist_head *list;
 357         rwlock_t *lock;
 358
 359         BUG_TRAP(sk_unhashed(sk));
 360         if (listen_possible && sk->sk_state == TCP_LISTEN) {
 361                 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
 362                 lock = &tcp_lhash_lock;
 363                 tcp_listen_wlock();
 364         } else {
             /* Remember the bucket index in sk_hashent; tcp_unhash()
              * uses it to find the bucket lock again.
              */
 365                 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
 366                 lock = &tcp_ehash[sk->sk_hashent].lock;
 367                 write_lock(lock);
 368         }
 369         __sk_add_node(sk, list);
 370         sock_prot_inc_use(sk->sk_prot);
 371         write_unlock(lock);
     /* Wake sleepers on tcp_lhash_wait (tcp_listen_wlock() waits there)
      * after the listening hash was modified.
      */
 372         if (listen_possible && sk->sk_state == TCP_LISTEN)
 373                 wake_up(&tcp_lhash_wait);
 374 }
 375
 376 static void tcp_v4_hash(struct sock *sk)
 377 {
 378         if (sk->sk_state != TCP_CLOSE) {
 379                 local_bh_disable();
 380                 __tcp_v4_hash(sk, 1);
 381                 local_bh_enable();
 382         }
 383 }
 384
     /* Remove a socket from whichever hash table it is on (listening hash
      * for TCP_LISTEN, otherwise the established bucket recorded in
      * sk_hashent) and drop the protocol's use count if it was linked.
      * A socket that is already unhashed is left alone.
      */
 385 void tcp_unhash(struct sock *sk)
 386 {
 387         rwlock_t *lock;
 388
 389         if (sk_unhashed(sk))
 390                 goto ende;
 391
 392         if (sk->sk_state == TCP_LISTEN) {
             /* tcp_listen_wlock() starts with a plain write_lock(), so
              * BH is disabled here first; write_unlock_bh() below
              * re-enables it.
              */
 393                 local_bh_disable();
 394                 tcp_listen_wlock();
 395                 lock = &tcp_lhash_lock;
 396         } else {
 397                 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
 398                 lock = &head->lock;
 399                 write_lock_bh(&head->lock);
 400         }
 401
 402         if (__sk_del_node_init(sk))
 403                 sock_prot_dec_use(sk->sk_prot);
 404         write_unlock_bh(lock);
 405
 406  ende:
     /* For a listening socket, wake sleepers on tcp_lhash_wait; this is
      * also reached when the socket was already unhashed.
      */
 407         if (sk->sk_state == TCP_LISTEN)
 408                 wake_up(&tcp_lhash_wait);
 409 }
 410
 411 /* Don't inline this cruft.  Here are some nice properties to
 412  * exploit here.  The BSD API does not allow a listening TCP
 413  * to specify the remote port nor the remote address for the
 414  * connection.  So always assume those are both wildcarded
 415  * during the search since they can never be otherwise.
 416  */
 417 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
 418                                              unsigned short hnum, int dif)
 419 {
 420         struct sock *result = NULL, *sk;
 421         struct hlist_node *node;
 422         int score, hiscore;
 423
 424         hiscore=-1;
 425         sk_for_each(sk, node, head) {
 426                 struct inet_sock *inet = inet_sk(sk);
 427
 428                 if (inet->num == hnum && !ipv6_only_sock(sk)) {
 429                         __u32 rcv_saddr = inet->rcv_saddr;
 430
 431                         score = (sk->sk_family == PF_INET ? 1 : 0);
 432                         if (rcv_saddr) {
 433                                 if (rcv_saddr != daddr)
 434                                         continue;
 435                                 score+=2;
 436                         }
 437                         if (sk->sk_bound_dev_if) {
 438                                 if (sk->sk_bound_dev_if != dif)
 439                                         continue;
 440                                 score+=2;
 441                         }
 442                         if (score == 5)
 443                                 return sk;
 444                         if (score > hiscore) {
 445                                 hiscore = score;
 446                                 result = sk;
 447                         }
 448                 }
 449         }
 450         return result;
 451 }
 452
 453 /* Optimize the common listener case. */
 454 static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
 455                 unsigned short hnum, int dif)
 456 {
 457         struct sock *sk = NULL;
 458         struct hlist_head *head;
 459
 460         read_lock(&tcp_lhash_lock);
 461         head = &tcp_listening_hash[tcp_lhashfn(hnum)];
 462         if (!hlist_empty(head)) {
 463                 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
 464
 465                 if (inet->num == hnum && !sk->sk_node.next &&
 466                     (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
 467                     (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
 468                     !sk->sk_bound_dev_if)
 469                         goto sherry_cache;
 470                 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
 471         }
 472         if (sk) {
 473 sherry_cache:
 474                 sock_hold(sk);
 475         }
 476         read_unlock(&tcp_lhash_lock);
 477         return sk;
 478 }
 479
 480 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 481  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 482  *
 483  * Local BH must be disabled here.
 484  */
 485
 486 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
 487                                                        u32 daddr, u16 hnum,
 488                                                        int dif)
 489 {
 490         struct tcp_ehash_bucket *head;
 491         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
 492         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
 493         struct sock *sk;
 494         struct hlist_node *node;
 495         /* Optimize here for direct hit, only listening connections can
 496          * have wildcards anyways.
 497          */
 498         int hash = tcp_hashfn(daddr, hnum, saddr, sport);
 499         head = &tcp_ehash[hash];
 500         read_lock(&head->lock);
 501         sk_for_each(sk, node, &head->chain) {
 502                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
 503                         goto hit; /* You sunk my battleship! */
 504         }
 505
 506         /* Must check for a TIME_WAIT'er before going to listener hash. */
 507         sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
 508                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
 509                         goto hit;
 510         }
 511         sk = NULL;
 512 out:
 513         read_unlock(&head->lock);
 514         return sk;
 515 hit:
 516         sock_hold(sk);
 517         goto out;
 518 }
 519
 520 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
 521                                            u32 daddr, u16 hnum, int dif)
 522 {
 523         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
 524                                                       daddr, hnum, dif);
 525
 526         return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
 527 }
 528
 529 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
 530                                   u16 dport, int dif)
 531 {
 532         struct sock *sk;
 533
 534         local_bh_disable();
 535         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
 536         local_bh_enable();
 537
 538         return sk;
 539 }
 540
 541 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
 542
 543 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 544 {
 545         return secure_tcp_sequence_number(skb->nh.iph->daddr,
 546                                           skb->nh.iph->saddr,
 547                                           skb->h.th->dest,
 548                                           skb->h.th->source);
 549 }
 550
 551 /* called with local bh disabled */
 552 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 553                                       struct tcp_tw_bucket **twp)
 554 {
 555         struct inet_sock *inet = inet_sk(sk);
 556         u32 daddr = inet->rcv_saddr;
 557         u32 saddr = inet->daddr;
 558         int dif = sk->sk_bound_dev_if;
 559         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
 560         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
 561         int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
 562         struct tcp_ehash_bucket *head = &tcp_ehash[hash];
 563         struct sock *sk2;
 564         struct hlist_node *node;
 565         struct tcp_tw_bucket *tw;
 566
 567         write_lock(&head->lock);
 568
 569         /* Check TIME-WAIT sockets first. */
 570         sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
 571                 tw = (struct tcp_tw_bucket *)sk2;
 572
 573                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
 574                         struct tcp_sock *tp = tcp_sk(sk);
 575
 576                         /* With PAWS, it is safe from the viewpoint
 577                            of data integrity. Even without PAWS it
 578                            is safe provided sequence spaces do not
 579                            overlap i.e. at data rates <= 80Mbit/sec.
 580
 581                            Actually, the idea is close to VJ's one,
 582                            only timestamp cache is held not per host,
 583                            but per port pair and TW bucket is used
 584                            as state holder.
 585
 586                            If TW bucket has been already destroyed we
 587                            fall back to VJ's scheme and use initial
 588                            timestamp retrieved from peer table.
 589                          */
 590                         if (tw->tw_ts_recent_stamp &&
 591                             (!twp || (sysctl_tcp_tw_reuse &&
 592                                       xtime.tv_sec -
 593                                       tw->tw_ts_recent_stamp > 1))) {
 594                                 if ((tp->write_seq =
 595                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
 596                                         tp->write_seq = 1;
 597                                 tp->rx_opt.ts_recent       = tw->tw_ts_recent;
 598                                 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
 599                                 sock_hold(sk2);
 600                                 goto unique;
 601                         } else
 602                                 goto not_unique;
 603                 }
 604         }
 605         tw = NULL;
 606
 607         /* And established part... */
 608         sk_for_each(sk2, node, &head->chain) {
 609                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
 610                         goto not_unique;
 611         }
 612
 613 unique:
 614         /* Must record num and sport now. Otherwise we will see
 615          * in hash table socket with a funny identity. */
 616         inet->num = lport;
 617         inet->sport = htons(lport);
 618         sk->sk_hashent = hash;
 619         BUG_TRAP(sk_unhashed(sk));
 620         __sk_add_node(sk, &head->chain);
 621         sock_prot_inc_use(sk->sk_prot);
 622         write_unlock(&head->lock);
 623
 624         if (twp) {
 625                 *twp = tw;
 626                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 627         } else if (tw) {
 628                 /* Silly. Should hash-dance instead... */
 629                 tcp_tw_deschedule(tw);
 630                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 631
 632                 tcp_tw_put(tw);
 633         }
 634
 635         return 0;
 636
 637 not_unique:
 638         write_unlock(&head->lock);
 639         return -EADDRNOTAVAIL;
 640 }
 641
 642 static inline u32 connect_port_offset(const struct sock *sk)
 643 {
 644         const struct inet_sock *inet = inet_sk(sk);
 645
 646         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
 647                                          inet->dport);
 648 }
 649
 650 /*
 651  * Bind a port for a connect operation and hash it.
 652  */
 653 static inline int tcp_v4_hash_connect(struct sock *sk)
 654 {
 655         unsigned short snum = inet_sk(sk)->num;
 656         struct tcp_bind_hashbucket *head;
 657         struct tcp_bind_bucket *tb;
 658         int ret;
 659
 660         if (!snum) {
 661                 int low = sysctl_local_port_range[0];
 662                 int high = sysctl_local_port_range[1];
 663                 int range = high - low;
 664                 int i;
 665                 int port;
 666                 static u32 hint;
 667                 u32 offset = hint + connect_port_offset(sk);
 668                 struct hlist_node *node;
 669                 struct tcp_tw_bucket *tw = NULL;
 670
 671                 local_bh_disable();
 672                 for (i = 1; i <= range; i++) {
 673                         port = low + (i + offset) % range;
 674                         head = &tcp_bhash[tcp_bhashfn(port)];
 675                         spin_lock(&head->lock);
 676
 677                         /* Does not bother with rcv_saddr checks,
 678                          * because the established check is already
 679                          * unique enough.
 680                          */
 681                         tb_for_each(tb, node, &head->chain) {
 682                                 if (tb->port == port) {
 683                                         BUG_TRAP(!hlist_empty(&tb->owners));
 684                                         if (tb->fastreuse >= 0)
 685                                                 goto next_port;
 686                                         if (!__tcp_v4_check_established(sk,
 687                                                                         port,
 688                                                                         &tw))
 689                                                 goto ok;
 690                                         goto next_port;
 691                                 }
 692                         }
 693
 694                         tb = tcp_bucket_create(head, port);
 695                         if (!tb) {
 696                                 spin_unlock(&head->lock);
 697                                 break;
 698                         }
 699                         tb->fastreuse = -1;
 700                         goto ok;
 701
 702                 next_port:
 703                         spin_unlock(&head->lock);
 704                 }
 705                 local_bh_enable();
 706
 707                 return -EADDRNOTAVAIL;
 708
 709 ok:
 710                 hint += i;
 711
 712                 /* Head lock still held and bh's disabled */
 713                 tcp_bind_hash(sk, tb, port);
 714                 if (sk_unhashed(sk)) {
 715                         inet_sk(sk)->sport = htons(port);
 716                         __tcp_v4_hash(sk, 0);
 717                 }
 718                 spin_unlock(&head->lock);
 719
 720                 if (tw) {
 721                         tcp_tw_deschedule(tw);
 722                         tcp_tw_put(tw);
 723                 }
 724
 725                 ret = 0;
 726                 goto out;
 727         }
 728
 729         head  = &tcp_bhash[tcp_bhashfn(snum)];
 730         tb  = tcp_sk(sk)->bind_hash;
 731         spin_lock_bh(&head->lock);
 732         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
 733                 __tcp_v4_hash(sk, 0);
 734                 spin_unlock_bh(&head->lock);
 735                 return 0;
 736         } else {
 737                 spin_unlock(&head->lock);
 738                 /* No definite answer... Walk to established hash table */
 739                 ret = __tcp_v4_check_established(sk, snum, NULL);
 740 out:
 741                 local_bh_enable();
 742                 return ret;
 743         }
 744 }
 745
 746 /* This will initiate an outgoing connection. */
 747 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 748 {
 749         struct inet_sock *inet = inet_sk(sk);
 750         struct tcp_sock *tp = tcp_sk(sk);
 751         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 752         struct rtable *rt;
 753         u32 daddr, nexthop;
 754         int tmp;
 755         int err;
 756
 757         if (addr_len < sizeof(struct sockaddr_in))
 758                 return -EINVAL;
 759
 760         if (usin->sin_family != AF_INET)
 761                 return -EAFNOSUPPORT;
 762
 763         nexthop = daddr = usin->sin_addr.s_addr;
 764         if (inet->opt && inet->opt->srr) {
 765                 if (!daddr)
 766                         return -EINVAL;
 767                 nexthop = inet->opt->faddr;
 768         }
 769
 770         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 771                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 772                                IPPROTO_TCP,
 773                                inet->sport, usin->sin_port, sk);
 774         if (tmp < 0)
 775                 return tmp;
 776
 777         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 778                 ip_rt_put(rt);
 779                 return -ENETUNREACH;
 780         }
 781
 782         if (!inet->opt || !inet->opt->srr)
 783                 daddr = rt->rt_dst;
 784
 785         if (!inet->saddr)
 786                 inet->saddr = rt->rt_src;
 787         inet->rcv_saddr = inet->saddr;
 788
 789         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
 790                 /* Reset inherited state */
 791                 tp->rx_opt.ts_recent       = 0;
 792                 tp->rx_opt.ts_recent_stamp = 0;
 793                 tp->write_seq              = 0;
 794         }
 795
 796         if (sysctl_tcp_tw_recycle &&
 797             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 798                 struct inet_peer *peer = rt_get_peer(rt);
 799
 800                 /* VJ's idea. We save last timestamp seen from
 801                  * the destination in peer table, when entering state TIME-WAIT
 802                  * and initialize rx_opt.ts_recent from it, when trying new connection.
 803                  */
 804
 805                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
 806                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 807                         tp->rx_opt.ts_recent = peer->tcp_ts;
 808                 }
 809         }
 810
 811         inet->dport = usin->sin_port;
 812         inet->daddr = daddr;
 813
 814         tp->ext_header_len = 0;
 815         if (inet->opt)
 816                 tp->ext_header_len = inet->opt->optlen;
 817
 818         tp->rx_opt.mss_clamp = 536;
 819
 820         /* Socket identity is still unknown (sport may be zero).
 821          * However we set state to SYN-SENT and not releasing socket
 822          * lock select source port, enter ourselves into the hash tables and
 823          * complete initialization after this.
 824          */
 825         tcp_set_state(sk, TCP_SYN_SENT);
 826         err = tcp_v4_hash_connect(sk);
 827         if (err)
 828                 goto failure;
 829
 830         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
 831         if (err)
 832                 goto failure;
 833
 834         /* OK, now commit destination to socket.  */
 835         __sk_dst_set(sk, &rt->u.dst);
 836         tcp_v4_setup_caps(sk, &rt->u.dst);
 837
 838         if (!tp->write_seq)
 839                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 840                                                            inet->daddr,
 841                                                            inet->sport,
 842                                                            usin->sin_port);
 843
 844         inet->id = tp->write_seq ^ jiffies;
 845
 846         err = tcp_connect(sk);
 847         rt = NULL;
 848         if (err)
 849                 goto failure;
 850
 851         return 0;
 852
 853 failure:
 854         /* This unhashes the socket and releases the local port, if necessary. */
 855         tcp_set_state(sk, TCP_CLOSE);
 856         ip_rt_put(rt);
 857         sk->sk_route_caps = 0;
 858         inet->dport = 0;
 859         return err;
 860 }
 861
 862 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
 863 {
             /* Fetch rt_iif from the route attached to @skb.  skb->dst is cast
              * straight to struct rtable and dereferenced without a NULL check,
              * so the caller must hand in an skb that already carries a route.
              * NOTE(review): rt_iif is presumably the receiving interface
              * index -- confirm against struct rtable.
              */
 864         return ((struct rtable *)skb->dst)->rt_iif;
 865 }
 866
 867 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
 868 {
             /* Bucket index for the SYN table: jhash_2words() over the remote
              * address and port, seeded with @rnd, then masked with
              * TCP_SYNQ_HSIZE - 1.
              * NOTE(review): the mask only gives an even spread if
              * TCP_SYNQ_HSIZE is a power of two -- confirm at its definition.
              */
 869         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
 870 }
 871
 872 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
 873                                               struct request_sock ***prevp,
 874                                               __u16 rport,
 875                                               __u32 raddr, __u32 laddr)
 876 {
             /* Look up a pending connection request on a listening socket.
              *
              * Walks the syn_table chain selected by tcp_v4_synq_hash(@raddr,
              * @rport) and stops at the first entry whose remote port, remote
              * address and local address match and whose family passes
              * TCP_INET_FAMILY().
              *
              * Returns the matching request, or NULL when the chain is
              * exhausted.  *@prevp is written only on a hit; it then holds the
              * address of the link that points at the request, which lets the
              * caller unlink it.  On a miss *@prevp is left untouched.
              */
 877         struct tcp_listen_opt *lopt = tp->listen_opt;
 878         struct request_sock *req, **prev;
 879
 880         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
 881              (req = *prev) != NULL;
 882              prev = &req->dl_next) {
 883                 const struct inet_request_sock *ireq = inet_rsk(req);
 884
 885                 if (ireq->rmt_port == rport &&
 886                     ireq->rmt_addr == raddr &&
 887                     ireq->loc_addr == laddr &&
 888                     TCP_INET_FAMILY(req->rsk_ops->family)) {
                             /* A queued request is not expected to own a
                              * child socket yet.
                              */
 889                         BUG_TRAP(!req->sk);
 890                         *prevp = prev;
 891                         break;
 892                 }
 893         }
 894
             /* req is NULL here if the loop ran off the end of the chain. */
 895         return req;
 896 }
 897
 898 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
 899 {
             /* Queue @req on the SYN table of listening socket @sk.
              *
              * The request is initialised first (expiry = now +
              * TCP_TIMEOUT_INIT, zero retransmits, no child socket) and linked
              * in front of the current chain head.  Only the store that makes
              * it the new head of bucket h is done under syn_wait_lock.
              * NOTE(review): the old head is read outside the lock, which
              * presumably relies on the caller serialising writers -- confirm.
              */
 900         struct tcp_sock *tp = tcp_sk(sk);
 901         struct tcp_listen_opt *lopt = tp->listen_opt;
 902         u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
 903
 904         req->expires = jiffies + TCP_TIMEOUT_INIT;
 905         req->retrans = 0;
 906         req->sk = NULL;
 907         req->dl_next = lopt->syn_table[h];
 908
 909         write_lock(&tp->syn_wait_lock);
 910         lopt->syn_table[h] = req;
 911         write_unlock(&tp->syn_wait_lock);
 912
             /* Let the listener account for the newly queued request. */
 913         tcp_synq_added(sk);
 914 }
 915
 916
 917 /*
 918  * This routine does path mtu discovery as defined in RFC1191.
      *
      * @sk:  socket the error applies to
      * @iph: IP header passed in by the caller (not referenced in the body)
      * @mtu: new path MTU value passed in by the caller
      *
      * Nothing is done for a TCP_LISTEN socket or when __sk_dst_check()
      * returns no route.  Otherwise the route's PMTU is updated, a soft
      * EMSGSIZE error is recorded when @mtu is still below the route MTU
      * and ip_dont_fragment() is true, and -- if PMTU discovery is not
      * disabled on the socket and tp->pmtu_cookie exceeds the route MTU --
      * the MSS is resynchronised and tcp_simple_retransmit() is called.
 919  */
 920 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
 921                                      u32 mtu)
 922 {
 923         struct dst_entry *dst;
 924         struct inet_sock *inet = inet_sk(sk);
 925         struct tcp_sock *tp = tcp_sk(sk);
 926
 927         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 928          * sent out by Linux are always <576bytes so they should go through
 929          * unfragmented).
 930          */
 931         if (sk->sk_state == TCP_LISTEN)
 932                 return;
 933
 934         /* We don't check in the destentry if pmtu discovery is forbidden
 935          * on this route. We just assume that no packet_too_big packets
 936          * are sent back when pmtu discovery is not active.
 937          * There is a small race when the user changes this flag in the
 938          * route, but I think that's acceptable.
 939          */
 940         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 941                 return;
 942
 943         dst->ops->update_pmtu(dst, mtu);
 944
 945         /* Something is about to be wrong... Remember soft error
 946          * for the case, if this connection will not be able to recover.
 947          */
 948         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 949                 sk->sk_err_soft = EMSGSIZE;
 950
             /* From here on work with the MTU the route actually holds. */
 951         mtu = dst_mtu(dst);
 952
 953         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 954             tp->pmtu_cookie > mtu) {
 955                 tcp_sync_mss(sk, mtu);
 956
 957                 /* Resend the TCP packet because it's
 958                  * clear that the old packet has been
 959                  * dropped. This is the new "fast" path mtu
 960                  * discovery.
 961                  */
 962                 tcp_simple_retransmit(sk);
 963         } /* else let the usual retransmit timer handle it */
 964 }
 965
 966 /*
 967  * This routine is called by the ICMP module when it gets some
 968  * sort of error condition.  If err < 0 then the socket should
 969  * be closed and the error returned to the user.  If err > 0
 970  * it's just the icmp type << 8 | icmp code.  After adjustment
 971  * header points to the first 8 bytes of the tcp header.  We need
 972  * to find the appropriate port.
 973  *
 974  * The locking strategy used here is very "optimistic". When
 975  * someone else accesses the socket the ICMP is just dropped
 976  * and for some paths there is no check at all.
 977  * A more general error queue to queue errors for later handling
 978  * is probably better.
 979  *
      * NOTE(review): the "err" argument described above is not part of the
      * current signature; the ICMP type and code are read from
      * skb->h.icmph instead.
      *
      * @skb:  ICMP error; skb->data points at the quoted IP header, which is
      *        followed by the start of the offending TCP header
      * @info: extra value from the ICMP layer; only used as the MTU for
      *        ICMP_FRAG_NEEDED
      *
      * The quoted addresses and ports are used to find the socket.  Errors
      * for unknown, TIME_WAIT or TCP_CLOSE sockets, and errors whose quoted
      * sequence number lies outside [snd_una, snd_nxt], are dropped.  The
      * ICMP type/code is then mapped to an errno and handled according to
      * the socket state (see the second switch below).
 980  */
 981
 982 void tcp_v4_err(struct sk_buff *skb, u32 info)
 983 {
 984         struct iphdr *iph = (struct iphdr *)skb->data;
 985         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 986         struct tcp_sock *tp;
 987         struct inet_sock *inet;
 988         int type = skb->h.icmph->type;
 989         int code = skb->h.icmph->code;
 990         struct sock *sk;
 991         __u32 seq;
 992         int err;
 993
             /* Need the full quoted IP header plus 8 bytes of TCP header
              * (ports and sequence number) before th may be dereferenced.
              */
 994         if (skb->len < (iph->ihl << 2) + 8) {
 995                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 996                 return;
 997         }
 998
             /* The quoted packet is one we sent: its destination is the remote
              * end and its source is the local end.
              */
 999         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
1000                            th->source, tcp_v4_iif(skb));
1001         if (!sk) {
1002                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1003                 return;
1004         }
1005         if (sk->sk_state == TCP_TIME_WAIT) {
1006                 tcp_tw_put((struct tcp_tw_bucket *)sk);
1007                 return;
1008         }
1009
1010         bh_lock_sock(sk);
1011         /* If too many ICMPs get dropped on busy
1012          * servers this needs to be solved differently.
1013          */
1014         if (sock_owned_by_user(sk))
1015                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1016
1017         if (sk->sk_state == TCP_CLOSE)
1018                 goto out;
1019
1020         tp = tcp_sk(sk);
1021         seq = ntohl(th->seq);
1022         if (sk->sk_state != TCP_LISTEN &&
1023             !between(seq, tp->snd_una, tp->snd_nxt)) {
                     /* Use the _BH counter like every other statistic in
                      * this function, including the same counter in the
                      * TCP_LISTEN case below.
                      */
1024                 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1025                 goto out;
1026         }
1027
             /* Step 1: translate the ICMP type/code into an errno, or bail. */
1028         switch (type) {
1029         case ICMP_SOURCE_QUENCH:
1030                 /* Just silently ignore these. */
1031                 goto out;
1032         case ICMP_PARAMETERPROB:
1033                 err = EPROTO;
1034                 break;
1035         case ICMP_DEST_UNREACH:
1036                 if (code > NR_ICMP_UNREACH)
1037                         goto out;
1038
1039                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1040                         if (!sock_owned_by_user(sk))
1041                                 do_pmtu_discovery(sk, iph, info);
1042                         goto out;
1043                 }
1044
1045                 err = icmp_err_convert[code].errno;
1046                 break;
1047         case ICMP_TIME_EXCEEDED:
1048                 err = EHOSTUNREACH;
1049                 break;
1050         default:
1051                 goto out;
1052         }
1053
             /* Step 2: act on the error according to the socket state.  States
              * not listed fall through to the code after the switch.
              */
1054         switch (sk->sk_state) {
1055                 struct request_sock *req, **prev;
1056         case TCP_LISTEN:
1057                 if (sock_owned_by_user(sk))
1058                         goto out;
1059
1060                 req = tcp_v4_search_req(tp, &prev, th->dest,
1061                                         iph->daddr, iph->saddr);
1062                 if (!req)
1063                         goto out;
1064
1065                 /* ICMPs are not backlogged, hence we cannot get
1066                    an established socket here.
1067                  */
1068                 BUG_TRAP(!req->sk);
1069
1070                 if (seq != tcp_rsk(req)->snt_isn) {
1071                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1072                         goto out;
1073                 }
1074
1075                 /*
1076                  * Still in SYN_RECV, just remove it silently.
1077                  * There is no good way to pass the error to the newly
1078                  * created socket, and POSIX does not want network
1079                  * errors returned from accept().
1080                  */
1081                 tcp_synq_drop(sk, req, prev);
1082                 goto out;
1083
1084         case TCP_SYN_SENT:
1085         case TCP_SYN_RECV:  /* Cannot happen.
1086                                It can f.e. if SYNs crossed.
1087                              */
1088                 if (!sock_owned_by_user(sk)) {
1089                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1090                         sk->sk_err = err;
1091
1092                         sk->sk_error_report(sk);
1093
1094                         tcp_done(sk);
1095                 } else {
                             /* Socket is busy in process context: only
                              * remember the error as a soft error.
                              */
1096                         sk->sk_err_soft = err;
1097                 }
1098                 goto out;
1099         }
1100
1101         /* If we've already connected we will keep trying
1102          * until we time out, or the user gives up.
1103          *
1104          * rfc1122 4.2.3.9 allows to consider as hard errors
1105          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1106          * but it is obsoleted by pmtu discovery).
1107          *
1108          * Note, that in modern internet, where routing is unreliable
1109          * and in each dark corner broken firewalls sit, sending random
1110          * errors ordered by their masters even this two messages finally lose
1111          * their original sense (even Linux sends invalid PORT_UNREACHs)
1112          *
1113          * Now we are in compliance with RFCs.
1114          *                                                      --ANK (980905)
1115          */
1116
1117         inet = inet_sk(sk);
1118         if (!sock_owned_by_user(sk) && inet->recverr) {
1119                 sk->sk_err = err;
1120                 sk->sk_error_report(sk);
1121         } else  { /* Only an error on timeout */
1122                 sk->sk_err_soft = err;
1123         }
1124
1125 out:
             /* Undo bh_lock_sock() and drop the reference held on sk. */
1126         bh_unlock_sock(sk);
1127         sock_put(sk);
1128 }
1129
1130 /* This routine computes an IPv4 TCP checksum. */
1131 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1132                        struct sk_buff *skb)
1133 {
             /* Fill in th->check for an outgoing segment of @len bytes, using
              * the socket's saddr/daddr for the pseudo-header.
              */
1134         struct inet_sock *inet = inet_sk(sk);
1135
1136         if (skb->ip_summed == CHECKSUM_HW) {
                     /* Store only the complemented pseudo-header sum (zero
                      * base) and record the offset of the check field in
                      * skb->csum.
                      * NOTE(review): presumably the device completes the sum
                      * over the segment -- confirm.
                      */
1137                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1138                 skb->csum = offsetof(struct tcphdr, check);
1139         } else {
                     /* Software path: add the TCP header (doff * 4 bytes) to
                      * the sum already accumulated in skb->csum, then fold in
                      * the pseudo-header.
                      */
1140                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1141                                          csum_partial((char *)th,
1142                                                       th->doff << 2,
1143                                                       skb->csum));
1144         }
1145 }
1146
1147 /*
1148  *      This routine will send an RST to the other tcp.
1149  *
1150  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1151  *                    for reset.
1152  *      Answer: if a packet caused RST, it is not for a socket
1153  *              existing in our system, if it is matched to a socket,
1154  *              it is just duplicate segment or bug in other side's TCP.
1155  *              So that we build reply only basing on parameters
1156  *              arrived with segment.
1157  *      Exception: precedence violation. We do not implement it in any case.
1158  */
1159
1160 static void tcp_v4_send_reset(struct sk_buff *skb)
1161 {
             /* Build a bare RST on the stack from the header fields of the
              * offending segment @skb and send it with ip_send_reply() through
              * tcp_socket->sk.  No reply is sent for an incoming RST or when
              * the segment's route is not of type RTN_LOCAL.
              */
1162         struct tcphdr *th = skb->h.th;
1163         struct tcphdr rth;
1164         struct ip_reply_arg arg;
1165
1166         /* Never send a reset in response to a reset. */
1167         if (th->rst)
1168                 return;
1169
1170         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1171                 return;
1172
1173         /* Swap the send and the receive. */
1174         memset(&rth, 0, sizeof(struct tcphdr));
1175         rth.dest   = th->source;
1176         rth.source = th->dest;
1177         rth.doff   = sizeof(struct tcphdr) / 4;
1178         rth.rst    = 1;
1179
             /* If the segment carried an ACK, the reset takes its sequence
              * number from that ACK.  Otherwise the reset uses sequence 0 (from
              * the memset) and acknowledges everything the segment occupied:
              * SYN and FIN count as one each, plus skb->len minus the TCP
              * header length.
              */
1180         if (th->ack) {
1181                 rth.seq = th->ack_seq;
1182         } else {
1183                 rth.ack = 1;
1184                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1185                                     skb->len - (th->doff << 2));
1186         }
1187
1188         memset(&arg, 0, sizeof arg);
1189         arg.iov[0].iov_base = (unsigned char *)&rth;
1190         arg.iov[0].iov_len  = sizeof rth;
             /* Pseudo-header sum seeded from the incoming packet's daddr and
              * saddr; csumoffset is the position of the check field counted in
              * 16-bit words.
              */
1191         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1192                                       skb->nh.iph->saddr, /*XXX*/
1193                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
1194         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1195
1196         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1197
1198         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1199         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1200 }
1201
1202 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1203    outside of socket context, is certainly ugly. What can I do?
1204  */
1205
1206 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1207                             u32 win, u32 ts)
1208 {
             /* Send a bare ACK in reply to @skb without a full socket.
              *
              * @seq, @ack: sequence and acknowledgement numbers (host order)
              * @win:       window value, written with htons()
              * @ts:        if non-zero, a timestamp option is appended that
              *             echoes @ts and carries tcp_time_stamp as our value
              *
              * The reply is sent with ip_send_reply() through tcp_socket->sk.
              */
1209         struct tcphdr *th = skb->h.th;
1210         struct {
1211                 struct tcphdr th;
1212                 u32 tsopt[3];
1213         } rep;
1214         struct ip_reply_arg arg;
1215
             /* Only the header is cleared; tsopt[] is written below when it is
              * going to be sent, and lies outside iov_len otherwise.
              */
1216         memset(&rep.th, 0, sizeof(struct tcphdr));
1217         memset(&arg, 0, sizeof arg);
1218
1219         arg.iov[0].iov_base = (unsigned char *)&rep;
1220         arg.iov[0].iov_len  = sizeof(rep.th);
1221         if (ts) {
1222                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1223                                      (TCPOPT_TIMESTAMP << 8) |
1224                                      TCPOLEN_TIMESTAMP);
1225                 rep.tsopt[1] = htonl(tcp_time_stamp);
1226                 rep.tsopt[2] = htonl(ts);
1227                 arg.iov[0].iov_len = sizeof(rep);
1228         }
1229
1230         /* Swap the send and the receive. */
1231         rep.th.dest    = th->source;
1232         rep.th.source  = th->dest;
             /* Header length in 32-bit words, including the option if present. */
1233         rep.th.doff    = arg.iov[0].iov_len / 4;
1234         rep.th.seq     = htonl(seq);
1235         rep.th.ack_seq = htonl(ack);
1236         rep.th.ack     = 1;
1237         rep.th.window  = htons(win);
1238
1239         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1240                                       skb->nh.iph->saddr, /*XXX*/
1241                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1242         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1243
1244         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1245
1246         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1247 }
1248
1249 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1250 {
             /* ACK @skb on behalf of a TIME-WAIT entry.  @sk is treated as a
              * struct tcp_tw_bucket; its snd_nxt, rcv_nxt and ts_recent are used
              * as-is and the advertised window is tw_rcv_wnd shifted right by
              * tw_rcv_wscale.  tcp_tw_put() is called on the bucket afterwards.
              * NOTE(review): presumably that drops the reference the caller
              * obtained from the lookup -- confirm.
              */
1251         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1252
1253         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1254                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1255
1256         tcp_tw_put(tw);
1257 }
1258
1259 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1260 {
             /* ACK @skb on behalf of pending request @req: sequence is our
              * initial sequence number + 1, acknowledgement is the peer's
              * initial sequence number + 1, with the request's receive window
              * and most recent timestamp.
              */
1261         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1262                         req->ts_recent);
1263 }
1264
1265 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1266                                           struct request_sock *req)
1267 {
             /* Find an output route for replying to connection request @req.
              *
              * The flow key uses the listener's bound device, TOS flags and
              * source port, and the request's local address and remote port.
              * The destination is the request's remote address, or opt->faddr
              * when the saved IP options contain a source route.
              *
              * Returns the route's dst_entry, or NULL (after counting
              * IPSTATS_MIB_OUTNOROUTES) when the lookup fails or when a strict
              * source route is requested and rt_dst differs from rt_gateway.
              */
1268         struct rtable *rt;
1269         const struct inet_request_sock *ireq = inet_rsk(req);
1270         struct ip_options *opt = inet_rsk(req)->opt;
1271         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1272                             .nl_u = { .ip4_u =
1273                                       { .daddr = ((opt && opt->srr) ?
1274                                                   opt->faddr :
1275                                                   ireq->rmt_addr),
1276                                         .saddr = ireq->loc_addr,
1277                                         .tos = RT_CONN_FLAGS(sk) } },
1278                             .proto = IPPROTO_TCP,
1279                             .uli_u = { .ports =
1280                                        { .sport = inet_sk(sk)->sport,
1281                                          .dport = ireq->rmt_port } } };
1282
1283         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1284                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1285                 return NULL;
1286         }
1287         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
                     /* Give the route back before reporting failure. */
1288                 ip_rt_put(rt);
1289                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1290                 return NULL;
1291         }
1292         return &rt->u.dst;
1293 }
1294
1295 /*
1296  *      Send a SYN-ACK after having received a SYN.
1297  *      This still operates on a request_sock only, not on a big
1298  *      socket.
      *
      *      @sk:  listening socket
      *      @req: pending connection request to answer
      *      @dst: route to use, or NULL to have one looked up with
      *            tcp_v4_route_req()
      *
      *      Returns -1 when no route or no skb could be obtained, otherwise
      *      the result of ip_build_and_send_pkt() with NET_XMIT_CN mapped
      *      to 0.  dst is handed to dst_release() on every exit path,
      *      including the one where the route lookup left it NULL.
1299  */
1300 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1301                               struct dst_entry *dst)
1302 {
1303         const struct inet_request_sock *ireq = inet_rsk(req);
1304         int err = -1;
1305         struct sk_buff * skb;
1306
1307         /* First, grab a route. */
1308         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1309                 goto out;
1310
1311         skb = tcp_make_synack(sk, dst, req);
1312
1313         if (skb) {
1314                 struct tcphdr *th = skb->h.th;
1315
                     /* Checksum over the whole segment (skb->len bytes from
                      * th) with the request's local/remote addresses as the
                      * pseudo-header.
                      */
1316                 th->check = tcp_v4_check(th, skb->len,
1317                                          ireq->loc_addr,
1318                                          ireq->rmt_addr,
1319                                          csum_partial((char *)th, skb->len,
1320                                                       skb->csum));
1321
1322                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1323                                             ireq->rmt_addr,
1324                                             ireq->opt);
                     /* NET_XMIT_CN is not treated as a failure. */
1325                 if (err == NET_XMIT_CN)
1326                         err = 0;
1327         }
1328
1329 out:
1330         dst_release(dst);
1331         return err;
1332 }
1333
1334 /*
1335  *      IPv4 request_sock destructor.
      *
      *      Frees the IP options saved in the request, if any.  Nothing
      *      else belonging to @req is released here.
1336  */
1337 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1338 {
1339         if (inet_rsk(req)->opt)
1340                 kfree(inet_rsk(req)->opt);
1341 }
1342
1343 static inline void syn_flood_warning(struct sk_buff *skb)
1344 {
             /* Log a SYN-flood notice naming the destination port of @skb, but
              * only if more than HZ * 60 jiffies have passed since the last
              * notice.  warntime is a plain function-static shared by every
              * caller and updated without locking.
              */
1345         static unsigned long warntime;
1346
1347         if (time_after(jiffies, (warntime + HZ * 60))) {
1348                 warntime = jiffies;
1349                 printk(KERN_INFO
1350                        "possible SYN flooding on port %d. Sending cookies.\n",
1351                        ntohs(skb->h.th->dest));
1352         }
1353 }
1354
1355 /*
1356  * Save and compile IPv4 options into the request_sock if needed.
      *
      * @sk:  listening socket (not referenced in the body)
      * @skb: incoming segment whose IP options are examined
      *
      * Returns a kmalloc(GFP_ATOMIC) buffer filled in by ip_options_echo(),
      * or NULL when the packet carried no options, the allocation failed,
      * or ip_options_echo() reported an error.  The caller owns a non-NULL
      * result.
1357  */
1358 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1359                                                      struct sk_buff *skb)
1360 {
1361         struct ip_options *opt = &(IPCB(skb)->opt);
1362         struct ip_options *dopt = NULL;
1363
             /* opt is the address of a member, so only optlen decides here. */
1364         if (opt && opt->optlen) {
1365                 int opt_size = optlength(opt);
1366                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1367                 if (dopt) {
1368                         if (ip_options_echo(dopt, skb)) {
1369                                 kfree(dopt);
1370                                 dopt = NULL;
1371                         }
1372                 }
1373         }
1374         return dopt;
1375 }
1376
1377 /*
1378  * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1379  * One SYN_RECV socket costs about 80bytes on a 32bit machine.
1380  * It would be better to replace it with a global counter for all sockets
1381  * but then some measure against one socket starving all other sockets
1382  * would be needed.
1383  *
1384  * It was 128 by default. Experiments with real servers show, that
1385  * it is absolutely not enough even at 100conn/sec. 256 cures most
1386  * of problems. This value is adjusted to 128 for very small machines
1387  * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1388  * Further increasing requires to change hash table size.
1389  */
1390 int sysctl_max_syn_backlog = 256;
1391
     /* Operations table handed to reqsk_alloc() for IPv4 TCP connection
      * requests: object size plus the callbacks defined in this file.
      */
1392 struct request_sock_ops tcp_request_sock_ops = {
1393         .family         =       PF_INET,
1394         .obj_size       =       sizeof(struct tcp_request_sock),
1395         .rtx_syn_ack    =       tcp_v4_send_synack,     /* (re)send the SYN-ACK */
1396         .send_ack       =       tcp_v4_reqsk_send_ack,  /* ACK on behalf of a request */
1397         .destructor     =       tcp_v4_reqsk_destructor, /* frees saved IP options */
1398         .send_reset     =       tcp_v4_send_reset,      /* answer a segment with RST */
1399 };
1400
1401 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1402 {
             /* Handle an incoming SYN @skb on listening socket @sk.
              *
              * Allocates a request_sock, records the peer's TCP options, the
              * address pair and any IP options, chooses the initial sequence
              * number (a SYN cookie, the non-zero value already present in
              * TCP_SKB_CB(skb)->when, or a fresh one), sends the SYN-ACK and,
              * unless a cookie was used, queues the request on the SYN table.
              *
              * Every drop path counts TCP_MIB_ATTEMPTFAILS.  The function
              * returns 0 in all cases.
              */
1403         struct inet_request_sock *ireq;
1404         struct tcp_options_received tmp_opt;
1405         struct request_sock *req;
1406         __u32 saddr = skb->nh.iph->saddr;
1407         __u32 daddr = skb->nh.iph->daddr;
1408         __u32 isn = TCP_SKB_CB(skb)->when;
1409         struct dst_entry *dst = NULL;
1410 #ifdef CONFIG_SYN_COOKIES
1411         int want_cookie = 0;
1412 #else
1413 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1414 #endif
1415
1416         /* Never answer to SYNs send to broadcast or multicast */
1417         if (((struct rtable *)skb->dst)->rt_flags &
1418             (RTCF_BROADCAST | RTCF_MULTICAST))
1419                 goto drop;
1420
1421         /* TW buckets are converted to open requests without
1422          * limitations, they conserve resources and peer is
1423          * evidently real one.
1424          */
1425         if (tcp_synq_is_full(sk) && !isn) {
1426 #ifdef CONFIG_SYN_COOKIES
1427                 if (sysctl_tcp_syncookies) {
1428                         want_cookie = 1;
1429                 } else
1430 #endif
1431                 goto drop;
1432         }
1433
1434         /* Accept backlog is full. If we have already queued enough
1435          * of warm entries in syn queue, drop request. It is better than
1436          * clogging syn queue with openreqs with exponentially increasing
1437          * timeout.
1438          */
1439         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1440                 goto drop;
1441
1442         req = reqsk_alloc(&tcp_request_sock_ops);
1443         if (!req)
1444                 goto drop;
1445
1446         tcp_clear_options(&tmp_opt);
1447         tmp_opt.mss_clamp = 536;
1448         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1449
1450         tcp_parse_options(skb, &tmp_opt, 0);
1451
             /* With a cookie the parsed options are discarded again. */
1452         if (want_cookie) {
1453                 tcp_clear_options(&tmp_opt);
1454                 tmp_opt.saw_tstamp = 0;
1455         }
1456
1457         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1458                 /* Some OSes (unknown ones, but I see them on web server, which
1459                  * contains information interesting only for windows'
1460                  * users) do not send their stamp in SYN. It is easy case.
1461                  * We simply do not advertise TS support.
1462                  */
1463                 tmp_opt.saw_tstamp = 0;
1464                 tmp_opt.tstamp_ok  = 0;
1465         }
             /* Unconditional: tstamp_ok always ends up equal to saw_tstamp,
              * which also covers the store made in the branch above.
              */
1466         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1467
1468         tcp_openreq_init(req, &tmp_opt, skb);
1469
1470         ireq = inet_rsk(req);
1471         ireq->loc_addr = daddr;
1472         ireq->rmt_addr = saddr;
1473         ireq->opt = tcp_v4_save_options(sk, skb);
1474         if (!want_cookie)
1475                 TCP_ECN_create_request(req, skb->h.th);
1476
1477         if (want_cookie) {
1478 #ifdef CONFIG_SYN_COOKIES
1479                 syn_flood_warning(skb);
1480 #endif
1481                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1482         } else if (!isn) {
1483                 struct inet_peer *peer = NULL;
1484
1485                 /* VJ's idea. We save last timestamp seen
1486                  * from the destination in peer table, when entering
1487                  * state TIME-WAIT, and check against it before
1488                  * accepting new connection request.
1489                  *
1490                  * If "isn" is not zero, this request hit alive
1491                  * timewait bucket, so that all the necessary checks
1492                  * are made in the function processing timewait state.
1493                  */
1494                 if (tmp_opt.saw_tstamp &&
1495                     sysctl_tcp_tw_recycle &&
1496                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1497                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1498                     peer->v4daddr == saddr) {
1499                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1500                             (s32)(peer->tcp_ts - req->ts_recent) >
1501                                                         TCP_PAWS_WINDOW) {
1502                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1503                                 dst_release(dst);
1504                                 goto drop_and_free;
1505                         }
1506                 }
1507                 /* Kill the following clause, if you dislike this way. */
1508                 else if (!sysctl_tcp_syncookies &&
1509                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1510                           (sysctl_max_syn_backlog >> 2)) &&
1511                          (!peer || !peer->tcp_ts_stamp) &&
1512                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1513                         /* Without syncookies last quarter of
1514                          * backlog is filled with destinations,
1515                          * proven to be alive.
1516                          * It means that we continue to communicate
1517                          * to destinations, already remembered
1518                          * to the moment of synflood.
1519                          */
1520                         NETDEBUG(if (net_ratelimit()) \
1521                                         printk(KERN_DEBUG "TCP: drop open "
1522                                                           "request from %u.%u."
1523                                                           "%u.%u/%u\n", \
1524                                                NIPQUAD(saddr),
1525                                                ntohs(skb->h.th->source)));
1526                         dst_release(dst);
1527                         goto drop_and_free;
1528                 }
1529
1530                 isn = tcp_v4_init_sequence(sk, skb);
1531         }
1532         tcp_rsk(req)->snt_isn = isn;
1533
             /* tcp_v4_send_synack() passes dst to dst_release() itself on both
              * success and failure, so dst is not released again here.
              */
1534         if (tcp_v4_send_synack(sk, req, dst))
1535                 goto drop_and_free;
1536
             /* A cookie request is freed right after the SYN-ACK went out
              * instead of being queued.
              */
1537         if (want_cookie) {
1538                 reqsk_free(req);
1539         } else {
1540                 tcp_v4_synq_add(sk, req);
1541         }
1542         return 0;
1543
1544 drop_and_free:
1545         reqsk_free(req);
1546 drop:
1547         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1548         return 0;
1549 }
1550
1551
1552 /*
1553  * The three way handshake has completed - we got a valid synack -
1554  * now create the new socket.
      *
      * NOTE(review): on the listening side the completing segment is
      * presumably the peer's ACK of our SYN-ACK rather than a SYN-ACK
      * itself -- confirm the wording above.
      *
      * @sk:  listening socket
      * @skb: segment that completed the handshake
      * @req: pending request being promoted
      * @dst: route to use, or NULL to have one looked up with
      *       tcp_v4_route_req()
      *
      * Returns the new child socket, already hashed, or NULL.  On failure
      * LINUX_MIB_LISTENDROPS is counted (plus LINUX_MIB_LISTENOVERFLOWS
      * when the accept queue is full) and dst is passed to dst_release().
1555  */
1556 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1557                                   struct request_sock *req,
1558                                   struct dst_entry *dst)
1559 {
1560         struct inet_request_sock *ireq;
1561         struct inet_sock *newinet;
1562         struct tcp_sock *newtp;
1563         struct sock *newsk;
1564
1565         if (sk_acceptq_is_full(sk))
1566                 goto exit_overflow;
1567
1568         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1569                 goto exit;
1570
1571         newsk = tcp_create_openreq_child(sk, req, skb);
1572         if (!newsk)
1573                 goto exit;
1574
             /* The child keeps dst as its cached route from here on. */
1575         newsk->sk_dst_cache = dst;
1576         tcp_v4_setup_caps(newsk, dst);
1577
1578         newtp                 = tcp_sk(newsk);
1579         newinet               = inet_sk(newsk);
1580         ireq                  = inet_rsk(req);
1581         newinet->daddr        = ireq->rmt_addr;
1582         newinet->rcv_saddr    = ireq->loc_addr;
1583         newinet->saddr        = ireq->loc_addr;
             /* Move the saved IP options to the child; clearing ireq->opt
              * keeps the request destructor from freeing them.
              */
1584         newinet->opt          = ireq->opt;
1585         ireq->opt             = NULL;
1586         newinet->mc_index     = tcp_v4_iif(skb);
1587         newinet->mc_ttl       = skb->nh.iph->ttl;
1588         newtp->ext_header_len = 0;
1589         if (newinet->opt)
1590                 newtp->ext_header_len = newinet->opt->optlen;
1591         newinet->id = newtp->write_seq ^ jiffies;
1592
1593         tcp_sync_mss(newsk, dst_mtu(dst));
1594         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1595         tcp_initialize_rcv_mss(newsk);
1596
1597         __tcp_v4_hash(newsk, 0);
1598         __tcp_inherit_port(sk, newsk);
1599
1600         return newsk;
1601
1602 exit_overflow:
1603         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1604 exit:
1605         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1606         dst_release(dst);
1607         return NULL;
1608 }
1609
1610 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1611 {
1612         struct tcphdr *th = skb->h.th;
1613         struct iphdr *iph = skb->nh.iph;
1614         struct tcp_sock *tp = tcp_sk(sk);
1615         struct sock *nsk;
1616         struct request_sock **prev;
1617         /* Find possible connection requests. */
1618         struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1619                                                      iph->saddr, iph->daddr);
1620         if (req)
1621                 return tcp_check_req(sk, skb, req, prev);
1622
1623         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1624                                           th->source,
1625                                           skb->nh.iph->daddr,
1626                                           ntohs(th->dest),
1627                                           tcp_v4_iif(skb));
1628
1629         if (nsk) {
1630                 if (nsk->sk_state != TCP_TIME_WAIT) {
1631                         bh_lock_sock(nsk);
1632                         return nsk;
1633                 }
1634                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1635                 return NULL;
1636         }
1637
1638 #ifdef CONFIG_SYN_COOKIES
1639         if (!th->rst && !th->syn && th->ack)
1640                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1641 #endif
1642         return sk;
1643 }
1644
1645 static int tcp_v4_checksum_init(struct sk_buff *skb)
1646 {
1647         if (skb->ip_summed == CHECKSUM_HW) {
1648                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1649                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1650                                   skb->nh.iph->daddr, skb->csum))
1651                         return 0;
1652
1653                 NETDEBUG(if (net_ratelimit())
1654                                 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1655                 skb->ip_summed = CHECKSUM_NONE;
1656         }
1657         if (skb->len <= 76) {
1658                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1659                                  skb->nh.iph->daddr,
1660                                  skb_checksum(skb, 0, skb->len, 0)))
1661                         return -1;
1662                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1663         } else {
1664                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1665                                           skb->nh.iph->saddr,
1666                                           skb->nh.iph->daddr, 0);
1667         }
1668         return 0;
1669 }
1670
1671
1672 /* The socket must have it's spinlock held when we get
1673  * here.
1674  *
1675  * We have a potential double-lock case here, so even when
1676  * doing backlog processing we use the BH locking scheme.
1677  * This is because we cannot sleep with the original spinlock
1678  * held.
1679  */
1680 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1681 {
1682         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1683                 TCP_CHECK_TIMER(sk);
1684                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1685                         goto reset;
1686                 TCP_CHECK_TIMER(sk);
1687                 return 0;
1688         }
1689
1690         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1691                 goto csum_err;
1692
1693         if (sk->sk_state == TCP_LISTEN) {
1694                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1695                 if (!nsk)
1696                         goto discard;
1697
1698                 if (nsk != sk) {
1699                         if (tcp_child_process(sk, nsk, skb))
1700                                 goto reset;
1701                         return 0;
1702                 }
1703         }
1704
1705         TCP_CHECK_TIMER(sk);
1706         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1707                 goto reset;
1708         TCP_CHECK_TIMER(sk);
1709         return 0;
1710
1711 reset:
1712         tcp_v4_send_reset(skb);
1713 discard:
1714         kfree_skb(skb);
1715         /* Be careful here. If this function gets more complicated and
1716          * gcc suffers from register pressure on the x86, sk (in %ebx)
1717          * might be destroyed here. This current version compiles correctly,
1718          * but you have been warned.
1719          */
1720         return 0;
1721
1722 csum_err:
1723         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1724         goto discard;
1725 }
1726
1727 /*
1728  *      From tcp_input.c
1729  */
1730
1731 int tcp_v4_rcv(struct sk_buff *skb)
1732 {
1733         struct tcphdr *th;
1734         struct sock *sk;
1735         int ret;
1736
1737         if (skb->pkt_type != PACKET_HOST)
1738                 goto discard_it;
1739
1740         /* Count it even if it's bad */
1741         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1742
1743         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1744                 goto discard_it;
1745
1746         th = skb->h.th;
1747
1748         if (th->doff < sizeof(struct tcphdr) / 4)
1749                 goto bad_packet;
1750         if (!pskb_may_pull(skb, th->doff * 4))
1751                 goto discard_it;
1752
1753         /* An explanation is required here, I think.
1754          * Packet length and doff are validated by header prediction,
1755          * provided case of th->doff==0 is elimineted.
1756          * So, we defer the checks. */
1757         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1758              tcp_v4_checksum_init(skb) < 0))
1759                 goto bad_packet;
1760
1761         th = skb->h.th;
1762         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1763         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1764                                     skb->len - th->doff * 4);
1765         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1766         TCP_SKB_CB(skb)->when    = 0;
1767         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1768         TCP_SKB_CB(skb)->sacked  = 0;
1769
1770         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1771                              skb->nh.iph->daddr, ntohs(th->dest),
1772                              tcp_v4_iif(skb));
1773
1774         if (!sk)
1775                 goto no_tcp_socket;
1776
1777 process:
1778         if (sk->sk_state == TCP_TIME_WAIT)
1779                 goto do_time_wait;
1780
1781         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1782                 goto discard_and_relse;
1783
1784         if (sk_filter(sk, skb, 0))
1785                 goto discard_and_relse;
1786
1787         skb->dev = NULL;
1788
1789         bh_lock_sock(sk);
1790         ret = 0;
1791         if (!sock_owned_by_user(sk)) {
1792                 if (!tcp_prequeue(sk, skb))
1793                         ret = tcp_v4_do_rcv(sk, skb);
1794         } else
1795                 sk_add_backlog(sk, skb);
1796         bh_unlock_sock(sk);
1797
1798         sock_put(sk);
1799
1800         return ret;
1801
1802 no_tcp_socket:
1803         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1804                 goto discard_it;
1805
1806         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1807 bad_packet:
1808                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1809         } else {
1810                 tcp_v4_send_reset(skb);
1811         }
1812
1813 discard_it:
1814         /* Discard frame. */
1815         kfree_skb(skb);
1816         return 0;
1817
1818 discard_and_relse:
1819         sock_put(sk);
1820         goto discard_it;
1821
1822 do_time_wait:
1823         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1824                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1825                 goto discard_it;
1826         }
1827
1828         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1829                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1830                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1831                 goto discard_it;
1832         }
1833         switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1834                                            skb, th, skb->len)) {
1835         case TCP_TW_SYN: {
1836                 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1837                                                           ntohs(th->dest),
1838                                                           tcp_v4_iif(skb));
1839                 if (sk2) {
1840                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1841                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1842                         sk = sk2;
1843                         goto process;
1844                 }
1845                 /* Fall through to ACK */
1846         }
1847         case TCP_TW_ACK:
1848                 tcp_v4_timewait_ack(sk, skb);
1849                 break;
1850         case TCP_TW_RST:
1851                 goto no_tcp_socket;
1852         case TCP_TW_SUCCESS:;
1853         }
1854         goto discard_it;
1855 }
1856
1857 /* With per-bucket locks this operation is not-atomic, so that
1858  * this version is not worse.
1859  */
1860 static void __tcp_v4_rehash(struct sock *sk)
1861 {
1862         sk->sk_prot->unhash(sk);
1863         sk->sk_prot->hash(sk);
1864 }
1865
1866 static int tcp_v4_reselect_saddr(struct sock *sk)
1867 {
1868         struct inet_sock *inet = inet_sk(sk);
1869         int err;
1870         struct rtable *rt;
1871         __u32 old_saddr = inet->saddr;
1872         __u32 new_saddr;
1873         __u32 daddr = inet->daddr;
1874
1875         if (inet->opt && inet->opt->srr)
1876                 daddr = inet->opt->faddr;
1877
1878         /* Query new route. */
1879         err = ip_route_connect(&rt, daddr, 0,
1880                                RT_CONN_FLAGS(sk),
1881                                sk->sk_bound_dev_if,
1882                                IPPROTO_TCP,
1883                                inet->sport, inet->dport, sk);
1884         if (err)
1885                 return err;
1886
1887         __sk_dst_set(sk, &rt->u.dst);
1888         tcp_v4_setup_caps(sk, &rt->u.dst);
1889
1890         new_saddr = rt->rt_src;
1891
1892         if (new_saddr == old_saddr)
1893                 return 0;
1894
1895         if (sysctl_ip_dynaddr > 1) {
1896                 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1897                                  "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1898                        NIPQUAD(old_saddr),
1899                        NIPQUAD(new_saddr));
1900         }
1901
1902         inet->saddr = new_saddr;
1903         inet->rcv_saddr = new_saddr;
1904
1905         /* XXX The only one ugly spot where we need to
1906          * XXX really change the sockets identity after
1907          * XXX it has entered the hashes. -DaveM
1908          *
1909          * Besides that, it does not check for connection
1910          * uniqueness. Wait for troubles.
1911          */
1912         __tcp_v4_rehash(sk);
1913         return 0;
1914 }
1915
1916 int tcp_v4_rebuild_header(struct sock *sk)
1917 {
1918         struct inet_sock *inet = inet_sk(sk);
1919         struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1920         u32 daddr;
1921         int err;
1922
1923         /* Route is OK, nothing to do. */
1924         if (rt)
1925                 return 0;
1926
1927         /* Reroute. */
1928         daddr = inet->daddr;
1929         if (inet->opt && inet->opt->srr)
1930                 daddr = inet->opt->faddr;
1931
1932         {
1933                 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1934                                     .nl_u = { .ip4_u =
1935                                               { .daddr = daddr,
1936                                                 .saddr = inet->saddr,
1937                                                 .tos = RT_CONN_FLAGS(sk) } },
1938                                     .proto = IPPROTO_TCP,
1939                                     .uli_u = { .ports =
1940                                                { .sport = inet->sport,
1941                                                  .dport = inet->dport } } };
1942
1943                 err = ip_route_output_flow(&rt, &fl, sk, 0);
1944         }
1945         if (!err) {
1946                 __sk_dst_set(sk, &rt->u.dst);
1947                 tcp_v4_setup_caps(sk, &rt->u.dst);
1948                 return 0;
1949         }
1950
1951         /* Routing failed... */
1952         sk->sk_route_caps = 0;
1953
1954         if (!sysctl_ip_dynaddr ||
1955             sk->sk_state != TCP_SYN_SENT ||
1956             (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1957             (err = tcp_v4_reselect_saddr(sk)) != 0)
1958                 sk->sk_err_soft = -err;
1959
1960         return err;
1961 }
1962
1963 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1964 {
1965         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1966         struct inet_sock *inet = inet_sk(sk);
1967
1968         sin->sin_family         = AF_INET;
1969         sin->sin_addr.s_addr    = inet->daddr;
1970         sin->sin_port           = inet->dport;
1971 }
1972
1973 /* VJ's idea. Save last timestamp seen from this destination
1974  * and hold it at least for normal timewait interval to use for duplicate
1975  * segment detection in subsequent connections, before they enter synchronized
1976  * state.
1977  */
1978
1979 int tcp_v4_remember_stamp(struct sock *sk)
1980 {
1981         struct inet_sock *inet = inet_sk(sk);
1982         struct tcp_sock *tp = tcp_sk(sk);
1983         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1984         struct inet_peer *peer = NULL;
1985         int release_it = 0;
1986
1987         if (!rt || rt->rt_dst != inet->daddr) {
1988                 peer = inet_getpeer(inet->daddr, 1);
1989                 release_it = 1;
1990         } else {
1991                 if (!rt->peer)
1992                         rt_bind_peer(rt, 1);
1993                 peer = rt->peer;
1994         }
1995
1996         if (peer) {
1997                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1998                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1999                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
2000                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
2001                         peer->tcp_ts = tp->rx_opt.ts_recent;
2002                 }
2003                 if (release_it)
2004                         inet_putpeer(peer);
2005                 return 1;
2006         }
2007
2008         return 0;
2009 }
2010
2011 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2012 {
2013         struct inet_peer *peer = NULL;
2014
2015         peer = inet_getpeer(tw->tw_daddr, 1);
2016
2017         if (peer) {
2018                 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
2019                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2020                      peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2021                         peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2022                         peer->tcp_ts = tw->tw_ts_recent;
2023                 }
2024                 inet_putpeer(peer);
2025                 return 1;
2026         }
2027
2028         return 0;
2029 }
2030
2031 struct tcp_func ipv4_specific = {
2032         .queue_xmit     =       ip_queue_xmit,
2033         .send_check     =       tcp_v4_send_check,
2034         .rebuild_header =       tcp_v4_rebuild_header,
2035         .conn_request   =       tcp_v4_conn_request,
2036         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
2037         .remember_stamp =       tcp_v4_remember_stamp,
2038         .net_header_len =       sizeof(struct iphdr),
2039         .setsockopt     =       ip_setsockopt,
2040         .getsockopt     =       ip_getsockopt,
2041         .addr2sockaddr  =       v4_addr2sockaddr,
2042         .sockaddr_len   =       sizeof(struct sockaddr_in),
2043 };
2044
2045 /* NOTE: A lot of things set to zero explicitly by call to
2046  *       sk_alloc() so need not be done here.
2047  */
2048 static int tcp_v4_init_sock(struct sock *sk)
2049 {
2050         struct tcp_sock *tp = tcp_sk(sk);
2051
2052         skb_queue_head_init(&tp->out_of_order_queue);
2053         tcp_init_xmit_timers(sk);
2054         tcp_prequeue_init(tp);
2055
2056         tp->rto  = TCP_TIMEOUT_INIT;
2057         tp->mdev = TCP_TIMEOUT_INIT;
2058
2059         /* So many TCP implementations out there (incorrectly) count the
2060          * initial SYN frame in their delayed-ACK and congestion control
2061          * algorithms that we must have the following bandaid to talk
2062          * efficiently to them.  -DaveM
2063          */
2064         tp->snd_cwnd = 2;
2065
2066         /* See draft-stevens-tcpca-spec-01 for discussion of the
2067          * initialization of these values.
2068          */
2069         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
2070         tp->snd_cwnd_clamp = ~0;
2071         tp->mss_cache_std = tp->mss_cache = 536;
2072
2073         tp->reordering = sysctl_tcp_reordering;
2074
2075         sk->sk_state = TCP_CLOSE;
2076
2077         sk->sk_write_space = sk_stream_write_space;
2078         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2079
2080         tp->af_specific = &ipv4_specific;
2081
2082         sk->sk_sndbuf = sysctl_tcp_wmem[1];
2083         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2084
2085         atomic_inc(&tcp_sockets_allocated);
2086
2087         return 0;
2088 }
2089
2090 int tcp_v4_destroy_sock(struct sock *sk)
2091 {
2092         struct tcp_sock *tp = tcp_sk(sk);
2093
2094         tcp_clear_xmit_timers(sk);
2095
2096         /* Cleanup up the write buffer. */
2097         sk_stream_writequeue_purge(sk);
2098
2099         /* Cleans up our, hopefully empty, out_of_order_queue. */
2100         __skb_queue_purge(&tp->out_of_order_queue);
2101
2102         /* Clean prequeue, it must be empty really */
2103         __skb_queue_purge(&tp->ucopy.prequeue);
2104
2105         /* Clean up a referenced TCP bind bucket. */
2106         if (tp->bind_hash)
2107                 tcp_put_port(sk);
2108
2109         /*
2110          * If sendmsg cached page exists, toss it.
2111          */
2112         if (sk->sk_sndmsg_page) {
2113                 __free_page(sk->sk_sndmsg_page);
2114                 sk->sk_sndmsg_page = NULL;
2115         }
2116
2117         atomic_dec(&tcp_sockets_allocated);
2118
2119         return 0;
2120 }
2121
2122 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2123
2124 #ifdef CONFIG_PROC_FS
2125 /* Proc filesystem TCP sock list dumping. */
2126
2127 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2128 {
2129         return hlist_empty(head) ? NULL :
2130                 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2131 }
2132
2133 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2134 {
2135         return tw->tw_node.next ?
2136                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2137 }
2138
2139 static void *listening_get_next(struct seq_file *seq, void *cur)
2140 {
2141         struct tcp_sock *tp;
2142         struct hlist_node *node;
2143         struct sock *sk = cur;
2144         struct tcp_iter_state* st = seq->private;
2145
2146         if (!sk) {
2147                 st->bucket = 0;
2148                 sk = sk_head(&tcp_listening_hash[0]);
2149                 goto get_sk;
2150         }
2151
2152         ++st->num;
2153
2154         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2155                 struct request_sock *req = cur;
2156
2157                 tp = tcp_sk(st->syn_wait_sk);
2158                 req = req->dl_next;
2159                 while (1) {
2160                         while (req) {
2161                                 if (req->rsk_ops->family == st->family) {
2162                                         cur = req;
2163                                         goto out;
2164                                 }
2165                                 req = req->dl_next;
2166                         }
2167                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
2168                                 break;
2169 get_req:
2170                         req = tp->listen_opt->syn_table[st->sbucket];
2171                 }
2172                 sk        = sk_next(st->syn_wait_sk);
2173                 st->state = TCP_SEQ_STATE_LISTENING;
2174                 read_unlock_bh(&tp->syn_wait_lock);
2175         } else {
2176                 tp = tcp_sk(sk);
2177                 read_lock_bh(&tp->syn_wait_lock);
2178                 if (tp->listen_opt && tp->listen_opt->qlen)
2179                         goto start_req;
2180                 read_unlock_bh(&tp->syn_wait_lock);
2181                 sk = sk_next(sk);
2182         }
2183 get_sk:
2184         sk_for_each_from(sk, node) {
2185                 if (sk->sk_family == st->family) {
2186                         cur = sk;
2187                         goto out;
2188                 }
2189                 tp = tcp_sk(sk);
2190                 read_lock_bh(&tp->syn_wait_lock);
2191                 if (tp->listen_opt && tp->listen_opt->qlen) {
2192 start_req:
2193                         st->uid         = sock_i_uid(sk);
2194                         st->syn_wait_sk = sk;
2195                         st->state       = TCP_SEQ_STATE_OPENREQ;
2196                         st->sbucket     = 0;
2197                         goto get_req;
2198                 }
2199                 read_unlock_bh(&tp->syn_wait_lock);
2200         }
2201         if (++st->bucket < TCP_LHTABLE_SIZE) {
2202                 sk = sk_head(&tcp_listening_hash[st->bucket]);
2203                 goto get_sk;
2204         }
2205         cur = NULL;
2206 out:
2207         return cur;
2208 }
2209
2210 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2211 {
2212         void *rc = listening_get_next(seq, NULL);
2213
2214         while (rc && *pos) {
2215                 rc = listening_get_next(seq, rc);
2216                 --*pos;
2217         }
2218         return rc;
2219 }
2220
2221 static void *established_get_first(struct seq_file *seq)
2222 {
2223         struct tcp_iter_state* st = seq->private;
2224         void *rc = NULL;
2225
2226         for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2227                 struct sock *sk;
2228                 struct hlist_node *node;
2229                 struct tcp_tw_bucket *tw;
2230
2231                 /* We can reschedule _before_ having picked the target: */
2232                 cond_resched_softirq();
2233
2234                 read_lock(&tcp_ehash[st->bucket].lock);
2235                 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2236                         if (sk->sk_family != st->family) {
2237                                 continue;
2238                         }
2239                         rc = sk;
2240                         goto out;
2241                 }
2242                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2243                 tw_for_each(tw, node,
2244                             &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2245                         if (tw->tw_family != st->family) {
2246                                 continue;
2247                         }
2248                         rc = tw;
2249                         goto out;
2250                 }
2251                 read_unlock(&tcp_ehash[st->bucket].lock);
2252                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2253         }
2254 out:
2255         return rc;
2256 }
2257
2258 static void *established_get_next(struct seq_file *seq, void *cur)
2259 {
2260         struct sock *sk = cur;
2261         struct tcp_tw_bucket *tw;
2262         struct hlist_node *node;
2263         struct tcp_iter_state* st = seq->private;
2264
2265         ++st->num;
2266
2267         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2268                 tw = cur;
2269                 tw = tw_next(tw);
2270 get_tw:
2271                 while (tw && tw->tw_family != st->family) {
2272                         tw = tw_next(tw);
2273                 }
2274                 if (tw) {
2275                         cur = tw;
2276                         goto out;
2277                 }
2278                 read_unlock(&tcp_ehash[st->bucket].lock);
2279                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2280
2281                 /* We can reschedule between buckets: */
2282                 cond_resched_softirq();
2283
2284                 if (++st->bucket < tcp_ehash_size) {
2285                         read_lock(&tcp_ehash[st->bucket].lock);
2286                         sk = sk_head(&tcp_ehash[st->bucket].chain);
2287                 } else {
2288                         cur = NULL;
2289                         goto out;
2290                 }
2291         } else
2292                 sk = sk_next(sk);
2293
2294         sk_for_each_from(sk, node) {
2295                 if (sk->sk_family == st->family)
2296                         goto found;
2297         }
2298
2299         st->state = TCP_SEQ_STATE_TIME_WAIT;
2300         tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2301         goto get_tw;
2302 found:
2303         cur = sk;
2304 out:
2305         return cur;
2306 }
2307
2308 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2309 {
2310         void *rc = established_get_first(seq);
2311
2312         while (rc && pos) {
2313                 rc = established_get_next(seq, rc);
2314                 --pos;
2315         }
2316         return rc;
2317 }
2318
2319 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2320 {
2321         void *rc;
2322         struct tcp_iter_state* st = seq->private;
2323
2324         tcp_listen_lock();
2325         st->state = TCP_SEQ_STATE_LISTENING;
2326         rc        = listening_get_idx(seq, &pos);
2327
2328         if (!rc) {
2329                 tcp_listen_unlock();
2330                 local_bh_disable();
2331                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2332                 rc        = established_get_idx(seq, pos);
2333         }
2334
2335         return rc;
2336 }
2337
2338 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2339 {
2340         struct tcp_iter_state* st = seq->private;
2341         st->state = TCP_SEQ_STATE_LISTENING;
2342         st->num = 0;
2343         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2344 }
2345
2346 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2347 {
2348         void *rc = NULL;
2349         struct tcp_iter_state* st;
2350
2351         if (v == SEQ_START_TOKEN) {
2352                 rc = tcp_get_idx(seq, 0);
2353                 goto out;
2354         }
2355         st = seq->private;
2356
2357         switch (st->state) {
2358         case TCP_SEQ_STATE_OPENREQ:
2359         case TCP_SEQ_STATE_LISTENING:
2360                 rc = listening_get_next(seq, v);
2361                 if (!rc) {
2362                         tcp_listen_unlock();
2363                         local_bh_disable();
2364                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2365                         rc        = established_get_first(seq);
2366                 }
2367                 break;
2368         case TCP_SEQ_STATE_ESTABLISHED:
2369         case TCP_SEQ_STATE_TIME_WAIT:
2370                 rc = established_get_next(seq, v);
2371                 break;
2372         }
2373 out:
2374         ++*pos;
2375         return rc;
2376 }
2377
2378 static void tcp_seq_stop(struct seq_file *seq, void *v)
2379 {
2380         struct tcp_iter_state* st = seq->private;
2381
2382         switch (st->state) {
2383         case TCP_SEQ_STATE_OPENREQ:
2384                 if (v) {
2385                         struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2386                         read_unlock_bh(&tp->syn_wait_lock);
2387                 }
2388         case TCP_SEQ_STATE_LISTENING:
2389                 if (v != SEQ_START_TOKEN)
2390                         tcp_listen_unlock();
2391                 break;
2392         case TCP_SEQ_STATE_TIME_WAIT:
2393         case TCP_SEQ_STATE_ESTABLISHED:
2394                 if (v)
2395                         read_unlock(&tcp_ehash[st->bucket].lock);
2396                 local_bh_enable();
2397                 break;
2398         }
2399 }
2400
2401 static int tcp_seq_open(struct inode *inode, struct file *file)
2402 {
2403         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2404         struct seq_file *seq;
2405         struct tcp_iter_state *s;
2406         int rc;
2407
2408         if (unlikely(afinfo == NULL))
2409                 return -EINVAL;
2410
2411         s = kmalloc(sizeof(*s), GFP_KERNEL);
2412         if (!s)
2413                 return -ENOMEM;
2414         memset(s, 0, sizeof(*s));
2415         s->family               = afinfo->family;
2416         s->seq_ops.start        = tcp_seq_start;
2417         s->seq_ops.next         = tcp_seq_next;
2418         s->seq_ops.show         = afinfo->seq_show;
2419         s->seq_ops.stop         = tcp_seq_stop;
2420
2421         rc = seq_open(file, &s->seq_ops);
2422         if (rc)
2423                 goto out_kfree;
2424         seq          = file->private_data;
2425         seq->private = s;
2426 out:
2427         return rc;
2428 out_kfree:
2429         kfree(s);
2430         goto out;
2431 }
2432
2433 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2434 {
2435         int rc = 0;
2436         struct proc_dir_entry *p;
2437
2438         if (!afinfo)
2439                 return -EINVAL;
2440         afinfo->seq_fops->owner         = afinfo->owner;
2441         afinfo->seq_fops->open          = tcp_seq_open;
2442         afinfo->seq_fops->read          = seq_read;
2443         afinfo->seq_fops->llseek        = seq_lseek;
2444         afinfo->seq_fops->release       = seq_release_private;
2445
2446         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2447         if (p)
2448                 p->data = afinfo;
2449         else
2450                 rc = -ENOMEM;
2451         return rc;
2452 }
2453
2454 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2455 {
2456         if (!afinfo)
2457                 return;
2458         proc_net_remove(afinfo->name);
2459         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2460 }
2461
2462 static void get_openreq4(struct sock *sk, struct request_sock *req,
2463                          char *tmpbuf, int i, int uid)
2464 {
2465         const struct inet_request_sock *ireq = inet_rsk(req);
2466         int ttd = req->expires - jiffies;
2467
2468         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2469                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2470                 i,
2471                 ireq->loc_addr,
2472                 ntohs(inet_sk(sk)->sport),
2473                 ireq->rmt_addr,
2474                 ntohs(ireq->rmt_port),
2475                 TCP_SYN_RECV,
2476                 0, 0, /* could print option size, but that is af dependent. */
2477                 1,    /* timers active (only the expire timer) */
2478                 jiffies_to_clock_t(ttd),
2479                 req->retrans,
2480                 uid,
2481                 0,  /* non standard timer */
2482                 0, /* open_requests have no inode */
2483                 atomic_read(&sk->sk_refcnt),
2484                 req);
2485 }
2486
2487 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2488 {
2489         int timer_active;
2490         unsigned long timer_expires;
2491         struct tcp_sock *tp = tcp_sk(sp);
2492         struct inet_sock *inet = inet_sk(sp);
2493         unsigned int dest = inet->daddr;
2494         unsigned int src = inet->rcv_saddr;
2495         __u16 destp = ntohs(inet->dport);
2496         __u16 srcp = ntohs(inet->sport);
2497
2498         if (tp->pending == TCP_TIME_RETRANS) {
2499                 timer_active    = 1;
2500                 timer_expires   = tp->timeout;
2501         } else if (tp->pending == TCP_TIME_PROBE0) {
2502                 timer_active    = 4;
2503                 timer_expires   = tp->timeout;
2504         } else if (timer_pending(&sp->sk_timer)) {
2505                 timer_active    = 2;
2506                 timer_expires   = sp->sk_timer.expires;
2507         } else {
2508                 timer_active    = 0;
2509                 timer_expires = jiffies;
2510         }
2511
2512         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2513                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2514                 i, src, srcp, dest, destp, sp->sk_state,
2515                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2516                 timer_active,
2517                 jiffies_to_clock_t(timer_expires - jiffies),
2518                 tp->retransmits,
2519                 sock_i_uid(sp),
2520                 tp->probes_out,
2521                 sock_i_ino(sp),
2522                 atomic_read(&sp->sk_refcnt), sp,
2523                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2524                 tp->snd_cwnd,
2525                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2526 }
2527
2528 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2529 {
2530         unsigned int dest, src;
2531         __u16 destp, srcp;
2532         int ttd = tw->tw_ttd - jiffies;
2533
2534         if (ttd < 0)
2535                 ttd = 0;
2536
2537         dest  = tw->tw_daddr;
2538         src   = tw->tw_rcv_saddr;
2539         destp = ntohs(tw->tw_dport);
2540         srcp  = ntohs(tw->tw_sport);
2541
2542         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2543                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2544                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2545                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2546                 atomic_read(&tw->tw_refcnt), tw);
2547 }
2548
2549 #define TMPSZ 150
2550
2551 static int tcp4_seq_show(struct seq_file *seq, void *v)
2552 {
2553         struct tcp_iter_state* st;
2554         char tmpbuf[TMPSZ + 1];
2555
2556         if (v == SEQ_START_TOKEN) {
2557                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2558                            "  sl  local_address rem_address   st tx_queue "
2559                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2560                            "inode");
2561                 goto out;
2562         }
2563         st = seq->private;
2564
2565         switch (st->state) {
2566         case TCP_SEQ_STATE_LISTENING:
2567         case TCP_SEQ_STATE_ESTABLISHED:
2568                 get_tcp4_sock(v, tmpbuf, st->num);
2569                 break;
2570         case TCP_SEQ_STATE_OPENREQ:
2571                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2572                 break;
2573         case TCP_SEQ_STATE_TIME_WAIT:
2574                 get_timewait4_sock(v, tmpbuf, st->num);
2575                 break;
2576         }
2577         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2578 out:
2579         return 0;
2580 }
2581
2582 static struct file_operations tcp4_seq_fops;
2583 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2584         .owner          = THIS_MODULE,
2585         .name           = "tcp",
2586         .family         = AF_INET,
2587         .seq_show       = tcp4_seq_show,
2588         .seq_fops       = &tcp4_seq_fops,
2589 };
2590
2591 int __init tcp4_proc_init(void)
2592 {
2593         return tcp_proc_register(&tcp4_seq_afinfo);
2594 }
2595
2596 void tcp4_proc_exit(void)
2597 {
2598         tcp_proc_unregister(&tcp4_seq_afinfo);
2599 }
2600 #endif /* CONFIG_PROC_FS */
2601
2602 struct proto tcp_prot = {
2603         .name                   = "TCP",
2604         .owner                  = THIS_MODULE,
2605         .close                  = tcp_close,
2606         .connect                = tcp_v4_connect,
2607         .disconnect             = tcp_disconnect,
2608         .accept                 = tcp_accept,
2609         .ioctl                  = tcp_ioctl,
2610         .init                   = tcp_v4_init_sock,
2611         .destroy                = tcp_v4_destroy_sock,
2612         .shutdown               = tcp_shutdown,
2613         .setsockopt             = tcp_setsockopt,
2614         .getsockopt             = tcp_getsockopt,
2615         .sendmsg                = tcp_sendmsg,
2616         .recvmsg                = tcp_recvmsg,
2617         .backlog_rcv            = tcp_v4_do_rcv,
2618         .hash                   = tcp_v4_hash,
2619         .unhash                 = tcp_unhash,
2620         .get_port               = tcp_v4_get_port,
2621         .enter_memory_pressure  = tcp_enter_memory_pressure,
2622         .sockets_allocated      = &tcp_sockets_allocated,
2623         .memory_allocated       = &tcp_memory_allocated,
2624         .memory_pressure        = &tcp_memory_pressure,
2625         .sysctl_mem             = sysctl_tcp_mem,
2626         .sysctl_wmem            = sysctl_tcp_wmem,
2627         .sysctl_rmem            = sysctl_tcp_rmem,
2628         .max_header             = MAX_TCP_HEADER,
2629         .obj_size               = sizeof(struct tcp_sock),
2630         .rsk_prot               = &tcp_request_sock_ops,
2631 };
2632
2633
2634
2635 void __init tcp_v4_init(struct net_proto_family *ops)
2636 {
2637         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2638         if (err < 0)
2639                 panic("Failed to create the TCP control socket.\n");
2640         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2641         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2642
2643         /* Unhash it so that IP input processing does not even
2644          * see it, we do not wish this socket to see incoming
2645          * packets.
2646          */
2647         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2648 }
2649
/* Symbols from this file that are made available to other modules. */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_bind_hash);
EXPORT_SYMBOL(tcp_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_inherit_port);
EXPORT_SYMBOL(tcp_listen_wlock);
EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* The proc helpers only exist when procfs support is compiled in. */
#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif

/* Tunables */
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_max_syn_backlog);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2676