net/ipv4/tcp.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed where wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It
  33  *                                      wakes people on errors. poll
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_send_reset() fixed to work for
  37  *                                      everything not just packets for
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames.
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst
  46  *                                      receive otherwise odd bits of prattle
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug.
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries.
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks,
  69  *                                      so the kernel can layer network
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties remove from the
 100  *                                      TCP code (Be very nice to this man if
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics.
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle poll() after URG properly in
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in
 110  *                                      tcp_readable(), poll() after URG
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in polling before an
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on
 137  *                                      the RFC's for other useful protocol
 138  *                                      references see: Comer, KA9Q NOS, and
 139  *                                      for a reference on the difference
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC
 147  *                                      and using multiple timers for sanity.
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       poll()->select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if state is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but it's a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications.
 178  *              Alan Cox        :       rcv_saddr errors.
 179  *              Alan Cox        :       Block double connect().
 180  *              Alan Cox        :       Small hooks for enSKIP.
 181  *              Alexey Kuznetsov:       Path MTU discovery.
 182  *              Alan Cox        :       Support soft errors.
 183  *              Alan Cox        :       Fix MTU discovery pathological case
 184  *                                      when the remote claims no mtu!
 185  *              Marc Tamsky     :       TCP_CLOSE fix.
 186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 187  *                                      window but wrong (fixes NT lpd problems)
 188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
 189  *              Joerg Reuter    :       No modification of locked buffers in
 190  *                                      tcp_do_retransmit()
 191  *              Eric Schenk     :       Changed receiver side silly window
 192  *                                      avoidance algorithm to BSD style
 193  *                                      algorithm. This doubles throughput
 194  *                                      against machines running Solaris,
 195  *                                      and seems to result in general
 196  *                                      improvement.
 197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 198  *      Willy Konynenberg       :       Transparent proxying support.
 199  *      Mike McLagan            :       Routing by source
 200  *              Keith Owens     :       Do proper merging with partial SKB's in
 201  *                                      tcp_do_sendmsg to avoid burstiness.
 202  *              Eric Schenk     :       Fix fast close down bug with
 203  *                                      shutdown() followed by close().
 204  *              Andi Kleen      :       Make poll agree with SIGIO
 205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
 206  *                                      lingertime == 0 (RFC 793 ABORT Call)
 207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
 208  *                                      csum_and_copy_from_user() if possible.
 209  *
 210  *              This program is free software; you can redistribute it and/or
 211  *              modify it under the terms of the GNU General Public License
 212  *              as published by the Free Software Foundation; either version
 213  *              2 of the License, or(at your option) any later version.
 214  *
 215  * Description of States:
 216  *
 217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 218  *
 219  *      TCP_SYN_RECV            received a connection request, sent ack,
 220  *                              waiting for final ack in three-way handshake.
 221  *
 222  *      TCP_ESTABLISHED         connection established
 223  *
 224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 225  *                              transmission of remaining buffered data
 226  *
 227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 228  *                              to shutdown
 229  *
 230  *      TCP_CLOSING             both sides have shutdown but we still have
 231  *                              data we have to finish sending
 232  *
 233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 234  *                              closed, can only be entered from FIN_WAIT2
 235  *                              or CLOSING.  Required because the other end
 236  *                              may not have gotten our last ACK causing it
 237  *                              to retransmit the data packet (which we ignore)
 238  *
 239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 240  *                              us to finish writing our data and to shutdown
 241  *                              (we have to close() to move on to LAST_ACK)
 242  *
 243  *      TCP_LAST_ACK            out side has shutdown after remote has
 244  *                              shutdown.  There may still be data in our
 245  *                              buffer that we have to finish sending
 246  *
 247  *      TCP_CLOSE               socket is finished
 248  */
 249
 250 #include <linux/config.h>
 251 #include <linux/module.h>
 252 #include <linux/types.h>
 253 #include <linux/fcntl.h>
 254 #include <linux/poll.h>
 255 #include <linux/init.h>
 256 #include <linux/smp_lock.h>
 257 #include <linux/fs.h>
 258 #include <linux/random.h>
 259 #include <linux/bootmem.h>
 260
 261 #include <net/icmp.h>
 262 #include <net/tcp.h>
 263 #include <net/xfrm.h>
 264 #include <net/ip.h>
 265
 266
 267 #include <asm/uaccess.h>
 268 #include <asm/ioctls.h>
 269
 270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 271
 272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
 273
 274 kmem_cache_t *tcp_bucket_cachep;
 275 kmem_cache_t *tcp_timewait_cachep;
 276
 277 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
 278
 279 int sysctl_tcp_mem[3];
 280 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
 281 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
 282
 283 EXPORT_SYMBOL(sysctl_tcp_mem);
 284 EXPORT_SYMBOL(sysctl_tcp_rmem);
 285 EXPORT_SYMBOL(sysctl_tcp_wmem);
 286
 287 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
 288 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
 289
 290 EXPORT_SYMBOL(tcp_memory_allocated);
 291 EXPORT_SYMBOL(tcp_sockets_allocated);
 292
 293 /*
 294  * Pressure flag: try to collapse.
 295  * Technical note: it is used by multiple contexts non atomically.
 296  * All the sk_stream_mem_schedule() is of this nature: accounting
 297  * is strict, actions are advisory and have some latency.
 298  */
 299 int tcp_memory_pressure;
 300
 301 EXPORT_SYMBOL(tcp_memory_pressure);
 302
 303 void tcp_enter_memory_pressure(void)
 304 {
 305         if (!tcp_memory_pressure) {
 306                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
 307                 tcp_memory_pressure = 1;
 308         }
 309 }
 310
 311 EXPORT_SYMBOL(tcp_enter_memory_pressure);
 312
 313 /*
 314  * LISTEN is a special case for poll..
 315  */
 316 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
 317                                                poll_table *wait)
 318 {
 319         return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
 320 }
 321
 322 /*
 323  *      Wait for a TCP event.
 324  *
 325  *      Note that we don't need to lock the socket, as the upper poll layers
 326  *      take care of normal races (between the test and the event) and we don't
 327  *      go look at any of the socket buffers directly.
 328  */
 329 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 330 {
 331         unsigned int mask;
 332         struct sock *sk = sock->sk;
 333         struct tcp_sock *tp = tcp_sk(sk);
 334
 335         poll_wait(file, sk->sk_sleep, wait);
 336         if (sk->sk_state == TCP_LISTEN)
 337                 return tcp_listen_poll(sk, wait);
 338
 339         /* Socket is not locked. We are protected from async events
 340            by poll logic and correct handling of state changes
 341            made by another threads is impossible in any case.
 342          */
 343
 344         mask = 0;
 345         if (sk->sk_err)
 346                 mask = POLLERR;
 347
 348         /*
 349          * POLLHUP is certainly not done right. But poll() doesn't
 350          * have a notion of HUP in just one direction, and for a
 351          * socket the read side is more interesting.
 352          *
 353          * Some poll() documentation says that POLLHUP is incompatible
 354          * with the POLLOUT/POLLWR flags, so somebody should check this
 355          * all. But careful, it tends to be safer to return too many
 356          * bits than too few, and you can easily break real applications
 357          * if you don't tell them that something has hung up!
 358          *
 359          * Check-me.
 360          *
 361          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
 362          * our fs/select.c). It means that after we received EOF,
 363          * poll always returns immediately, making impossible poll() on write()
 364          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
 365          * if and only if shutdown has been made in both directions.
 366          * Actually, it is interesting to look how Solaris and DUX
 367          * solve this dilemma. I would prefer, if PULLHUP were maskable,
 368          * then we could set it on SND_SHUTDOWN. BTW examples given
 369          * in Stevens' books assume exactly this behaviour, it explains
 370          * why PULLHUP is incompatible with POLLOUT.    --ANK
 371          *
 372          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
 373          * blocking on fresh not-connected or disconnected socket. --ANK
 374          */
 375         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
 376                 mask |= POLLHUP;
 377         if (sk->sk_shutdown & RCV_SHUTDOWN)
 378                 mask |= POLLIN | POLLRDNORM;
 379
 380         /* Connected? */
 381         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 382                 /* Potential race condition. If read of tp below will
 383                  * escape above sk->sk_state, we can be illegally awaken
 384                  * in SYN_* states. */
 385                 if ((tp->rcv_nxt != tp->copied_seq) &&
 386                     (tp->urg_seq != tp->copied_seq ||
 387                      tp->rcv_nxt != tp->copied_seq + 1 ||
 388                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
 389                         mask |= POLLIN | POLLRDNORM;
 390
 391                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
 392                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
 393                                 mask |= POLLOUT | POLLWRNORM;
 394                         } else {  /* send SIGIO later */
 395                                 set_bit(SOCK_ASYNC_NOSPACE,
 396                                         &sk->sk_socket->flags);
 397                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 398
 399                                 /* Race breaker. If space is freed after
 400                                  * wspace test but before the flags are set,
 401                                  * IO signal will be lost.
 402                                  */
 403                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
 404                                         mask |= POLLOUT | POLLWRNORM;
 405                         }
 406                 }
 407
 408                 if (tp->urg_data & TCP_URG_VALID)
 409                         mask |= POLLPRI;
 410         }
 411         return mask;
 412 }
 413
 414 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 415 {
 416         struct tcp_sock *tp = tcp_sk(sk);
 417         int answ;
 418
 419         switch (cmd) {
 420         case SIOCINQ:
 421                 if (sk->sk_state == TCP_LISTEN)
 422                         return -EINVAL;
 423
 424                 lock_sock(sk);
 425                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 426                         answ = 0;
 427                 else if (sock_flag(sk, SOCK_URGINLINE) ||
 428                          !tp->urg_data ||
 429                          before(tp->urg_seq, tp->copied_seq) ||
 430                          !before(tp->urg_seq, tp->rcv_nxt)) {
 431                         answ = tp->rcv_nxt - tp->copied_seq;
 432
 433                         /* Subtract 1, if FIN is in queue. */
 434                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
 435                                 answ -=
 436                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
 437                 } else
 438                         answ = tp->urg_seq - tp->copied_seq;
 439                 release_sock(sk);
 440                 break;
 441         case SIOCATMARK:
 442                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 443                 break;
 444         case SIOCOUTQ:
 445                 if (sk->sk_state == TCP_LISTEN)
 446                         return -EINVAL;
 447
 448                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 449                         answ = 0;
 450                 else
 451                         answ = tp->write_seq - tp->snd_una;
 452                 break;
 453         default:
 454                 return -ENOIOCTLCMD;
 455         };
 456
 457         return put_user(answ, (int __user *)arg);
 458 }
 459
 460
 461 int tcp_listen_start(struct sock *sk)
 462 {
 463         struct inet_sock *inet = inet_sk(sk);
 464         struct tcp_sock *tp = tcp_sk(sk);
 465         struct tcp_listen_opt *lopt;
 466
 467         sk->sk_max_ack_backlog = 0;
 468         sk->sk_ack_backlog = 0;
 469         tp->accept_queue = tp->accept_queue_tail = NULL;
 470         rwlock_init(&tp->syn_wait_lock);
 471         tcp_delack_init(tp);
 472
 473         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
 474         if (!lopt)
 475                 return -ENOMEM;
 476
 477         memset(lopt, 0, sizeof(struct tcp_listen_opt));
 478         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
 479                 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
 480                         break;
 481         get_random_bytes(&lopt->hash_rnd, 4);
 482
 483         write_lock_bh(&tp->syn_wait_lock);
 484         tp->listen_opt = lopt;
 485         write_unlock_bh(&tp->syn_wait_lock);
 486
 487         /* There is race window here: we announce ourselves listening,
 488          * but this transition is still not validated by get_port().
 489          * It is OK, because this socket enters to hash table only
 490          * after validation is complete.
 491          */
 492         sk->sk_state = TCP_LISTEN;
 493         if (!sk->sk_prot->get_port(sk, inet->num)) {
 494                 inet->sport = htons(inet->num);
 495
 496                 sk_dst_reset(sk);
 497                 sk->sk_prot->hash(sk);
 498
 499                 return 0;
 500         }
 501
 502         sk->sk_state = TCP_CLOSE;
 503         write_lock_bh(&tp->syn_wait_lock);
 504         tp->listen_opt = NULL;
 505         write_unlock_bh(&tp->syn_wait_lock);
 506         kfree(lopt);
 507         return -EADDRINUSE;
 508 }
 509
 510 /*
 511  *      This routine closes sockets which have been at least partially
 512  *      opened, but not yet accepted.
 513  */
 514
 515 static void tcp_listen_stop (struct sock *sk)
 516 {
 517         struct tcp_sock *tp = tcp_sk(sk);
 518         struct tcp_listen_opt *lopt = tp->listen_opt;
 519         struct request_sock *acc_req = tp->accept_queue;
 520         struct request_sock *req;
 521         int i;
 522
 523         tcp_delete_keepalive_timer(sk);
 524
 525         /* make all the listen_opt local to us */
 526         write_lock_bh(&tp->syn_wait_lock);
 527         tp->listen_opt = NULL;
 528         write_unlock_bh(&tp->syn_wait_lock);
 529         tp->accept_queue = tp->accept_queue_tail = NULL;
 530
 531         if (lopt->qlen) {
 532                 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
 533                         while ((req = lopt->syn_table[i]) != NULL) {
 534                                 lopt->syn_table[i] = req->dl_next;
 535                                 lopt->qlen--;
 536                                 reqsk_free(req);
 537
 538                 /* Following specs, it would be better either to send FIN
 539                  * (and enter FIN-WAIT-1, it is normal close)
 540                  * or to send active reset (abort).
 541                  * Certainly, it is pretty dangerous while synflood, but it is
 542                  * bad justification for our negligence 8)
 543                  * To be honest, we are not able to make either
 544                  * of the variants now.                 --ANK
 545                  */
 546                         }
 547                 }
 548         }
 549         BUG_TRAP(!lopt->qlen);
 550
 551         kfree(lopt);
 552
 553         while ((req = acc_req) != NULL) {
 554                 struct sock *child = req->sk;
 555
 556                 acc_req = req->dl_next;
 557
 558                 local_bh_disable();
 559                 bh_lock_sock(child);
 560                 BUG_TRAP(!sock_owned_by_user(child));
 561                 sock_hold(child);
 562
 563                 tcp_disconnect(child, O_NONBLOCK);
 564
 565                 sock_orphan(child);
 566
 567                 atomic_inc(&tcp_orphan_count);
 568
 569                 tcp_destroy_sock(child);
 570
 571                 bh_unlock_sock(child);
 572                 local_bh_enable();
 573                 sock_put(child);
 574
 575                 sk_acceptq_removed(sk);
 576                 __reqsk_free(req);
 577         }
 578         BUG_TRAP(!sk->sk_ack_backlog);
 579 }
 580
 581 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
 582 {
 583         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 584         tp->pushed_seq = tp->write_seq;
 585 }
 586
 587 static inline int forced_push(struct tcp_sock *tp)
 588 {
 589         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
 590 }
 591
 592 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
 593                               struct sk_buff *skb)
 594 {
 595         skb->csum = 0;
 596         TCP_SKB_CB(skb)->seq = tp->write_seq;
 597         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
 598         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
 599         TCP_SKB_CB(skb)->sacked = 0;
 600         skb_header_release(skb);
 601         __skb_queue_tail(&sk->sk_write_queue, skb);
 602         sk_charge_skb(sk, skb);
 603         if (!sk->sk_send_head)
 604                 sk->sk_send_head = skb;
 605         else if (tp->nonagle&TCP_NAGLE_PUSH)
 606                 tp->nonagle &= ~TCP_NAGLE_PUSH;
 607 }
 608
 609 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
 610                                 struct sk_buff *skb)
 611 {
 612         if (flags & MSG_OOB) {
 613                 tp->urg_mode = 1;
 614                 tp->snd_up = tp->write_seq;
 615                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
 616         }
 617 }
 618
 619 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
 620                             int mss_now, int nonagle)
 621 {
 622         if (sk->sk_send_head) {
 623                 struct sk_buff *skb = sk->sk_write_queue.prev;
 624                 if (!(flags & MSG_MORE) || forced_push(tp))
 625                         tcp_mark_push(tp, skb);
 626                 tcp_mark_urg(tp, flags, skb);
 627                 __tcp_push_pending_frames(sk, tp, mss_now,
 628                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
 629         }
 630 }
 631
 632 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
 633                          size_t psize, int flags)
 634 {
 635         struct tcp_sock *tp = tcp_sk(sk);
 636         int mss_now;
 637         int err;
 638         ssize_t copied;
 639         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 640
 641         /* Wait for a connection to finish. */
 642         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 643                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 644                         goto out_err;
 645
 646         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 647
 648         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 649         copied = 0;
 650
 651         err = -EPIPE;
 652         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 653                 goto do_error;
 654
 655         while (psize > 0) {
 656                 struct sk_buff *skb = sk->sk_write_queue.prev;
 657                 struct page *page = pages[poffset / PAGE_SIZE];
 658                 int copy, i, can_coalesce;
 659                 int offset = poffset % PAGE_SIZE;
 660                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
 661
 662                 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
 663 new_segment:
 664                         if (!sk_stream_memory_free(sk))
 665                                 goto wait_for_sndbuf;
 666
 667                         skb = sk_stream_alloc_pskb(sk, 0, 0,
 668                                                    sk->sk_allocation);
 669                         if (!skb)
 670                                 goto wait_for_memory;
 671
 672                         skb_entail(sk, tp, skb);
 673                         copy = mss_now;
 674                 }
 675
 676                 if (copy > size)
 677                         copy = size;
 678
 679                 i = skb_shinfo(skb)->nr_frags;
 680                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
 681                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
 682                         tcp_mark_push(tp, skb);
 683                         goto new_segment;
 684                 }
 685                 if (sk->sk_forward_alloc < copy &&
 686                     !sk_stream_mem_schedule(sk, copy, 0))
 687                         goto wait_for_memory;
 688
 689                 if (can_coalesce) {
 690                         skb_shinfo(skb)->frags[i - 1].size += copy;
 691                 } else {
 692                         get_page(page);
 693                         skb_fill_page_desc(skb, i, page, offset, copy);
 694                 }
 695
 696                 skb->len += copy;
 697                 skb->data_len += copy;
 698                 skb->truesize += copy;
 699                 sk->sk_wmem_queued += copy;
 700                 sk->sk_forward_alloc -= copy;
 701                 skb->ip_summed = CHECKSUM_HW;
 702                 tp->write_seq += copy;
 703                 TCP_SKB_CB(skb)->end_seq += copy;
 704                 skb_shinfo(skb)->tso_segs = 0;
 705
 706                 if (!copied)
 707                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
 708
 709                 copied += copy;
 710                 poffset += copy;
 711                 if (!(psize -= copy))
 712                         goto out;
 713
 714                 if (skb->len != mss_now || (flags & MSG_OOB))
 715                         continue;
 716
 717                 if (forced_push(tp)) {
 718                         tcp_mark_push(tp, skb);
 719                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
 720                 } else if (skb == sk->sk_send_head)
 721                         tcp_push_one(sk, mss_now);
 722                 continue;
 723
 724 wait_for_sndbuf:
 725                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 726 wait_for_memory:
 727                 if (copied)
 728                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 729
 730                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 731                         goto do_error;
 732
 733                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 734         }
 735
 736 out:
 737         if (copied)
 738                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
 739         return copied;
 740
 741 do_error:
 742         if (copied)
 743                 goto out;
 744 out_err:
 745         return sk_stream_error(sk, flags, err);
 746 }
 747
 748 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 749                      size_t size, int flags)
 750 {
 751         ssize_t res;
 752         struct sock *sk = sock->sk;
 753
 754 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
 755
 756         if (!(sk->sk_route_caps & NETIF_F_SG) ||
 757             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
 758                 return sock_no_sendpage(sock, page, offset, size, flags);
 759
 760 #undef TCP_ZC_CSUM_FLAGS
 761
 762         lock_sock(sk);
 763         TCP_CHECK_TIMER(sk);
 764         res = do_tcp_sendpages(sk, &page, offset, size, flags);
 765         TCP_CHECK_TIMER(sk);
 766         release_sock(sk);
 767         return res;
 768 }
 769
 770 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
 771 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
 772
 773 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
 774 {
 775         int tmp = tp->mss_cache_std;
 776
 777         if (sk->sk_route_caps & NETIF_F_SG) {
 778                 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
 779
 780                 if (tmp >= pgbreak &&
 781                     tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
 782                         tmp = pgbreak;
 783         }
 784         return tmp;
 785 }
 786
 787 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 788                 size_t size)
 789 {
 790         struct iovec *iov;
 791         struct tcp_sock *tp = tcp_sk(sk);
 792         struct sk_buff *skb;
 793         int iovlen, flags;
 794         int mss_now;
 795         int err, copied;
 796         long timeo;
 797
 798         lock_sock(sk);
 799         TCP_CHECK_TIMER(sk);
 800
 801         flags = msg->msg_flags;
 802         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 803
 804         /* Wait for a connection to finish. */
 805         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 806                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 807                         goto out_err;
 808
 809         /* This should be in poll */
 810         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 811
 812         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 813
 814         /* Ok commence sending. */
 815         iovlen = msg->msg_iovlen;
 816         iov = msg->msg_iov;
 817         copied = 0;
 818
 819         err = -EPIPE;
 820         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 821                 goto do_error;
 822
 823         while (--iovlen >= 0) {
 824                 int seglen = iov->iov_len;
 825                 unsigned char __user *from = iov->iov_base;
 826
 827                 iov++;
 828
 829                 while (seglen > 0) {
 830                         int copy;
 831
 832                         skb = sk->sk_write_queue.prev;
 833
 834                         if (!sk->sk_send_head ||
 835                             (copy = mss_now - skb->len) <= 0) {
 836
 837 new_segment:
 838                                 /* Allocate new segment. If the interface is SG,
 839                                  * allocate skb fitting to single page.
 840                                  */
 841                                 if (!sk_stream_memory_free(sk))
 842                                         goto wait_for_sndbuf;
 843
 844                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
 845                                                            0, sk->sk_allocation);
 846                                 if (!skb)
 847                                         goto wait_for_memory;
 848
 849                                 /*
 850                                  * Check whether we can use HW checksum.
 851                                  */
 852                                 if (sk->sk_route_caps &
 853                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
 854                                      NETIF_F_HW_CSUM))
 855                                         skb->ip_summed = CHECKSUM_HW;
 856
 857                                 skb_entail(sk, tp, skb);
 858                                 copy = mss_now;
 859                         }
 860
 861                         /* Try to append data to the end of skb. */
 862                         if (copy > seglen)
 863                                 copy = seglen;
 864
 865                         /* Where to copy to? */
 866                         if (skb_tailroom(skb) > 0) {
 867                                 /* We have some space in skb head. Superb! */
 868                                 if (copy > skb_tailroom(skb))
 869                                         copy = skb_tailroom(skb);
 870                                 if ((err = skb_add_data(skb, from, copy)) != 0)
 871                                         goto do_fault;
 872                         } else {
 873                                 int merge = 0;
 874                                 int i = skb_shinfo(skb)->nr_frags;
 875                                 struct page *page = TCP_PAGE(sk);
 876                                 int off = TCP_OFF(sk);
 877
 878                                 if (skb_can_coalesce(skb, i, page, off) &&
 879                                     off != PAGE_SIZE) {
 880                                         /* We can extend the last page
 881                                          * fragment. */
 882                                         merge = 1;
 883                                 } else if (i == MAX_SKB_FRAGS ||
 884                                            (!i &&
 885                                            !(sk->sk_route_caps & NETIF_F_SG))) {
 886                                         /* Need to add new fragment and cannot
 887                                          * do this because interface is non-SG,
 888                                          * or because all the page slots are
 889                                          * busy. */
 890                                         tcp_mark_push(tp, skb);
 891                                         goto new_segment;
 892                                 } else if (page) {
 893                                         /* If page is cached, align
 894                                          * offset to L1 cache boundary
 895                                          */
 896                                         off = (off + L1_CACHE_BYTES - 1) &
 897                                               ~(L1_CACHE_BYTES - 1);
 898                                         if (off == PAGE_SIZE) {
 899                                                 put_page(page);
 900                                                 TCP_PAGE(sk) = page = NULL;
 901                                         }
 902                                 }
 903
 904                                 if (!page) {
 905                                         /* Allocate new cache page. */
 906                                         if (!(page = sk_stream_alloc_page(sk)))
 907                                                 goto wait_for_memory;
 908                                         off = 0;
 909                                 }
 910
 911                                 if (copy > PAGE_SIZE - off)
 912                                         copy = PAGE_SIZE - off;
 913
 914                                 /* Time to copy data. We are close to
 915                                  * the end! */
 916                                 err = skb_copy_to_page(sk, from, skb, page,
 917                                                        off, copy);
 918                                 if (err) {
 919                                         /* If this page was new, give it to the
 920                                          * socket so it does not get leaked.
 921                                          */
 922                                         if (!TCP_PAGE(sk)) {
 923                                                 TCP_PAGE(sk) = page;
 924                                                 TCP_OFF(sk) = 0;
 925                                         }
 926                                         goto do_error;
 927                                 }
 928
 929                                 /* Update the skb. */
 930                                 if (merge) {
 931                                         skb_shinfo(skb)->frags[i - 1].size +=
 932                                                                         copy;
 933                                 } else {
 934                                         skb_fill_page_desc(skb, i, page, off, copy);
 935                                         if (TCP_PAGE(sk)) {
 936                                                 get_page(page);
 937                                         } else if (off + copy < PAGE_SIZE) {
 938                                                 get_page(page);
 939                                                 TCP_PAGE(sk) = page;
 940                                         }
 941                                 }
 942
 943                                 TCP_OFF(sk) = off + copy;
 944                         }
 945
 946                         if (!copied)
 947                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
 948
 949                         tp->write_seq += copy;
 950                         TCP_SKB_CB(skb)->end_seq += copy;
 951                         skb_shinfo(skb)->tso_segs = 0;
 952
 953                         from += copy;
 954                         copied += copy;
 955                         if ((seglen -= copy) == 0 && iovlen == 0)
 956                                 goto out;
 957
 958                         if (skb->len != mss_now || (flags & MSG_OOB))
 959                                 continue;
 960
 961                         if (forced_push(tp)) {
 962                                 tcp_mark_push(tp, skb);
 963                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
 964                         } else if (skb == sk->sk_send_head)
 965                                 tcp_push_one(sk, mss_now);
 966                         continue;
 967
 968 wait_for_sndbuf:
 969                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 970 wait_for_memory:
 971                         if (copied)
 972                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 973
 974                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 975                                 goto do_error;
 976
 977                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 978                 }
 979         }
 980
 981 out:
 982         if (copied)
 983                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
 984         TCP_CHECK_TIMER(sk);
 985         release_sock(sk);
 986         return copied;
 987
 988 do_fault:
 989         if (!skb->len) {
 990                 if (sk->sk_send_head == skb)
 991                         sk->sk_send_head = NULL;
 992                 __skb_unlink(skb, skb->list);
 993                 sk_stream_free_skb(sk, skb);
 994         }
 995
 996 do_error:
 997         if (copied)
 998                 goto out;
 999 out_err:
1000         err = sk_stream_error(sk, flags, err);
1001         TCP_CHECK_TIMER(sk);
1002         release_sock(sk);
1003         return err;
1004 }
1005
1006 /*
1007  *      Handle reading urgent data. BSD has very simple semantics for
1008  *      this, no blocking and very strange errors 8)
1009  */
1010
1011 static int tcp_recv_urg(struct sock *sk, long timeo,
1012                         struct msghdr *msg, int len, int flags,
1013                         int *addr_len)
1014 {
1015         struct tcp_sock *tp = tcp_sk(sk);
1016
1017         /* No URG data to read. */
1018         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1019             tp->urg_data == TCP_URG_READ)
1020                 return -EINVAL; /* Yes this is right ! */
1021
1022         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1023                 return -ENOTCONN;
1024
1025         if (tp->urg_data & TCP_URG_VALID) {
1026                 int err = 0;
1027                 char c = tp->urg_data;
1028
1029                 if (!(flags & MSG_PEEK))
1030                         tp->urg_data = TCP_URG_READ;
1031
1032                 /* Read urgent data. */
1033                 msg->msg_flags |= MSG_OOB;
1034
1035                 if (len > 0) {
1036                         if (!(flags & MSG_TRUNC))
1037                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1038                         len = 1;
1039                 } else
1040                         msg->msg_flags |= MSG_TRUNC;
1041
1042                 return err ? -EFAULT : len;
1043         }
1044
1045         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1046                 return 0;
1047
1048         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1049          * the available implementations agree in this case:
1050          * this call should never block, independent of the
1051          * blocking state of the socket.
1052          * Mike <pall@rz.uni-karlsruhe.de>
1053          */
1054         return -EAGAIN;
1055 }
1056
1057 /* Clean up the receive buffer for full frames taken by the user,
1058  * then send an ACK if necessary.  COPIED is the number of bytes
1059  * tcp_recvmsg has given to the user so far, it speeds up the
1060  * calculation of whether or not we must ACK for the sake of
1061  * a window update.
1062  */
1063 static void cleanup_rbuf(struct sock *sk, int copied)
1064 {
1065         struct tcp_sock *tp = tcp_sk(sk);
1066         int time_to_ack = 0;
1067
1068 #if TCP_DEBUG
1069         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1070
1071         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1072 #endif
1073
1074         if (tcp_ack_scheduled(tp)) {
1075                    /* Delayed ACKs frequently hit locked sockets during bulk
1076                     * receive. */
1077                 if (tp->ack.blocked ||
1078                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1079                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1080                     /*
1081                      * If this read emptied read buffer, we send ACK, if
1082                      * connection is not bidirectional, user drained
1083                      * receive buffer and there was a small segment
1084                      * in queue.
1085                      */
1086                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1087                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1088                         time_to_ack = 1;
1089         }
1090
1091         /* We send an ACK if we can now advertise a non-zero window
1092          * which has been raised "significantly".
1093          *
1094          * Even if window raised up to infinity, do not send window open ACK
1095          * in states, where we will not receive more. It is useless.
1096          */
1097         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1098                 __u32 rcv_window_now = tcp_receive_window(tp);
1099
1100                 /* Optimize, __tcp_select_window() is not cheap. */
1101                 if (2*rcv_window_now <= tp->window_clamp) {
1102                         __u32 new_window = __tcp_select_window(sk);
1103
1104                         /* Send ACK now, if this read freed lots of space
1105                          * in our buffer. Certainly, new_window is new window.
1106                          * We can advertise it now, if it is not less than current one.
1107                          * "Lots" means "at least twice" here.
1108                          */
1109                         if (new_window && new_window >= 2 * rcv_window_now)
1110                                 time_to_ack = 1;
1111                 }
1112         }
1113         if (time_to_ack)
1114                 tcp_send_ack(sk);
1115 }
1116
1117 static void tcp_prequeue_process(struct sock *sk)
1118 {
1119         struct sk_buff *skb;
1120         struct tcp_sock *tp = tcp_sk(sk);
1121
1122         NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1123
1124         /* RX process wants to run with disabled BHs, though it is not
1125          * necessary */
1126         local_bh_disable();
1127         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1128                 sk->sk_backlog_rcv(sk, skb);
1129         local_bh_enable();
1130
1131         /* Clear memory counter. */
1132         tp->ucopy.memory = 0;
1133 }
1134
1135 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1136 {
1137         struct sk_buff *skb;
1138         u32 offset;
1139
1140         skb_queue_walk(&sk->sk_receive_queue, skb) {
1141                 offset = seq - TCP_SKB_CB(skb)->seq;
1142                 if (skb->h.th->syn)
1143                         offset--;
1144                 if (offset < skb->len || skb->h.th->fin) {
1145                         *off = offset;
1146                         return skb;
1147                 }
1148         }
1149         return NULL;
1150 }
1151
1152 /*
1153  * This routine provides an alternative to tcp_recvmsg() for routines
1154  * that would like to handle copying from skbuffs directly in 'sendfile'
1155  * fashion.
1156  * Note:
1157  *      - It is assumed that the socket was locked by the caller.
1158  *      - The routine does not block.
1159  *      - At present, there is no support for reading OOB data
1160  *        or for 'peeking' the socket using this routine
1161  *        (although both would be easy to implement).
1162  */
1163 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1164                   sk_read_actor_t recv_actor)
1165 {
1166         struct sk_buff *skb;
1167         struct tcp_sock *tp = tcp_sk(sk);
1168         u32 seq = tp->copied_seq;
1169         u32 offset;
1170         int copied = 0;
1171
1172         if (sk->sk_state == TCP_LISTEN)
1173                 return -ENOTCONN;
1174         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1175                 if (offset < skb->len) {
1176                         size_t used, len;
1177
1178                         len = skb->len - offset;
1179                         /* Stop reading if we hit a patch of urgent data */
1180                         if (tp->urg_data) {
1181                                 u32 urg_offset = tp->urg_seq - seq;
1182                                 if (urg_offset < len)
1183                                         len = urg_offset;
1184                                 if (!len)
1185                                         break;
1186                         }
1187                         used = recv_actor(desc, skb, offset, len);
1188                         if (used <= len) {
1189                                 seq += used;
1190                                 copied += used;
1191                                 offset += used;
1192                         }
1193                         if (offset != skb->len)
1194                                 break;
1195                 }
1196                 if (skb->h.th->fin) {
1197                         sk_eat_skb(sk, skb);
1198                         ++seq;
1199                         break;
1200                 }
1201                 sk_eat_skb(sk, skb);
1202                 if (!desc->count)
1203                         break;
1204         }
1205         tp->copied_seq = seq;
1206
1207         tcp_rcv_space_adjust(sk);
1208
1209         /* Clean up data we have read: This will do ACK frames. */
1210         if (copied)
1211                 cleanup_rbuf(sk, copied);
1212         return copied;
1213 }
1214
1215 /*
1216  *      This routine copies from a sock struct into the user buffer.
1217  *
1218  *      Technical note: in 2.3 we work on _locked_ socket, so that
1219  *      tricks with *seq access order and skb->users are not required.
1220  *      Probably, code can be easily improved even more.
1221  */
1222
1223 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1224                 size_t len, int nonblock, int flags, int *addr_len)
1225 {
1226         struct tcp_sock *tp = tcp_sk(sk);
1227         int copied = 0;
1228         u32 peek_seq;
1229         u32 *seq;
1230         unsigned long used;
1231         int err;
1232         int target;             /* Read at least this many bytes */
1233         long timeo;
1234         struct task_struct *user_recv = NULL;
1235
1236         lock_sock(sk);
1237
1238         TCP_CHECK_TIMER(sk);
1239
1240         err = -ENOTCONN;
1241         if (sk->sk_state == TCP_LISTEN)
1242                 goto out;
1243
1244         timeo = sock_rcvtimeo(sk, nonblock);
1245
1246         /* Urgent data needs to be handled specially. */
1247         if (flags & MSG_OOB)
1248                 goto recv_urg;
1249
1250         seq = &tp->copied_seq;
1251         if (flags & MSG_PEEK) {
1252                 peek_seq = tp->copied_seq;
1253                 seq = &peek_seq;
1254         }
1255
1256         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1257
1258         do {
1259                 struct sk_buff *skb;
1260                 u32 offset;
1261
1262                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1263                 if (tp->urg_data && tp->urg_seq == *seq) {
1264                         if (copied)
1265                                 break;
1266                         if (signal_pending(current)) {
1267                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1268                                 break;
1269                         }
1270                 }
1271
1272                 /* Next get a buffer. */
1273
1274                 skb = skb_peek(&sk->sk_receive_queue);
1275                 do {
1276                         if (!skb)
1277                                 break;
1278
1279                         /* Now that we have two receive queues this
1280                          * shouldn't happen.
1281                          */
1282                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1283                                 printk(KERN_INFO "recvmsg bug: copied %X "
1284                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1285                                 break;
1286                         }
1287                         offset = *seq - TCP_SKB_CB(skb)->seq;
1288                         if (skb->h.th->syn)
1289                                 offset--;
1290                         if (offset < skb->len)
1291                                 goto found_ok_skb;
1292                         if (skb->h.th->fin)
1293                                 goto found_fin_ok;
1294                         BUG_TRAP(flags & MSG_PEEK);
1295                         skb = skb->next;
1296                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1297
1298                 /* Well, if we have backlog, try to process it now yet. */
1299
1300                 if (copied >= target && !sk->sk_backlog.tail)
1301                         break;
1302
1303                 if (copied) {
1304                         if (sk->sk_err ||
1305                             sk->sk_state == TCP_CLOSE ||
1306                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1307                             !timeo ||
1308                             signal_pending(current) ||
1309                             (flags & MSG_PEEK))
1310                                 break;
1311                 } else {
1312                         if (sock_flag(sk, SOCK_DONE))
1313                                 break;
1314
1315                         if (sk->sk_err) {
1316                                 copied = sock_error(sk);
1317                                 break;
1318                         }
1319
1320                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1321                                 break;
1322
1323                         if (sk->sk_state == TCP_CLOSE) {
1324                                 if (!sock_flag(sk, SOCK_DONE)) {
1325                                         /* This occurs when user tries to read
1326                                          * from never connected socket.
1327                                          */
1328                                         copied = -ENOTCONN;
1329                                         break;
1330                                 }
1331                                 break;
1332                         }
1333
1334                         if (!timeo) {
1335                                 copied = -EAGAIN;
1336                                 break;
1337                         }
1338
1339                         if (signal_pending(current)) {
1340                                 copied = sock_intr_errno(timeo);
1341                                 break;
1342                         }
1343                 }
1344
1345                 cleanup_rbuf(sk, copied);
1346
1347                 if (tp->ucopy.task == user_recv) {
1348                         /* Install new reader */
1349                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1350                                 user_recv = current;
1351                                 tp->ucopy.task = user_recv;
1352                                 tp->ucopy.iov = msg->msg_iov;
1353                         }
1354
1355                         tp->ucopy.len = len;
1356
1357                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1358                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1359
1360                         /* Ugly... If prequeue is not empty, we have to
1361                          * process it before releasing socket, otherwise
1362                          * order will be broken at second iteration.
1363                          * More elegant solution is required!!!
1364                          *
1365                          * Look: we have the following (pseudo)queues:
1366                          *
1367                          * 1. packets in flight
1368                          * 2. backlog
1369                          * 3. prequeue
1370                          * 4. receive_queue
1371                          *
1372                          * Each queue can be processed only if the next ones
1373                          * are empty. At this point we have empty receive_queue.
1374                          * But prequeue _can_ be not empty after 2nd iteration,
1375                          * when we jumped to start of loop because backlog
1376                          * processing added something to receive_queue.
1377                          * We cannot release_sock(), because backlog contains
1378                          * packets arrived _after_ prequeued ones.
1379                          *
1380                          * Shortly, algorithm is clear --- to process all
1381                          * the queues in order. We could make it more directly,
1382                          * requeueing packets from backlog to prequeue, if
1383                          * is not empty. It is more elegant, but eats cycles,
1384                          * unfortunately.
1385                          */
1386                         if (skb_queue_len(&tp->ucopy.prequeue))
1387                                 goto do_prequeue;
1388
1389                         /* __ Set realtime policy in scheduler __ */
1390                 }
1391
1392                 if (copied >= target) {
1393                         /* Do not sleep, just process backlog. */
1394                         release_sock(sk);
1395                         lock_sock(sk);
1396                 } else
1397                         sk_wait_data(sk, &timeo);
1398
1399                 if (user_recv) {
1400                         int chunk;
1401
1402                         /* __ Restore normal policy in scheduler __ */
1403
1404                         if ((chunk = len - tp->ucopy.len) != 0) {
1405                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1406                                 len -= chunk;
1407                                 copied += chunk;
1408                         }
1409
1410                         if (tp->rcv_nxt == tp->copied_seq &&
1411                             skb_queue_len(&tp->ucopy.prequeue)) {
1412 do_prequeue:
1413                                 tcp_prequeue_process(sk);
1414
1415                                 if ((chunk = len - tp->ucopy.len) != 0) {
1416                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1417                                         len -= chunk;
1418                                         copied += chunk;
1419                                 }
1420                         }
1421                 }
1422                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1423                         if (net_ratelimit())
1424                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1425                                        current->comm, current->pid);
1426                         peek_seq = tp->copied_seq;
1427                 }
1428                 continue;
1429
1430         found_ok_skb:
1431                 /* Ok so how much can we use? */
1432                 used = skb->len - offset;
1433                 if (len < used)
1434                         used = len;
1435
1436                 /* Do we have urgent data here? */
1437                 if (tp->urg_data) {
1438                         u32 urg_offset = tp->urg_seq - *seq;
1439                         if (urg_offset < used) {
1440                                 if (!urg_offset) {
1441                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1442                                                 ++*seq;
1443                                                 offset++;
1444                                                 used--;
1445                                                 if (!used)
1446                                                         goto skip_copy;
1447                                         }
1448                                 } else
1449                                         used = urg_offset;
1450                         }
1451                 }
1452
1453                 if (!(flags & MSG_TRUNC)) {
1454                         err = skb_copy_datagram_iovec(skb, offset,
1455                                                       msg->msg_iov, used);
1456                         if (err) {
1457                                 /* Exception. Bailout! */
1458                                 if (!copied)
1459                                         copied = -EFAULT;
1460                                 break;
1461                         }
1462                 }
1463
1464                 *seq += used;
1465                 copied += used;
1466                 len -= used;
1467
1468                 tcp_rcv_space_adjust(sk);
1469
1470 skip_copy:
1471                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1472                         tp->urg_data = 0;
1473                         tcp_fast_path_check(sk, tp);
1474                 }
1475                 if (used + offset < skb->len)
1476                         continue;
1477
1478                 if (skb->h.th->fin)
1479                         goto found_fin_ok;
1480                 if (!(flags & MSG_PEEK))
1481                         sk_eat_skb(sk, skb);
1482                 continue;
1483
1484         found_fin_ok:
1485                 /* Process the FIN. */
1486                 ++*seq;
1487                 if (!(flags & MSG_PEEK))
1488                         sk_eat_skb(sk, skb);
1489                 break;
1490         } while (len > 0);
1491
1492         if (user_recv) {
1493                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1494                         int chunk;
1495
1496                         tp->ucopy.len = copied > 0 ? len : 0;
1497
1498                         tcp_prequeue_process(sk);
1499
1500                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1501                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1502                                 len -= chunk;
1503                                 copied += chunk;
1504                         }
1505                 }
1506
1507                 tp->ucopy.task = NULL;
1508                 tp->ucopy.len = 0;
1509         }
1510
1511         /* According to UNIX98, msg_name/msg_namelen are ignored
1512          * on connected socket. I was just happy when found this 8) --ANK
1513          */
1514
1515         /* Clean up data we have read: This will do ACK frames. */
1516         cleanup_rbuf(sk, copied);
1517
1518         TCP_CHECK_TIMER(sk);
1519         release_sock(sk);
1520         return copied;
1521
1522 out:
1523         TCP_CHECK_TIMER(sk);
1524         release_sock(sk);
1525         return err;
1526
1527 recv_urg:
1528         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1529         goto out;
1530 }
1531
1532 /*
1533  *      State processing on a close. This implements the state shift for
1534  *      sending our FIN frame. Note that we only send a FIN for some
1535  *      states. A shutdown() may have already sent the FIN, or we may be
1536  *      closed.
1537  */
1538
1539 static unsigned char new_state[16] = {
1540   /* current state:        new state:      action:      */
1541   /* (Invalid)          */ TCP_CLOSE,
1542   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1543   /* TCP_SYN_SENT       */ TCP_CLOSE,
1544   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1545   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1546   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1547   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1548   /* TCP_CLOSE          */ TCP_CLOSE,
1549   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1550   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1551   /* TCP_LISTEN         */ TCP_CLOSE,
1552   /* TCP_CLOSING        */ TCP_CLOSING,
1553 };
1554
1555 static int tcp_close_state(struct sock *sk)
1556 {
1557         int next = (int)new_state[sk->sk_state];
1558         int ns = next & TCP_STATE_MASK;
1559
1560         tcp_set_state(sk, ns);
1561
1562         return next & TCP_ACTION_FIN;
1563 }
1564
1565 /*
1566  *      Shutdown the sending side of a connection. Much like close except
1567  *      that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1568  */
1569
1570 void tcp_shutdown(struct sock *sk, int how)
1571 {
1572         /*      We need to grab some memory, and put together a FIN,
1573          *      and then put it into the queue to be sent.
1574          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1575          */
1576         if (!(how & SEND_SHUTDOWN))
1577                 return;
1578
1579         /* If we've already sent a FIN, or it's a closed state, skip this. */
1580         if ((1 << sk->sk_state) &
1581             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1582              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1583                 /* Clear out any half completed packets.  FIN if needed. */
1584                 if (tcp_close_state(sk))
1585                         tcp_send_fin(sk);
1586         }
1587 }
1588
1589 /*
1590  * At this point, there should be no process reference to this
1591  * socket, and thus no user references at all.  Therefore we
1592  * can assume the socket waitqueue is inactive and nobody will
1593  * try to jump onto it.
1594  */
1595 void tcp_destroy_sock(struct sock *sk)
1596 {
1597         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1598         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1599
1600         /* It cannot be in hash table! */
1601         BUG_TRAP(sk_unhashed(sk));
1602
1603         /* If it has not 0 inet_sk(sk)->num, it must be bound */
1604         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1605
1606         sk->sk_prot->destroy(sk);
1607
1608         sk_stream_kill_queues(sk);
1609
1610         xfrm_sk_free_policy(sk);
1611
1612 #ifdef INET_REFCNT_DEBUG
1613         if (atomic_read(&sk->sk_refcnt) != 1) {
1614                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1615                        sk, atomic_read(&sk->sk_refcnt));
1616         }
1617 #endif
1618
1619         atomic_dec(&tcp_orphan_count);
1620         sock_put(sk);
1621 }
1622
1623 void tcp_close(struct sock *sk, long timeout)
1624 {
1625         struct sk_buff *skb;
1626         int data_was_unread = 0;
1627
1628         lock_sock(sk);
1629         sk->sk_shutdown = SHUTDOWN_MASK;
1630
1631         if (sk->sk_state == TCP_LISTEN) {
1632                 tcp_set_state(sk, TCP_CLOSE);
1633
1634                 /* Special case. */
1635                 tcp_listen_stop(sk);
1636
1637                 goto adjudge_to_death;
1638         }
1639
1640         /*  We need to flush the recv. buffs.  We do this only on the
1641          *  descriptor close, not protocol-sourced closes, because the
1642          *  reader process may not have drained the data yet!
1643          */
1644         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1645                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1646                           skb->h.th->fin;
1647                 data_was_unread += len;
1648                 __kfree_skb(skb);
1649         }
1650
1651         sk_stream_mem_reclaim(sk);
1652
1653         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1654          * 3.10, we send a RST here because data was lost.  To
1655          * witness the awful effects of the old behavior of always
1656          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1657          * a bulk GET in an FTP client, suspend the process, wait
1658          * for the client to advertise a zero window, then kill -9
1659          * the FTP client, wheee...  Note: timeout is always zero
1660          * in such a case.
1661          */
1662         if (data_was_unread) {
1663                 /* Unread data was tossed, zap the connection. */
1664                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1665                 tcp_set_state(sk, TCP_CLOSE);
1666                 tcp_send_active_reset(sk, GFP_KERNEL);
1667         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1668                 /* Check zero linger _after_ checking for unread data. */
1669                 sk->sk_prot->disconnect(sk, 0);
1670                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1671         } else if (tcp_close_state(sk)) {
1672                 /* We FIN if the application ate all the data before
1673                  * zapping the connection.
1674                  */
1675
1676                 /* RED-PEN. Formally speaking, we have broken TCP state
1677                  * machine. State transitions:
1678                  *
1679                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1680                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1681                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1682                  *
1683                  * are legal only when FIN has been sent (i.e. in window),
1684                  * rather than queued out of window. Purists blame.
1685                  *
1686                  * F.e. "RFC state" is ESTABLISHED,
1687                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1688                  *
1689                  * The visible declinations are that sometimes
1690                  * we enter time-wait state, when it is not required really
1691                  * (harmless), do not send active resets, when they are
1692                  * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1693                  * they look as CLOSING or LAST_ACK for Linux)
1694                  * Probably, I missed some more holelets.
1695                  *                                              --ANK
1696                  */
1697                 tcp_send_fin(sk);
1698         }
1699
1700         sk_stream_wait_close(sk, timeout);
1701
1702 adjudge_to_death:
1703         /* It is the last release_sock in its life. It will remove backlog. */
1704         release_sock(sk);
1705
1706
1707         /* Now socket is owned by kernel and we acquire BH lock
1708            to finish close. No need to check for user refs.
1709          */
1710         local_bh_disable();
1711         bh_lock_sock(sk);
1712         BUG_TRAP(!sock_owned_by_user(sk));
1713
1714         sock_hold(sk);
1715         sock_orphan(sk);
1716
1717         /*      This is a (useful) BSD violating of the RFC. There is a
1718          *      problem with TCP as specified in that the other end could
1719          *      keep a socket open forever with no application left this end.
1720          *      We use a 3 minute timeout (about the same as BSD) then kill
1721          *      our end. If they send after that then tough - BUT: long enough
1722          *      that we won't make the old 4*rto = almost no time - whoops
1723          *      reset mistake.
1724          *
1725          *      Nope, it was not mistake. It is really desired behaviour
1726          *      f.e. on http servers, when such sockets are useless, but
1727          *      consume significant resources. Let's do it with special
1728          *      linger2 option.                                 --ANK
1729          */
1730
1731         if (sk->sk_state == TCP_FIN_WAIT2) {
1732                 struct tcp_sock *tp = tcp_sk(sk);
1733                 if (tp->linger2 < 0) {
1734                         tcp_set_state(sk, TCP_CLOSE);
1735                         tcp_send_active_reset(sk, GFP_ATOMIC);
1736                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1737                 } else {
1738                         int tmo = tcp_fin_time(tp);
1739
1740                         if (tmo > TCP_TIMEWAIT_LEN) {
1741                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1742                         } else {
1743                                 atomic_inc(&tcp_orphan_count);
1744                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1745                                 goto out;
1746                         }
1747                 }
1748         }
1749         if (sk->sk_state != TCP_CLOSE) {
1750                 sk_stream_mem_reclaim(sk);
1751                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1752                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1753                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1754                         if (net_ratelimit())
1755                                 printk(KERN_INFO "TCP: too many of orphaned "
1756                                        "sockets\n");
1757                         tcp_set_state(sk, TCP_CLOSE);
1758                         tcp_send_active_reset(sk, GFP_ATOMIC);
1759                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1760                 }
1761         }
1762         atomic_inc(&tcp_orphan_count);
1763
1764         if (sk->sk_state == TCP_CLOSE)
1765                 tcp_destroy_sock(sk);
1766         /* Otherwise, socket is reprieved until protocol close. */
1767
1768 out:
1769         bh_unlock_sock(sk);
1770         local_bh_enable();
1771         sock_put(sk);
1772 }
1773
1774 /* These states need RST on ABORT according to RFC793 */
1775
1776 static inline int tcp_need_reset(int state)
1777 {
1778         return (1 << state) &
1779                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1780                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1781 }
1782
1783 int tcp_disconnect(struct sock *sk, int flags)
1784 {
1785         struct inet_sock *inet = inet_sk(sk);
1786         struct tcp_sock *tp = tcp_sk(sk);
1787         int err = 0;
1788         int old_state = sk->sk_state;
1789
1790         if (old_state != TCP_CLOSE)
1791                 tcp_set_state(sk, TCP_CLOSE);
1792
1793         /* ABORT function of RFC793 */
1794         if (old_state == TCP_LISTEN) {
1795                 tcp_listen_stop(sk);
1796         } else if (tcp_need_reset(old_state) ||
1797                    (tp->snd_nxt != tp->write_seq &&
1798                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1799                 /* The last check adjusts for discrepance of Linux wrt. RFC
1800                  * states
1801                  */
1802                 tcp_send_active_reset(sk, gfp_any());
1803                 sk->sk_err = ECONNRESET;
1804         } else if (old_state == TCP_SYN_SENT)
1805                 sk->sk_err = ECONNRESET;
1806
1807         tcp_clear_xmit_timers(sk);
1808         __skb_queue_purge(&sk->sk_receive_queue);
1809         sk_stream_writequeue_purge(sk);
1810         __skb_queue_purge(&tp->out_of_order_queue);
1811
1812         inet->dport = 0;
1813
1814         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1815                 inet_reset_saddr(sk);
1816
1817         sk->sk_shutdown = 0;
1818         sock_reset_flag(sk, SOCK_DONE);
1819         tp->srtt = 0;
1820         if ((tp->write_seq += tp->max_window + 2) == 0)
1821                 tp->write_seq = 1;
1822         tp->backoff = 0;
1823         tp->snd_cwnd = 2;
1824         tp->probes_out = 0;
1825         tp->packets_out = 0;
1826         tp->snd_ssthresh = 0x7fffffff;
1827         tp->snd_cwnd_cnt = 0;
1828         tcp_set_ca_state(tp, TCP_CA_Open);
1829         tcp_clear_retrans(tp);
1830         tcp_delack_init(tp);
1831         sk->sk_send_head = NULL;
1832         tp->rx_opt.saw_tstamp = 0;
1833         tcp_sack_reset(&tp->rx_opt);
1834         __sk_dst_reset(sk);
1835
1836         BUG_TRAP(!inet->num || tp->bind_hash);
1837
1838         sk->sk_error_report(sk);
1839         return err;
1840 }
1841
1842 /*
1843  *      Wait for an incoming connection, avoid race
1844  *      conditions. This must be called with the socket locked.
1845  */
1846 static int wait_for_connect(struct sock *sk, long timeo)
1847 {
1848         struct tcp_sock *tp = tcp_sk(sk);
1849         DEFINE_WAIT(wait);
1850         int err;
1851
1852         /*
1853          * True wake-one mechanism for incoming connections: only
1854          * one process gets woken up, not the 'whole herd'.
1855          * Since we do not 'race & poll' for established sockets
1856          * anymore, the common case will execute the loop only once.
1857          *
1858          * Subtle issue: "add_wait_queue_exclusive()" will be added
1859          * after any current non-exclusive waiters, and we know that
1860          * it will always _stay_ after any new non-exclusive waiters
1861          * because all non-exclusive waiters are added at the
1862          * beginning of the wait-queue. As such, it's ok to "drop"
1863          * our exclusiveness temporarily when we get woken up without
1864          * having to remove and re-insert us on the wait queue.
1865          */
1866         for (;;) {
1867                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1868                                           TASK_INTERRUPTIBLE);
1869                 release_sock(sk);
1870                 if (!tp->accept_queue)
1871                         timeo = schedule_timeout(timeo);
1872                 lock_sock(sk);
1873                 err = 0;
1874                 if (tp->accept_queue)
1875                         break;
1876                 err = -EINVAL;
1877                 if (sk->sk_state != TCP_LISTEN)
1878                         break;
1879                 err = sock_intr_errno(timeo);
1880                 if (signal_pending(current))
1881                         break;
1882                 err = -EAGAIN;
1883                 if (!timeo)
1884                         break;
1885         }
1886         finish_wait(sk->sk_sleep, &wait);
1887         return err;
1888 }
1889
1890 /*
1891  *      This will accept the next outstanding connection.
1892  */
1893
1894 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1895 {
1896         struct tcp_sock *tp = tcp_sk(sk);
1897         struct request_sock *req;
1898         struct sock *newsk;
1899         int error;
1900
1901         lock_sock(sk);
1902
1903         /* We need to make sure that this socket is listening,
1904          * and that it has something pending.
1905          */
1906         error = -EINVAL;
1907         if (sk->sk_state != TCP_LISTEN)
1908                 goto out;
1909
1910         /* Find already established connection */
1911         if (!tp->accept_queue) {
1912                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1913
1914                 /* If this is a non blocking socket don't sleep */
1915                 error = -EAGAIN;
1916                 if (!timeo)
1917                         goto out;
1918
1919                 error = wait_for_connect(sk, timeo);
1920                 if (error)
1921                         goto out;
1922         }
1923
1924         req = tp->accept_queue;
1925         if ((tp->accept_queue = req->dl_next) == NULL)
1926                 tp->accept_queue_tail = NULL;
1927
1928         newsk = req->sk;
1929         sk_acceptq_removed(sk);
1930         __reqsk_free(req);
1931         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1932         release_sock(sk);
1933         return newsk;
1934
1935 out:
1936         release_sock(sk);
1937         *err = error;
1938         return NULL;
1939 }
1940
1941 /*
1942  *      Socket option code for TCP.
1943  */
1944 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1945                    int optlen)
1946 {
1947         struct tcp_sock *tp = tcp_sk(sk);
1948         int val;
1949         int err = 0;
1950
1951         if (level != SOL_TCP)
1952                 return tp->af_specific->setsockopt(sk, level, optname,
1953                                                    optval, optlen);
1954
1955         if (optlen < sizeof(int))
1956                 return -EINVAL;
1957
1958         if (get_user(val, (int __user *)optval))
1959                 return -EFAULT;
1960
1961         lock_sock(sk);
1962
1963         switch (optname) {
1964         case TCP_MAXSEG:
1965                 /* Values greater than interface MTU won't take effect. However
1966                  * at the point when this call is done we typically don't yet
1967                  * know which interface is going to be used */
1968                 if (val < 8 || val > MAX_TCP_WINDOW) {
1969                         err = -EINVAL;
1970                         break;
1971                 }
1972                 tp->rx_opt.user_mss = val;
1973                 break;
1974
1975         case TCP_NODELAY:
1976                 if (val) {
1977                         /* TCP_NODELAY is weaker than TCP_CORK, so that
1978                          * this option on corked socket is remembered, but
1979                          * it is not activated until cork is cleared.
1980                          *
1981                          * However, when TCP_NODELAY is set we make
1982                          * an explicit push, which overrides even TCP_CORK
1983                          * for currently queued segments.
1984                          */
1985                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1986                         tcp_push_pending_frames(sk, tp);
1987                 } else {
1988                         tp->nonagle &= ~TCP_NAGLE_OFF;
1989                 }
1990                 break;
1991
1992         case TCP_CORK:
1993                 /* When set indicates to always queue non-full frames.
1994                  * Later the user clears this option and we transmit
1995                  * any pending partial frames in the queue.  This is
1996                  * meant to be used alongside sendfile() to get properly
1997                  * filled frames when the user (for example) must write
1998                  * out headers with a write() call first and then use
1999                  * sendfile to send out the data parts.
2000                  *
2001                  * TCP_CORK can be set together with TCP_NODELAY and it is
2002                  * stronger than TCP_NODELAY.
2003                  */
2004                 if (val) {
2005                         tp->nonagle |= TCP_NAGLE_CORK;
2006                 } else {
2007                         tp->nonagle &= ~TCP_NAGLE_CORK;
2008                         if (tp->nonagle&TCP_NAGLE_OFF)
2009                                 tp->nonagle |= TCP_NAGLE_PUSH;
2010                         tcp_push_pending_frames(sk, tp);
2011                 }
2012                 break;
2013
2014         case TCP_KEEPIDLE:
2015                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2016                         err = -EINVAL;
2017                 else {
2018                         tp->keepalive_time = val * HZ;
2019                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2020                             !((1 << sk->sk_state) &
2021                               (TCPF_CLOSE | TCPF_LISTEN))) {
2022                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2023                                 if (tp->keepalive_time > elapsed)
2024                                         elapsed = tp->keepalive_time - elapsed;
2025                                 else
2026                                         elapsed = 0;
2027                                 tcp_reset_keepalive_timer(sk, elapsed);
2028                         }
2029                 }
2030                 break;
2031         case TCP_KEEPINTVL:
2032                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2033                         err = -EINVAL;
2034                 else
2035                         tp->keepalive_intvl = val * HZ;
2036                 break;
2037         case TCP_KEEPCNT:
2038                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2039                         err = -EINVAL;
2040                 else
2041                         tp->keepalive_probes = val;
2042                 break;
2043         case TCP_SYNCNT:
2044                 if (val < 1 || val > MAX_TCP_SYNCNT)
2045                         err = -EINVAL;
2046                 else
2047                         tp->syn_retries = val;
2048                 break;
2049
2050         case TCP_LINGER2:
2051                 if (val < 0)
2052                         tp->linger2 = -1;
2053                 else if (val > sysctl_tcp_fin_timeout / HZ)
2054                         tp->linger2 = 0;
2055                 else
2056                         tp->linger2 = val * HZ;
2057                 break;
2058
2059         case TCP_DEFER_ACCEPT:
2060                 tp->defer_accept = 0;
2061                 if (val > 0) {
2062                         /* Translate value in seconds to number of
2063                          * retransmits */
2064                         while (tp->defer_accept < 32 &&
2065                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2066                                        tp->defer_accept))
2067                                 tp->defer_accept++;
2068                         tp->defer_accept++;
2069                 }
2070                 break;
2071
2072         case TCP_WINDOW_CLAMP:
2073                 if (!val) {
2074                         if (sk->sk_state != TCP_CLOSE) {
2075                                 err = -EINVAL;
2076                                 break;
2077                         }
2078                         tp->window_clamp = 0;
2079                 } else
2080                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2081                                                 SOCK_MIN_RCVBUF / 2 : val;
2082                 break;
2083
2084         case TCP_QUICKACK:
2085                 if (!val) {
2086                         tp->ack.pingpong = 1;
2087                 } else {
2088                         tp->ack.pingpong = 0;
2089                         if ((1 << sk->sk_state) &
2090                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2091                             tcp_ack_scheduled(tp)) {
2092                                 tp->ack.pending |= TCP_ACK_PUSHED;
2093                                 cleanup_rbuf(sk, 1);
2094                                 if (!(val & 1))
2095                                         tp->ack.pingpong = 1;
2096                         }
2097                 }
2098                 break;
2099
2100         default:
2101                 err = -ENOPROTOOPT;
2102                 break;
2103         };
2104         release_sock(sk);
2105         return err;
2106 }
2107
2108 /* Return information about state of tcp endpoint in API format. */
2109 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2110 {
2111         struct tcp_sock *tp = tcp_sk(sk);
2112         u32 now = tcp_time_stamp;
2113
2114         memset(info, 0, sizeof(*info));
2115
2116         info->tcpi_state = sk->sk_state;
2117         info->tcpi_ca_state = tp->ca_state;
2118         info->tcpi_retransmits = tp->retransmits;
2119         info->tcpi_probes = tp->probes_out;
2120         info->tcpi_backoff = tp->backoff;
2121
2122         if (tp->rx_opt.tstamp_ok)
2123                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2124         if (tp->rx_opt.sack_ok)
2125                 info->tcpi_options |= TCPI_OPT_SACK;
2126         if (tp->rx_opt.wscale_ok) {
2127                 info->tcpi_options |= TCPI_OPT_WSCALE;
2128                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2129                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2130         }
2131
2132         if (tp->ecn_flags&TCP_ECN_OK)
2133                 info->tcpi_options |= TCPI_OPT_ECN;
2134
2135         info->tcpi_rto = jiffies_to_usecs(tp->rto);
2136         info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2137         info->tcpi_snd_mss = tp->mss_cache_std;
2138         info->tcpi_rcv_mss = tp->ack.rcv_mss;
2139
2140         info->tcpi_unacked = tp->packets_out;
2141         info->tcpi_sacked = tp->sacked_out;
2142         info->tcpi_lost = tp->lost_out;
2143         info->tcpi_retrans = tp->retrans_out;
2144         info->tcpi_fackets = tp->fackets_out;
2145
2146         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2147         info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2148         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2149
2150         info->tcpi_pmtu = tp->pmtu_cookie;
2151         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2152         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2153         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2154         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2155         info->tcpi_snd_cwnd = tp->snd_cwnd;
2156         info->tcpi_advmss = tp->advmss;
2157         info->tcpi_reordering = tp->reordering;
2158
2159         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2160         info->tcpi_rcv_space = tp->rcvq_space.space;
2161
2162         info->tcpi_total_retrans = tp->total_retrans;
2163 }
2164
2165 EXPORT_SYMBOL_GPL(tcp_get_info);
2166
2167 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2168                    int __user *optlen)
2169 {
2170         struct tcp_sock *tp = tcp_sk(sk);
2171         int val, len;
2172
2173         if (level != SOL_TCP)
2174                 return tp->af_specific->getsockopt(sk, level, optname,
2175                                                    optval, optlen);
2176
2177         if (get_user(len, optlen))
2178                 return -EFAULT;
2179
2180         len = min_t(unsigned int, len, sizeof(int));
2181
2182         if (len < 0)
2183                 return -EINVAL;
2184
2185         switch (optname) {
2186         case TCP_MAXSEG:
2187                 val = tp->mss_cache_std;
2188                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2189                         val = tp->rx_opt.user_mss;
2190                 break;
2191         case TCP_NODELAY:
2192                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2193                 break;
2194         case TCP_CORK:
2195                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2196                 break;
2197         case TCP_KEEPIDLE:
2198                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2199                 break;
2200         case TCP_KEEPINTVL:
2201                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2202                 break;
2203         case TCP_KEEPCNT:
2204                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2205                 break;
2206         case TCP_SYNCNT:
2207                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2208                 break;
2209         case TCP_LINGER2:
2210                 val = tp->linger2;
2211                 if (val >= 0)
2212                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2213                 break;
2214         case TCP_DEFER_ACCEPT:
2215                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2216                                                (tp->defer_accept - 1));
2217                 break;
2218         case TCP_WINDOW_CLAMP:
2219                 val = tp->window_clamp;
2220                 break;
2221         case TCP_INFO: {
2222                 struct tcp_info info;
2223
2224                 if (get_user(len, optlen))
2225                         return -EFAULT;
2226
2227                 tcp_get_info(sk, &info);
2228
2229                 len = min_t(unsigned int, len, sizeof(info));
2230                 if (put_user(len, optlen))
2231                         return -EFAULT;
2232                 if (copy_to_user(optval, &info, len))
2233                         return -EFAULT;
2234                 return 0;
2235         }
2236         case TCP_QUICKACK:
2237                 val = !tp->ack.pingpong;
2238                 break;
2239         default:
2240                 return -ENOPROTOOPT;
2241         };
2242
2243         if (put_user(len, optlen))
2244                 return -EFAULT;
2245         if (copy_to_user(optval, &val, len))
2246                 return -EFAULT;
2247         return 0;
2248 }
2249
2250
2251 extern void __skb_cb_too_small_for_tcp(int, int);
2252 extern void tcpdiag_init(void);
2253
2254 static __initdata unsigned long thash_entries;
2255 static int __init set_thash_entries(char *str)
2256 {
2257         if (!str)
2258                 return 0;
2259         thash_entries = simple_strtoul(str, &str, 0);
2260         return 1;
2261 }
2262 __setup("thash_entries=", set_thash_entries);
2263
2264 void __init tcp_init(void)
2265 {
2266         struct sk_buff *skb = NULL;
2267         int order, i;
2268
2269         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2270                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2271                                            sizeof(skb->cb));
2272
2273         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2274                                               sizeof(struct tcp_bind_bucket),
2275                                               0, SLAB_HWCACHE_ALIGN,
2276                                               NULL, NULL);
2277         if (!tcp_bucket_cachep)
2278                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2279
2280         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2281                                                 sizeof(struct tcp_tw_bucket),
2282                                                 0, SLAB_HWCACHE_ALIGN,
2283                                                 NULL, NULL);
2284         if (!tcp_timewait_cachep)
2285                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2286
2287         /* Size and allocate the main established and bind bucket
2288          * hash tables.
2289          *
2290          * The methodology is similar to that of the buffer cache.
2291          */
2292         tcp_ehash = (struct tcp_ehash_bucket *)
2293                 alloc_large_system_hash("TCP established",
2294                                         sizeof(struct tcp_ehash_bucket),
2295                                         thash_entries,
2296                                         (num_physpages >= 128 * 1024) ?
2297                                                 (25 - PAGE_SHIFT) :
2298                                                 (27 - PAGE_SHIFT),
2299                                         HASH_HIGHMEM,
2300                                         &tcp_ehash_size,
2301                                         NULL,
2302                                         0);
2303         tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2304         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2305                 rwlock_init(&tcp_ehash[i].lock);
2306                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2307         }
2308
2309         tcp_bhash = (struct tcp_bind_hashbucket *)
2310                 alloc_large_system_hash("TCP bind",
2311                                         sizeof(struct tcp_bind_hashbucket),
2312                                         tcp_ehash_size,
2313                                         (num_physpages >= 128 * 1024) ?
2314                                                 (25 - PAGE_SHIFT) :
2315                                                 (27 - PAGE_SHIFT),
2316                                         HASH_HIGHMEM,
2317                                         &tcp_bhash_size,
2318                                         NULL,
2319                                         64 * 1024);
2320         tcp_bhash_size = 1 << tcp_bhash_size;
2321         for (i = 0; i < tcp_bhash_size; i++) {
2322                 spin_lock_init(&tcp_bhash[i].lock);
2323                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2324         }
2325
2326         /* Try to be a bit smarter and adjust defaults depending
2327          * on available memory.
2328          */
2329         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2330                         (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2331                         order++)
2332                 ;
2333         if (order >= 4) {
2334                 sysctl_local_port_range[0] = 32768;
2335                 sysctl_local_port_range[1] = 61000;
2336                 sysctl_tcp_max_tw_buckets = 180000;
2337                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2338                 sysctl_max_syn_backlog = 1024;
2339         } else if (order < 3) {
2340                 sysctl_local_port_range[0] = 1024 * (3 - order);
2341                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2342                 sysctl_tcp_max_orphans >>= (3 - order);
2343                 sysctl_max_syn_backlog = 128;
2344         }
2345         tcp_port_rover = sysctl_local_port_range[0] - 1;
2346
2347         sysctl_tcp_mem[0] =  768 << order;
2348         sysctl_tcp_mem[1] = 1024 << order;
2349         sysctl_tcp_mem[2] = 1536 << order;
2350
2351         if (order < 3) {
2352                 sysctl_tcp_wmem[2] = 64 * 1024;
2353                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2354                 sysctl_tcp_rmem[1] = 43689;
2355                 sysctl_tcp_rmem[2] = 2 * 43689;
2356         }
2357
2358         printk(KERN_INFO "TCP: Hash tables configured "
2359                "(established %d bind %d)\n",
2360                tcp_ehash_size << 1, tcp_bhash_size);
2361 }
2362
2363 EXPORT_SYMBOL(tcp_accept);
2364 EXPORT_SYMBOL(tcp_close);
2365 EXPORT_SYMBOL(tcp_destroy_sock);
2366 EXPORT_SYMBOL(tcp_disconnect);
2367 EXPORT_SYMBOL(tcp_getsockopt);
2368 EXPORT_SYMBOL(tcp_ioctl);
2369 EXPORT_SYMBOL(tcp_poll);
2370 EXPORT_SYMBOL(tcp_read_sock);
2371 EXPORT_SYMBOL(tcp_recvmsg);
2372 EXPORT_SYMBOL(tcp_sendmsg);
2373 EXPORT_SYMBOL(tcp_sendpage);
2374 EXPORT_SYMBOL(tcp_setsockopt);
2375 EXPORT_SYMBOL(tcp_shutdown);
2376 EXPORT_SYMBOL(tcp_statistics);
2377 EXPORT_SYMBOL(tcp_timewait_cachep);