2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
52 * to a single port at the same time.
55 #include <linux/config.h>
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
67 #include <net/inet_hashtables.h>
70 #include <net/inet_common.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
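/* RFC 792 requires ICMP errors to echo at least the first 8 bytes of the
 * offending datagram's payload; for TCP that covers the source port,
 * destination port and sequence number - everything tcp_v4_err() needs to
 * locate the socket and sanity-check the reported sequence number.
 */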
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED,
94 .lhash_users = ATOMIC_INIT(0),
95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
100 * This array holds the first and last local port number.
101 * For high-usage systems, use sysctl to change this to
104 int sysctl_local_port_range[2] = { 1024, 4999 };
105 int tcp_port_rover = 1024 - 1;
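/* The rover remembers roughly where the last ephemeral port was handed out,
 * so tcp_v4_get_port() can resume its search there instead of rescanning the
 * whole range from the bottom on every automatic bind.
 */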
107 /* Caller must disable local BH processing. */
108 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
110 struct inet_bind_hashbucket *head =
111 &tcp_bhash[inet_bhashfn(inet_sk(child)->num,
113 struct inet_bind_bucket *tb;
115 spin_lock(&head->lock);
116 tb = tcp_sk(sk)->bind_hash;
117 sk_add_bind_node(child, &tb->owners);
118 tcp_sk(child)->bind_hash = tb;
119 spin_unlock(&head->lock);
122 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
125 __tcp_inherit_port(sk, child);
129 void tcp_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
130 const unsigned short snum)
132 inet_sk(sk)->num = snum;
133 sk_add_bind_node(sk, &tb->owners);
134 tcp_sk(sk)->bind_hash = tb;
137 static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
139 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
141 struct hlist_node *node;
142 int reuse = sk->sk_reuse;
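/* sk2 conflicts with sk only if sk2 is not IPv6-only, the two could share a
 * device (either side unbound, or both bound to the same ifindex), the
 * SO_REUSEADDR escape does not apply (one side lacks it, or sk2 is
 * listening), and the receive addresses overlap (either one is a wildcard,
 * or they are equal).
 */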
144 sk_for_each_bound(sk2, node, &tb->owners) {
146 !tcp_v6_ipv6only(sk2) &&
147 (!sk->sk_bound_dev_if ||
148 !sk2->sk_bound_dev_if ||
149 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
150 if (!reuse || !sk2->sk_reuse ||
151 sk2->sk_state == TCP_LISTEN) {
152 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
153 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
154 sk2_rcv_saddr == sk_rcv_saddr)
162 /* Obtain a reference to a local port for the given sock;
163 * if snum is zero, select any available local port.
165 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
167 struct inet_bind_hashbucket *head;
168 struct hlist_node *node;
169 struct inet_bind_bucket *tb;
174 int low = sysctl_local_port_range[0];
175 int high = sysctl_local_port_range[1];
176 int remaining = (high - low) + 1;
179 spin_lock(&tcp_portalloc_lock);
180 if (tcp_port_rover < low)
183 rover = tcp_port_rover;
188 head = &tcp_bhash[inet_bhashfn(rover, tcp_bhash_size)];
189 spin_lock(&head->lock);
190 inet_bind_bucket_for_each(tb, node, &head->chain)
191 if (tb->port == rover)
195 spin_unlock(&head->lock);
196 } while (--remaining > 0);
197 tcp_port_rover = rover;
198 spin_unlock(&tcp_portalloc_lock);
200 /* Exhausted local port range during search? It is not
201 * possible for us to be holding one of the bind hash
202 * locks if this test triggers, because if 'remaining'
203 * drops to zero, we broke out of the do/while loop at
204 * the top level, not from the 'break;' statement.
207 if (unlikely(remaining <= 0))
210 /* OK, here is the one we will use. HEAD is
211 * non-NULL and we hold its lock.
215 head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)];
216 spin_lock(&head->lock);
217 inet_bind_bucket_for_each(tb, node, &head->chain)
218 if (tb->port == snum)
224 if (!hlist_empty(&tb->owners)) {
225 if (sk->sk_reuse > 1)
227 if (tb->fastreuse > 0 &&
228 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
232 if (tcp_bind_conflict(sk, tb))
238 if (!tb && (tb = inet_bind_bucket_create(tcp_bucket_cachep, head, snum)) == NULL)
240 if (hlist_empty(&tb->owners)) {
241 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
245 } else if (tb->fastreuse &&
246 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
249 if (!tcp_sk(sk)->bind_hash)
250 tcp_bind_hash(sk, tb, snum);
251 BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
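/* tb->fastreuse caches the fact that every current owner of this bucket set
 * SO_REUSEADDR and is not listening; while it stays positive, another
 * SO_REUSEADDR binder can be admitted above without walking the owner list
 * in tcp_bind_conflict().
 */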
255 spin_unlock(&head->lock);
261 /* Get rid of any references to a local port held by the
264 static void __tcp_put_port(struct sock *sk)
266 struct inet_sock *inet = inet_sk(sk);
267 struct inet_bind_hashbucket *head = &tcp_bhash[inet_bhashfn(inet->num,
269 struct inet_bind_bucket *tb;
271 spin_lock(&head->lock);
272 tb = tcp_sk(sk)->bind_hash;
273 __sk_del_bind_node(sk);
274 tcp_sk(sk)->bind_hash = NULL;
276 inet_bind_bucket_destroy(tcp_bucket_cachep, tb);
277 spin_unlock(&head->lock);
280 void tcp_put_port(struct sock *sk)
287 /* This lock without WQ_FLAG_EXCLUSIVE is fine on UP, but it can be very bad on SMP.
288 * Look, when several writers sleep and a reader wakes them up, all but one
289 * immediately hit the write lock and grab all the CPUs. Exclusive sleep solves
290 * this, _but_ remember, it adds useless work on UP machines (a wake-up on each
291 * exclusive lock release). It really should be ifdefed.
294 void tcp_listen_wlock(void)
296 write_lock(&tcp_lhash_lock);
298 if (atomic_read(&tcp_lhash_users)) {
302 prepare_to_wait_exclusive(&tcp_lhash_wait,
303 &wait, TASK_UNINTERRUPTIBLE);
304 if (!atomic_read(&tcp_lhash_users))
306 write_unlock_bh(&tcp_lhash_lock);
308 write_lock_bh(&tcp_lhash_lock);
311 finish_wait(&tcp_lhash_wait, &wait);
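/* tcp_listen_wlock() thus returns with the listening-hash write lock held,
 * but only after every lockless reader accounted in tcp_lhash_users has
 * drained; the exclusive wait keeps a crowd of writers from stampeding on
 * each wake-up, as the comment above explains.
 */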
315 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
317 struct hlist_head *list;
320 BUG_TRAP(sk_unhashed(sk));
321 if (listen_possible && sk->sk_state == TCP_LISTEN) {
322 list = &tcp_listening_hash[inet_sk_listen_hashfn(sk)];
323 lock = &tcp_lhash_lock;
326 sk->sk_hashent = inet_sk_ehashfn(sk, tcp_ehash_size);
327 list = &tcp_ehash[sk->sk_hashent].chain;
328 lock = &tcp_ehash[sk->sk_hashent].lock;
331 __sk_add_node(sk, list);
332 sock_prot_inc_use(sk->sk_prot);
334 if (listen_possible && sk->sk_state == TCP_LISTEN)
335 wake_up(&tcp_lhash_wait);
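/* Listening sockets are hashed only by local port into tcp_listening_hash
 * and are protected by the global tcp_lhash_lock; established sockets are
 * hashed by the full four-tuple into tcp_ehash, which uses one rwlock per
 * bucket instead.
 */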
338 static void tcp_v4_hash(struct sock *sk)
340 if (sk->sk_state != TCP_CLOSE) {
342 __tcp_v4_hash(sk, 1);
347 void tcp_unhash(struct sock *sk)
354 if (sk->sk_state == TCP_LISTEN) {
357 lock = &tcp_lhash_lock;
359 struct inet_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
361 write_lock_bh(&head->lock);
364 if (__sk_del_node_init(sk))
365 sock_prot_dec_use(sk->sk_prot);
366 write_unlock_bh(lock);
369 if (sk->sk_state == TCP_LISTEN)
370 wake_up(&tcp_lhash_wait);
373 /* Don't inline this cruft. There are some nice properties to
374 * exploit here. The BSD API does not allow a listening TCP
375 * to specify the remote port nor the remote address for the
376 * connection. So always assume those are both wildcarded
377 * during the search since they can never be otherwise.
379 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head,
381 const unsigned short hnum,
384 struct sock *result = NULL, *sk;
385 struct hlist_node *node;
389 sk_for_each(sk, node, head) {
390 struct inet_sock *inet = inet_sk(sk);
392 if (inet->num == hnum && !ipv6_only_sock(sk)) {
393 __u32 rcv_saddr = inet->rcv_saddr;
395 score = (sk->sk_family == PF_INET ? 1 : 0);
397 if (rcv_saddr != daddr)
401 if (sk->sk_bound_dev_if) {
402 if (sk->sk_bound_dev_if != dif)
408 if (score > hiscore) {
417 /* Optimize the common listener case. */
418 static inline struct sock *tcp_v4_lookup_listener(const u32 daddr,
419 const unsigned short hnum,
422 struct sock *sk = NULL;
423 struct hlist_head *head;
425 read_lock(&tcp_lhash_lock);
426 head = &tcp_listening_hash[inet_lhashfn(hnum)];
427 if (!hlist_empty(head)) {
428 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
430 if (inet->num == hnum && !sk->sk_node.next &&
431 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
432 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
433 !sk->sk_bound_dev_if)
435 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
441 read_unlock(&tcp_lhash_lock);
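/* Fast path above: if the chain contains exactly one listener and it matches
 * on port, its bound address is a wildcard or equals daddr, it is usable for
 * IPv4, and it is not bound to a device, it is returned directly; anything
 * more ambiguous falls back to the scoring walk in
 * __tcp_v4_lookup_listener().
 */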
445 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
446 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
448 * Local BH must be disabled here.
451 static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
457 struct inet_ehash_bucket *head;
458 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
459 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
461 struct hlist_node *node;
462 /* Optimize here for a direct hit; only listening connections can
463 * have wildcards anyway.
465 const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_ehash_size);
466 head = &tcp_ehash[hash];
467 read_lock(&head->lock);
468 sk_for_each(sk, node, &head->chain) {
469 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
470 goto hit; /* You sunk my battleship! */
473 /* Must check for a TIME_WAIT'er before going to listener hash. */
474 sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
475 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
480 read_unlock(&head->lock);
487 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
488 u32 daddr, u16 hnum, int dif)
490 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
493 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
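/* The GNU "?:" extension above returns the established (or TIME-WAIT) match
 * when one exists and only then falls back to the listener hash, so fully
 * specified connections always take precedence over listeners.
 */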
496 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
502 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
508 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
510 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
512 return secure_tcp_sequence_number(skb->nh.iph->daddr,
518 /* called with local bh disabled */
519 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
520 struct tcp_tw_bucket **twp)
522 struct inet_sock *inet = inet_sk(sk);
523 u32 daddr = inet->rcv_saddr;
524 u32 saddr = inet->daddr;
525 int dif = sk->sk_bound_dev_if;
526 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
527 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
528 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_ehash_size);
529 struct inet_ehash_bucket *head = &tcp_ehash[hash];
531 struct hlist_node *node;
532 struct tcp_tw_bucket *tw;
534 write_lock(&head->lock);
536 /* Check TIME-WAIT sockets first. */
537 sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
538 tw = (struct tcp_tw_bucket *)sk2;
540 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
541 struct tcp_sock *tp = tcp_sk(sk);
543 /* With PAWS, it is safe from the viewpoint
544 of data integrity. Even without PAWS it
545 is safe provided sequence spaces do not
546 overlap, i.e. at data rates <= 80 Mbit/sec.
548 Actually, the idea is close to VJ's, only the
549 timestamp cache is held not per host,
550 but per port pair, and the TW bucket is used
553 If the TW bucket has already been destroyed we
554 fall back to VJ's scheme and use the initial
555 timestamp retrieved from the peer table.
557 if (tw->tw_ts_recent_stamp &&
558 (!twp || (sysctl_tcp_tw_reuse &&
560 tw->tw_ts_recent_stamp > 1))) {
562 tw->tw_snd_nxt + 65535 + 2) == 0)
564 tp->rx_opt.ts_recent = tw->tw_ts_recent;
565 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
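/* The new write_seq is set 65535 + 2 beyond the old tw_snd_nxt so that, as
 * the comment above notes, the sequence spaces of the old and new incarnation
 * cannot overlap within one maximum (unscaled) window; inheriting ts_recent
 * keeps PAWS effective across the reuse.
 */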
574 /* And established part... */
575 sk_for_each(sk2, node, &head->chain) {
576 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
581 /* Must record num and sport now. Otherwise we will see
582 * a socket with a funny identity in the hash table. */
584 inet->sport = htons(lport);
585 sk->sk_hashent = hash;
586 BUG_TRAP(sk_unhashed(sk));
587 __sk_add_node(sk, &head->chain);
588 sock_prot_inc_use(sk->sk_prot);
589 write_unlock(&head->lock);
593 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
595 /* Silly. Should hash-dance instead... */
596 tcp_tw_deschedule(tw);
597 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
605 write_unlock(&head->lock);
606 return -EADDRNOTAVAIL;
609 static inline u32 connect_port_offset(const struct sock *sk)
611 const struct inet_sock *inet = inet_sk(sk);
613 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
618 * Bind a port for a connect operation and hash it.
620 static inline int tcp_v4_hash_connect(struct sock *sk)
622 const unsigned short snum = inet_sk(sk)->num;
623 struct inet_bind_hashbucket *head;
624 struct inet_bind_bucket *tb;
628 int low = sysctl_local_port_range[0];
629 int high = sysctl_local_port_range[1];
630 int range = high - low;
634 u32 offset = hint + connect_port_offset(sk);
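/* The search offset mixes the rolling hint with a secret, per-destination
 * hash (secure_tcp_port_ephemeral), so the order in which ephemeral ports
 * are probed differs per peer and is presumably hard for an off-path
 * observer to predict.
 */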
635 struct hlist_node *node;
636 struct tcp_tw_bucket *tw = NULL;
639 for (i = 1; i <= range; i++) {
640 port = low + (i + offset) % range;
641 head = &tcp_bhash[inet_bhashfn(port, tcp_bhash_size)];
642 spin_lock(&head->lock);
644 /* Does not bother with rcv_saddr checks,
645 * because the established check is already
648 inet_bind_bucket_for_each(tb, node, &head->chain) {
649 if (tb->port == port) {
650 BUG_TRAP(!hlist_empty(&tb->owners));
651 if (tb->fastreuse >= 0)
653 if (!__tcp_v4_check_established(sk,
661 tb = inet_bind_bucket_create(tcp_bucket_cachep, head, port);
663 spin_unlock(&head->lock);
670 spin_unlock(&head->lock);
674 return -EADDRNOTAVAIL;
679 /* Head lock still held and bh's disabled */
680 tcp_bind_hash(sk, tb, port);
681 if (sk_unhashed(sk)) {
682 inet_sk(sk)->sport = htons(port);
683 __tcp_v4_hash(sk, 0);
685 spin_unlock(&head->lock);
688 tcp_tw_deschedule(tw);
696 head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)];
697 tb = tcp_sk(sk)->bind_hash;
698 spin_lock_bh(&head->lock);
699 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
700 __tcp_v4_hash(sk, 0);
701 spin_unlock_bh(&head->lock);
704 spin_unlock(&head->lock);
705 /* No definite answer... Walk the established hash table */
706 ret = __tcp_v4_check_established(sk, snum, NULL);
713 /* This will initiate an outgoing connection. */
714 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
716 struct inet_sock *inet = inet_sk(sk);
717 struct tcp_sock *tp = tcp_sk(sk);
718 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
724 if (addr_len < sizeof(struct sockaddr_in))
727 if (usin->sin_family != AF_INET)
728 return -EAFNOSUPPORT;
730 nexthop = daddr = usin->sin_addr.s_addr;
731 if (inet->opt && inet->opt->srr) {
734 nexthop = inet->opt->faddr;
737 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
738 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
740 inet->sport, usin->sin_port, sk);
744 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
749 if (!inet->opt || !inet->opt->srr)
753 inet->saddr = rt->rt_src;
754 inet->rcv_saddr = inet->saddr;
756 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
757 /* Reset inherited state */
758 tp->rx_opt.ts_recent = 0;
759 tp->rx_opt.ts_recent_stamp = 0;
763 if (sysctl_tcp_tw_recycle &&
764 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
765 struct inet_peer *peer = rt_get_peer(rt);
767 /* VJ's idea. We save the last timestamp seen from
768 * the destination in the peer table when entering TIME-WAIT state,
769 * and initialize rx_opt.ts_recent from it when trying a new connection.
772 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
773 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
774 tp->rx_opt.ts_recent = peer->tcp_ts;
778 inet->dport = usin->sin_port;
781 tp->ext_header_len = 0;
783 tp->ext_header_len = inet->opt->optlen;
785 tp->rx_opt.mss_clamp = 536;
787 /* Socket identity is still unknown (sport may be zero).
788 * However we set the state to SYN-SENT and, without releasing the socket
789 * lock, select a source port, enter ourselves into the hash tables and
790 * complete initialization after this.
792 tcp_set_state(sk, TCP_SYN_SENT);
793 err = tcp_v4_hash_connect(sk);
797 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
801 /* OK, now commit destination to socket. */
802 sk_setup_caps(sk, &rt->u.dst);
805 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
810 inet->id = tp->write_seq ^ jiffies;
812 err = tcp_connect(sk);
820 /* This unhashes the socket and releases the local port, if necessary. */
821 tcp_set_state(sk, TCP_CLOSE);
823 sk->sk_route_caps = 0;
828 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
830 return ((struct rtable *)skb->dst)->rt_iif;
833 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
835 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
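/* The SYN queue hash keys on the remote address and port, salted with the
 * per-listener random value lopt->hash_rnd, so a remote host cannot easily
 * steer all of its embryonic requests into a single chain.
 */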
838 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
839 struct request_sock ***prevp,
841 __u32 raddr, __u32 laddr)
843 struct listen_sock *lopt = tp->accept_queue.listen_opt;
844 struct request_sock *req, **prev;
846 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
847 (req = *prev) != NULL;
848 prev = &req->dl_next) {
849 const struct inet_request_sock *ireq = inet_rsk(req);
851 if (ireq->rmt_port == rport &&
852 ireq->rmt_addr == raddr &&
853 ireq->loc_addr == laddr &&
854 TCP_INET_FAMILY(req->rsk_ops->family)) {
864 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
866 struct tcp_sock *tp = tcp_sk(sk);
867 struct listen_sock *lopt = tp->accept_queue.listen_opt;
868 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
870 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
876 * This routine does path mtu discovery as defined in RFC1191.
878 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
881 struct dst_entry *dst;
882 struct inet_sock *inet = inet_sk(sk);
883 struct tcp_sock *tp = tcp_sk(sk);
885 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
886 * sent out by Linux are always < 576 bytes so they should go through
889 if (sk->sk_state == TCP_LISTEN)
892 /* We don't check in the dst entry if pmtu discovery is forbidden
893 * on this route. We just assume that no packet-too-big packets
894 * are sent back when pmtu discovery is not active.
895 * There is a small race when the user changes this flag in the
896 * route, but I think that's acceptable.
898 if ((dst = __sk_dst_check(sk, 0)) == NULL)
901 dst->ops->update_pmtu(dst, mtu);
903 /* Something is about to go wrong... Remember the soft error
904 * in case this connection is not able to recover.
906 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
907 sk->sk_err_soft = EMSGSIZE;
911 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
912 tp->pmtu_cookie > mtu) {
913 tcp_sync_mss(sk, mtu);
915 /* Resend the TCP packet because it's
916 * clear that the old packet has been
917 * dropped. This is the new "fast" path mtu
920 tcp_simple_retransmit(sk);
921 } /* else let the usual retransmit timer handle it */
925 * This routine is called by the ICMP module when it gets some
926 * sort of error condition. If err < 0 then the socket should
927 * be closed and the error returned to the user. If err > 0
928 * it's just the icmp type << 8 | icmp code. After adjustment
929 * header points to the first 8 bytes of the tcp header. We need
930 * to find the appropriate port.
932 * The locking strategy used here is very "optimistic". When
933 * someone else accesses the socket the ICMP is just dropped
934 * and for some paths there is no check at all.
935 * A more general error queue to queue errors for later handling
936 * is probably better.
940 void tcp_v4_err(struct sk_buff *skb, u32 info)
942 struct iphdr *iph = (struct iphdr *)skb->data;
943 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
945 struct inet_sock *inet;
946 int type = skb->h.icmph->type;
947 int code = skb->h.icmph->code;
952 if (skb->len < (iph->ihl << 2) + 8) {
953 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
957 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
958 th->source, tcp_v4_iif(skb));
960 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
963 if (sk->sk_state == TCP_TIME_WAIT) {
964 tcp_tw_put((struct tcp_tw_bucket *)sk);
969 /* If too many ICMPs get dropped on busy
970 * servers this needs to be solved differently.
972 if (sock_owned_by_user(sk))
973 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
975 if (sk->sk_state == TCP_CLOSE)
979 seq = ntohl(th->seq);
980 if (sk->sk_state != TCP_LISTEN &&
981 !between(seq, tp->snd_una, tp->snd_nxt)) {
982 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
987 case ICMP_SOURCE_QUENCH:
988 /* Just silently ignore these. */
990 case ICMP_PARAMETERPROB:
993 case ICMP_DEST_UNREACH:
994 if (code > NR_ICMP_UNREACH)
997 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
998 if (!sock_owned_by_user(sk))
999 do_pmtu_discovery(sk, iph, info);
1003 err = icmp_err_convert[code].errno;
1005 case ICMP_TIME_EXCEEDED:
1012 switch (sk->sk_state) {
1013 struct request_sock *req, **prev;
1015 if (sock_owned_by_user(sk))
1018 req = tcp_v4_search_req(tp, &prev, th->dest,
1019 iph->daddr, iph->saddr);
1023 /* ICMPs are not backlogged, hence we cannot get
1024 an established socket here.
1028 if (seq != tcp_rsk(req)->snt_isn) {
1029 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1034 * Still in SYN_RECV, just remove it silently.
1035 * There is no good way to pass the error to the newly
1036 * created socket, and POSIX does not want network
1037 * errors returned from accept().
1039 tcp_synq_drop(sk, req, prev);
1043 case TCP_SYN_RECV: /* Cannot happen.
1044 It can, e.g., if SYNs crossed.
1046 if (!sock_owned_by_user(sk)) {
1047 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1050 sk->sk_error_report(sk);
1054 sk->sk_err_soft = err;
1059 /* If we've already connected we will keep trying
1060 * until we time out, or the user gives up.
1062 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
1063 * to be considered hard errors (well, FRAG_FAILED too,
1064 * but it is obsoleted by pmtu discovery).
1066 * Note that in the modern internet, where routing is unreliable
1067 * and broken firewalls sit in every dark corner sending random
1068 * errors ordered by their masters, even these two messages have lost
1069 * their original sense (even Linux sends invalid PORT_UNREACHs).
1071 * Now we are in compliance with RFCs.
1076 if (!sock_owned_by_user(sk) && inet->recverr) {
1078 sk->sk_error_report(sk);
1079 } else { /* Only an error on timeout */
1080 sk->sk_err_soft = err;
1088 /* This routine computes an IPv4 TCP checksum. */
1089 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1090 struct sk_buff *skb)
1092 struct inet_sock *inet = inet_sk(sk);
1094 if (skb->ip_summed == CHECKSUM_HW) {
1095 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1096 skb->csum = offsetof(struct tcphdr, check);
1098 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1099 csum_partial((char *)th,
1106 * This routine will send an RST to the other tcp.
1108 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
1110 * Answer: if a packet caused an RST, it is not for a socket
1111 * existing in our system; if it matched a socket,
1112 * it is just a duplicate segment or a bug in the other side's TCP.
1113 * So we build the reply based only on the parameters
1114 * that arrived with the segment.
1115 * Exception: precedence violation. We do not implement it in any case.
1118 static void tcp_v4_send_reset(struct sk_buff *skb)
1120 struct tcphdr *th = skb->h.th;
1122 struct ip_reply_arg arg;
1124 /* Never send a reset in response to a reset. */
1128 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1131 /* Swap the send and the receive. */
1132 memset(&rth, 0, sizeof(struct tcphdr));
1133 rth.dest = th->source;
1134 rth.source = th->dest;
1135 rth.doff = sizeof(struct tcphdr) / 4;
1139 rth.seq = th->ack_seq;
1142 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1143 skb->len - (th->doff << 2));
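/* Per RFC 793: if the offending segment carried an ACK, the RST's SEQ is
 * taken from that ACK value (the branch above); otherwise we send RST|ACK
 * and acknowledge SEQ plus the segment length, counting SYN and FIN as one
 * sequence unit each, which is what this computation does.
 */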
1146 memset(&arg, 0, sizeof arg);
1147 arg.iov[0].iov_base = (unsigned char *)&rth;
1148 arg.iov[0].iov_len = sizeof rth;
1149 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1150 skb->nh.iph->saddr, /*XXX*/
1151 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1152 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1154 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1156 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1157 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1160 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1161 outside socket context, is certainly ugly. What can I do?
1164 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1167 struct tcphdr *th = skb->h.th;
1172 struct ip_reply_arg arg;
1174 memset(&rep.th, 0, sizeof(struct tcphdr));
1175 memset(&arg, 0, sizeof arg);
1177 arg.iov[0].iov_base = (unsigned char *)&rep;
1178 arg.iov[0].iov_len = sizeof(rep.th);
1180 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1181 (TCPOPT_TIMESTAMP << 8) |
1183 rep.tsopt[1] = htonl(tcp_time_stamp);
1184 rep.tsopt[2] = htonl(ts);
1185 arg.iov[0].iov_len = sizeof(rep);
1188 /* Swap the send and the receive. */
1189 rep.th.dest = th->source;
1190 rep.th.source = th->dest;
1191 rep.th.doff = arg.iov[0].iov_len / 4;
1192 rep.th.seq = htonl(seq);
1193 rep.th.ack_seq = htonl(ack);
1195 rep.th.window = htons(win);
1197 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1198 skb->nh.iph->saddr, /*XXX*/
1199 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1200 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1202 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1204 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1207 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1209 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1211 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1212 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1217 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1219 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1223 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1224 struct request_sock *req)
1227 const struct inet_request_sock *ireq = inet_rsk(req);
1228 struct ip_options *opt = inet_rsk(req)->opt;
1229 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1231 { .daddr = ((opt && opt->srr) ?
1234 .saddr = ireq->loc_addr,
1235 .tos = RT_CONN_FLAGS(sk) } },
1236 .proto = IPPROTO_TCP,
1238 { .sport = inet_sk(sk)->sport,
1239 .dport = ireq->rmt_port } } };
1241 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1242 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1245 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1247 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1254 * Send a SYN-ACK after having received a SYN.
1255 * This still operates on a request_sock only, not on a big
1258 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1259 struct dst_entry *dst)
1261 const struct inet_request_sock *ireq = inet_rsk(req);
1263 struct sk_buff * skb;
1265 /* First, grab a route. */
1266 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1269 skb = tcp_make_synack(sk, dst, req);
1272 struct tcphdr *th = skb->h.th;
1274 th->check = tcp_v4_check(th, skb->len,
1277 csum_partial((char *)th, skb->len,
1280 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1283 if (err == NET_XMIT_CN)
1293 * IPv4 request_sock destructor.
1295 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1297 if (inet_rsk(req)->opt)
1298 kfree(inet_rsk(req)->opt);
1301 static inline void syn_flood_warning(struct sk_buff *skb)
1303 static unsigned long warntime;
1305 if (time_after(jiffies, (warntime + HZ * 60))) {
1308 "possible SYN flooding on port %d. Sending cookies.\n",
1309 ntohs(skb->h.th->dest));
1314 * Save and compile IPv4 options into the request_sock if needed.
1316 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1317 struct sk_buff *skb)
1319 struct ip_options *opt = &(IPCB(skb)->opt);
1320 struct ip_options *dopt = NULL;
1322 if (opt && opt->optlen) {
1323 int opt_size = optlength(opt);
1324 dopt = kmalloc(opt_size, GFP_ATOMIC);
1326 if (ip_options_echo(dopt, skb)) {
1335 struct request_sock_ops tcp_request_sock_ops = {
1337 .obj_size = sizeof(struct tcp_request_sock),
1338 .rtx_syn_ack = tcp_v4_send_synack,
1339 .send_ack = tcp_v4_reqsk_send_ack,
1340 .destructor = tcp_v4_reqsk_destructor,
1341 .send_reset = tcp_v4_send_reset,
1344 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1346 struct inet_request_sock *ireq;
1347 struct tcp_options_received tmp_opt;
1348 struct request_sock *req;
1349 __u32 saddr = skb->nh.iph->saddr;
1350 __u32 daddr = skb->nh.iph->daddr;
1351 __u32 isn = TCP_SKB_CB(skb)->when;
1352 struct dst_entry *dst = NULL;
1353 #ifdef CONFIG_SYN_COOKIES
1354 int want_cookie = 0;
1356 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1359 /* Never answer SYNs sent to broadcast or multicast */
1360 if (((struct rtable *)skb->dst)->rt_flags &
1361 (RTCF_BROADCAST | RTCF_MULTICAST))
1364 /* TW buckets are converted to open requests without
1365 * limitation; they conserve resources and the peer is
1366 * evidently a real one.
1368 if (tcp_synq_is_full(sk) && !isn) {
1369 #ifdef CONFIG_SYN_COOKIES
1370 if (sysctl_tcp_syncookies) {
1377 /* The accept backlog is full. If we have already queued enough
1378 * warm entries in the syn queue, drop the request. It is better than
1379 * clogging the syn queue with openreqs with exponentially increasing
1382 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1385 req = reqsk_alloc(&tcp_request_sock_ops);
1389 tcp_clear_options(&tmp_opt);
1390 tmp_opt.mss_clamp = 536;
1391 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1393 tcp_parse_options(skb, &tmp_opt, 0);
1396 tcp_clear_options(&tmp_opt);
1397 tmp_opt.saw_tstamp = 0;
1400 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1401 /* Some OSes (unknown ones, but I see them on a web server that
1402 * contains information interesting only for Windows
1403 * users) do not send their timestamp in the SYN. It is the easy case.
1404 * We simply do not advertise TS support.
1406 tmp_opt.saw_tstamp = 0;
1407 tmp_opt.tstamp_ok = 0;
1409 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1411 tcp_openreq_init(req, &tmp_opt, skb);
1413 ireq = inet_rsk(req);
1414 ireq->loc_addr = daddr;
1415 ireq->rmt_addr = saddr;
1416 ireq->opt = tcp_v4_save_options(sk, skb);
1418 TCP_ECN_create_request(req, skb->h.th);
1421 #ifdef CONFIG_SYN_COOKIES
1422 syn_flood_warning(skb);
1424 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1426 struct inet_peer *peer = NULL;
1428 /* VJ's idea. We save the last timestamp seen
1429 * from the destination in the peer table when entering
1430 * TIME-WAIT state, and check against it before
1431 * accepting a new connection request.
1433 * If "isn" is not zero, this request hit an alive
1434 * timewait bucket, so all the necessary checks
1435 * were made in the function that processed the timewait state.
1437 if (tmp_opt.saw_tstamp &&
1438 sysctl_tcp_tw_recycle &&
1439 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1440 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1441 peer->v4daddr == saddr) {
1442 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1443 (s32)(peer->tcp_ts - req->ts_recent) >
1445 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1450 /* Kill the following clause if you dislike this approach. */
1451 else if (!sysctl_tcp_syncookies &&
1452 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1453 (sysctl_max_syn_backlog >> 2)) &&
1454 (!peer || !peer->tcp_ts_stamp) &&
1455 (!dst || !dst_metric(dst, RTAX_RTT))) {
1456 /* Without syncookies the last quarter of the
1457 * backlog is reserved for destinations
1458 * proven to be alive.
1459 * It means that we continue to communicate
1460 * with destinations already remembered
1461 * by the moment of the synflood.
1463 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1464 "request from %u.%u."
1467 ntohs(skb->h.th->source)));
1472 isn = tcp_v4_init_sequence(sk, skb);
1474 tcp_rsk(req)->snt_isn = isn;
1476 if (tcp_v4_send_synack(sk, req, dst))
1482 tcp_v4_synq_add(sk, req);
1489 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1495 * The three-way handshake has completed - we got a valid ACK -
1496 * now create the new socket.
1498 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1499 struct request_sock *req,
1500 struct dst_entry *dst)
1502 struct inet_request_sock *ireq;
1503 struct inet_sock *newinet;
1504 struct tcp_sock *newtp;
1507 if (sk_acceptq_is_full(sk))
1510 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1513 newsk = tcp_create_openreq_child(sk, req, skb);
1517 sk_setup_caps(newsk, dst);
1519 newtp = tcp_sk(newsk);
1520 newinet = inet_sk(newsk);
1521 ireq = inet_rsk(req);
1522 newinet->daddr = ireq->rmt_addr;
1523 newinet->rcv_saddr = ireq->loc_addr;
1524 newinet->saddr = ireq->loc_addr;
1525 newinet->opt = ireq->opt;
1527 newinet->mc_index = tcp_v4_iif(skb);
1528 newinet->mc_ttl = skb->nh.iph->ttl;
1529 newtp->ext_header_len = 0;
1531 newtp->ext_header_len = newinet->opt->optlen;
1532 newinet->id = newtp->write_seq ^ jiffies;
1534 tcp_sync_mss(newsk, dst_mtu(dst));
1535 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1536 tcp_initialize_rcv_mss(newsk);
1538 __tcp_v4_hash(newsk, 0);
1539 __tcp_inherit_port(sk, newsk);
1544 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1546 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1551 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1553 struct tcphdr *th = skb->h.th;
1554 struct iphdr *iph = skb->nh.iph;
1555 struct tcp_sock *tp = tcp_sk(sk);
1557 struct request_sock **prev;
1558 /* Find possible connection requests. */
1559 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1560 iph->saddr, iph->daddr);
1562 return tcp_check_req(sk, skb, req, prev);
1564 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1571 if (nsk->sk_state != TCP_TIME_WAIT) {
1575 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1579 #ifdef CONFIG_SYN_COOKIES
1580 if (!th->rst && !th->syn && th->ack)
1581 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1586 static int tcp_v4_checksum_init(struct sk_buff *skb)
1588 if (skb->ip_summed == CHECKSUM_HW) {
1589 skb->ip_summed = CHECKSUM_UNNECESSARY;
1590 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1591 skb->nh.iph->daddr, skb->csum))
1594 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1595 skb->ip_summed = CHECKSUM_NONE;
1597 if (skb->len <= 76) {
1598 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1600 skb_checksum(skb, 0, skb->len, 0)))
1602 skb->ip_summed = CHECKSUM_UNNECESSARY;
1604 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1606 skb->nh.iph->daddr, 0);
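/* For short segments (<= 76 bytes) it is presumably cheaper to verify the
 * checksum immediately; for longer ones only the (complemented) pseudo-header
 * checksum is stored in skb->csum here, and full verification is deferred
 * until the data is checksummed anyway (e.g. while copying to user space).
 */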
1612 /* The socket must have its spinlock held when we get
1615 * We have a potential double-lock case here, so even when
1616 * doing backlog processing we use the BH locking scheme.
1617 * This is because we cannot sleep with the original spinlock
1620 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1622 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1623 TCP_CHECK_TIMER(sk);
1624 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1626 TCP_CHECK_TIMER(sk);
1630 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1633 if (sk->sk_state == TCP_LISTEN) {
1634 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1639 if (tcp_child_process(sk, nsk, skb))
1645 TCP_CHECK_TIMER(sk);
1646 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1648 TCP_CHECK_TIMER(sk);
1652 tcp_v4_send_reset(skb);
1655 /* Be careful here. If this function gets more complicated and
1656 * gcc suffers from register pressure on the x86, sk (in %ebx)
1657 * might be destroyed here. This current version compiles correctly,
1658 * but you have been warned.
1663 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1671 int tcp_v4_rcv(struct sk_buff *skb)
1677 if (skb->pkt_type != PACKET_HOST)
1680 /* Count it even if it's bad */
1681 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1683 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1688 if (th->doff < sizeof(struct tcphdr) / 4)
1690 if (!pskb_may_pull(skb, th->doff * 4))
1693 /* An explanation is required here, I think.
1694 * Packet length and doff are validated by header prediction,
1695 * provided the case of th->doff==0 is eliminated.
1696 * So, we defer the checks. */
1697 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1698 tcp_v4_checksum_init(skb) < 0))
1702 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1703 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1704 skb->len - th->doff * 4);
1705 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1706 TCP_SKB_CB(skb)->when = 0;
1707 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1708 TCP_SKB_CB(skb)->sacked = 0;
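/* Note that end_seq counts the SYN and FIN flags as one unit of sequence
 * space each on top of the payload length, matching how sequence numbers
 * are consumed on the wire.
 */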
1710 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1711 skb->nh.iph->daddr, ntohs(th->dest),
1718 if (sk->sk_state == TCP_TIME_WAIT)
1721 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1722 goto discard_and_relse;
1724 if (sk_filter(sk, skb, 0))
1725 goto discard_and_relse;
1731 if (!sock_owned_by_user(sk)) {
1732 if (!tcp_prequeue(sk, skb))
1733 ret = tcp_v4_do_rcv(sk, skb);
1735 sk_add_backlog(sk, skb);
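/* Receive-path choice: a segment for a user-locked socket is parked on the
 * backlog and handled at release_sock() time; otherwise we first offer it to
 * the prequeue (so a sleeping receiver can process it in its own context)
 * and only call tcp_v4_do_rcv() directly when prequeueing declines it.
 */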
1743 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1746 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1748 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1750 tcp_v4_send_reset(skb);
1754 /* Discard frame. */
1763 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1764 tcp_tw_put((struct tcp_tw_bucket *) sk);
1768 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1769 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1770 tcp_tw_put((struct tcp_tw_bucket *) sk);
1773 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1774 skb, th, skb->len)) {
1776 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1780 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1781 tcp_tw_put((struct tcp_tw_bucket *)sk);
1785 /* Fall through to ACK */
1788 tcp_v4_timewait_ack(sk, skb);
1792 case TCP_TW_SUCCESS:;
1797 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1799 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1800 struct inet_sock *inet = inet_sk(sk);
1802 sin->sin_family = AF_INET;
1803 sin->sin_addr.s_addr = inet->daddr;
1804 sin->sin_port = inet->dport;
1807 /* VJ's idea. Save the last timestamp seen from this destination
1808 * and hold it at least for the normal timewait interval, to use for duplicate
1809 * segment detection in subsequent connections before they enter synchronized
1813 int tcp_v4_remember_stamp(struct sock *sk)
1815 struct inet_sock *inet = inet_sk(sk);
1816 struct tcp_sock *tp = tcp_sk(sk);
1817 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1818 struct inet_peer *peer = NULL;
1821 if (!rt || rt->rt_dst != inet->daddr) {
1822 peer = inet_getpeer(inet->daddr, 1);
1826 rt_bind_peer(rt, 1);
1831 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1832 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1833 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1834 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1835 peer->tcp_ts = tp->rx_opt.ts_recent;
1845 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1847 struct inet_peer *peer = NULL;
1849 peer = inet_getpeer(tw->tw_daddr, 1);
1852 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1853 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1854 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1855 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1856 peer->tcp_ts = tw->tw_ts_recent;
1865 struct tcp_func ipv4_specific = {
1866 .queue_xmit = ip_queue_xmit,
1867 .send_check = tcp_v4_send_check,
1868 .rebuild_header = inet_sk_rebuild_header,
1869 .conn_request = tcp_v4_conn_request,
1870 .syn_recv_sock = tcp_v4_syn_recv_sock,
1871 .remember_stamp = tcp_v4_remember_stamp,
1872 .net_header_len = sizeof(struct iphdr),
1873 .setsockopt = ip_setsockopt,
1874 .getsockopt = ip_getsockopt,
1875 .addr2sockaddr = v4_addr2sockaddr,
1876 .sockaddr_len = sizeof(struct sockaddr_in),
1879 /* NOTE: A lot of things are set to zero explicitly by the call to
1880 * sk_alloc(), so they need not be done here.
1882 static int tcp_v4_init_sock(struct sock *sk)
1884 struct tcp_sock *tp = tcp_sk(sk);
1886 skb_queue_head_init(&tp->out_of_order_queue);
1887 tcp_init_xmit_timers(sk);
1888 tcp_prequeue_init(tp);
1890 tp->rto = TCP_TIMEOUT_INIT;
1891 tp->mdev = TCP_TIMEOUT_INIT;
1893 /* So many TCP implementations out there (incorrectly) count the
1894 * initial SYN frame in their delayed-ACK and congestion control
1895 * algorithms that we must have the following bandaid to talk
1896 * efficiently to them. -DaveM
1900 /* See draft-stevens-tcpca-spec-01 for discussion of the
1901 * initialization of these values.
1903 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1904 tp->snd_cwnd_clamp = ~0;
1905 tp->mss_cache = 536;
1907 tp->reordering = sysctl_tcp_reordering;
1908 tp->ca_ops = &tcp_init_congestion_ops;
1910 sk->sk_state = TCP_CLOSE;
1912 sk->sk_write_space = sk_stream_write_space;
1913 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1915 tp->af_specific = &ipv4_specific;
1917 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1918 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1920 atomic_inc(&tcp_sockets_allocated);
1925 int tcp_v4_destroy_sock(struct sock *sk)
1927 struct tcp_sock *tp = tcp_sk(sk);
1929 tcp_clear_xmit_timers(sk);
1931 tcp_cleanup_congestion_control(tp);
1933 /* Clean up the write buffer. */
1934 sk_stream_writequeue_purge(sk);
1936 /* Cleans up our, hopefully empty, out_of_order_queue. */
1937 __skb_queue_purge(&tp->out_of_order_queue);
1939 /* Clean the prequeue; it really should be empty */
1940 __skb_queue_purge(&tp->ucopy.prequeue);
1942 /* Clean up a referenced TCP bind bucket. */
1947 * If sendmsg cached page exists, toss it.
1949 if (sk->sk_sndmsg_page) {
1950 __free_page(sk->sk_sndmsg_page);
1951 sk->sk_sndmsg_page = NULL;
1954 atomic_dec(&tcp_sockets_allocated);
1959 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1961 #ifdef CONFIG_PROC_FS
1962 /* Proc filesystem TCP sock list dumping. */
1964 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1966 return hlist_empty(head) ? NULL :
1967 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1970 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1972 return tw->tw_node.next ?
1973 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1976 static void *listening_get_next(struct seq_file *seq, void *cur)
1978 struct tcp_sock *tp;
1979 struct hlist_node *node;
1980 struct sock *sk = cur;
1981 struct tcp_iter_state* st = seq->private;
1985 sk = sk_head(&tcp_listening_hash[0]);
1991 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1992 struct request_sock *req = cur;
1994 tp = tcp_sk(st->syn_wait_sk);
1998 if (req->rsk_ops->family == st->family) {
2004 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2007 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2009 sk = sk_next(st->syn_wait_sk);
2010 st->state = TCP_SEQ_STATE_LISTENING;
2011 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2014 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2015 if (reqsk_queue_len(&tp->accept_queue))
2017 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2021 sk_for_each_from(sk, node) {
2022 if (sk->sk_family == st->family) {
2027 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2028 if (reqsk_queue_len(&tp->accept_queue)) {
2030 st->uid = sock_i_uid(sk);
2031 st->syn_wait_sk = sk;
2032 st->state = TCP_SEQ_STATE_OPENREQ;
2036 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2038 if (++st->bucket < INET_LHTABLE_SIZE) {
2039 sk = sk_head(&tcp_listening_hash[st->bucket]);
2047 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2049 void *rc = listening_get_next(seq, NULL);
2051 while (rc && *pos) {
2052 rc = listening_get_next(seq, rc);
2058 static void *established_get_first(struct seq_file *seq)
2060 struct tcp_iter_state* st = seq->private;
2063 for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2065 struct hlist_node *node;
2066 struct tcp_tw_bucket *tw;
2068 /* We can reschedule _before_ having picked the target: */
2069 cond_resched_softirq();
2071 read_lock(&tcp_ehash[st->bucket].lock);
2072 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2073 if (sk->sk_family != st->family) {
2079 st->state = TCP_SEQ_STATE_TIME_WAIT;
2080 tw_for_each(tw, node,
2081 &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2082 if (tw->tw_family != st->family) {
2088 read_unlock(&tcp_ehash[st->bucket].lock);
2089 st->state = TCP_SEQ_STATE_ESTABLISHED;
2095 static void *established_get_next(struct seq_file *seq, void *cur)
2097 struct sock *sk = cur;
2098 struct tcp_tw_bucket *tw;
2099 struct hlist_node *node;
2100 struct tcp_iter_state* st = seq->private;
2104 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2108 while (tw && tw->tw_family != st->family) {
2115 read_unlock(&tcp_ehash[st->bucket].lock);
2116 st->state = TCP_SEQ_STATE_ESTABLISHED;
2118 /* We can reschedule between buckets: */
2119 cond_resched_softirq();
2121 if (++st->bucket < tcp_ehash_size) {
2122 read_lock(&tcp_ehash[st->bucket].lock);
2123 sk = sk_head(&tcp_ehash[st->bucket].chain);
2131 sk_for_each_from(sk, node) {
2132 if (sk->sk_family == st->family)
2136 st->state = TCP_SEQ_STATE_TIME_WAIT;
2137 tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2145 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2147 void *rc = established_get_first(seq);
2150 rc = established_get_next(seq, rc);
2156 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2159 struct tcp_iter_state* st = seq->private;
2162 st->state = TCP_SEQ_STATE_LISTENING;
2163 rc = listening_get_idx(seq, &pos);
2166 tcp_listen_unlock();
2168 st->state = TCP_SEQ_STATE_ESTABLISHED;
2169 rc = established_get_idx(seq, pos);
2175 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2177 struct tcp_iter_state* st = seq->private;
2178 st->state = TCP_SEQ_STATE_LISTENING;
2180 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2183 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2186 struct tcp_iter_state* st;
2188 if (v == SEQ_START_TOKEN) {
2189 rc = tcp_get_idx(seq, 0);
2194 switch (st->state) {
2195 case TCP_SEQ_STATE_OPENREQ:
2196 case TCP_SEQ_STATE_LISTENING:
2197 rc = listening_get_next(seq, v);
2199 tcp_listen_unlock();
2201 st->state = TCP_SEQ_STATE_ESTABLISHED;
2202 rc = established_get_first(seq);
2205 case TCP_SEQ_STATE_ESTABLISHED:
2206 case TCP_SEQ_STATE_TIME_WAIT:
2207 rc = established_get_next(seq, v);
2215 static void tcp_seq_stop(struct seq_file *seq, void *v)
2217 struct tcp_iter_state* st = seq->private;
2219 switch (st->state) {
2220 case TCP_SEQ_STATE_OPENREQ:
2222 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2223 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2225 case TCP_SEQ_STATE_LISTENING:
2226 if (v != SEQ_START_TOKEN)
2227 tcp_listen_unlock();
2229 case TCP_SEQ_STATE_TIME_WAIT:
2230 case TCP_SEQ_STATE_ESTABLISHED:
2232 read_unlock(&tcp_ehash[st->bucket].lock);
2238 static int tcp_seq_open(struct inode *inode, struct file *file)
2240 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2241 struct seq_file *seq;
2242 struct tcp_iter_state *s;
2245 if (unlikely(afinfo == NULL))
2248 s = kmalloc(sizeof(*s), GFP_KERNEL);
2251 memset(s, 0, sizeof(*s));
2252 s->family = afinfo->family;
2253 s->seq_ops.start = tcp_seq_start;
2254 s->seq_ops.next = tcp_seq_next;
2255 s->seq_ops.show = afinfo->seq_show;
2256 s->seq_ops.stop = tcp_seq_stop;
2258 rc = seq_open(file, &s->seq_ops);
2261 seq = file->private_data;
2270 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2273 struct proc_dir_entry *p;
2277 afinfo->seq_fops->owner = afinfo->owner;
2278 afinfo->seq_fops->open = tcp_seq_open;
2279 afinfo->seq_fops->read = seq_read;
2280 afinfo->seq_fops->llseek = seq_lseek;
2281 afinfo->seq_fops->release = seq_release_private;
2283 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2291 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2295 proc_net_remove(afinfo->name);
2296 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2299 static void get_openreq4(struct sock *sk, struct request_sock *req,
2300 char *tmpbuf, int i, int uid)
2302 const struct inet_request_sock *ireq = inet_rsk(req);
2303 int ttd = req->expires - jiffies;
2305 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2306 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2309 ntohs(inet_sk(sk)->sport),
2311 ntohs(ireq->rmt_port),
2313 0, 0, /* could print option size, but that is af dependent. */
2314 1, /* timers active (only the expire timer) */
2315 jiffies_to_clock_t(ttd),
2318 0, /* non standard timer */
2319 0, /* open_requests have no inode */
2320 atomic_read(&sk->sk_refcnt),
2324 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2327 unsigned long timer_expires;
2328 struct tcp_sock *tp = tcp_sk(sp);
2329 struct inet_sock *inet = inet_sk(sp);
2330 unsigned int dest = inet->daddr;
2331 unsigned int src = inet->rcv_saddr;
2332 __u16 destp = ntohs(inet->dport);
2333 __u16 srcp = ntohs(inet->sport);
2335 if (tp->pending == TCP_TIME_RETRANS) {
2337 timer_expires = tp->timeout;
2338 } else if (tp->pending == TCP_TIME_PROBE0) {
2340 timer_expires = tp->timeout;
2341 } else if (timer_pending(&sp->sk_timer)) {
2343 timer_expires = sp->sk_timer.expires;
2346 timer_expires = jiffies;
2349 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2350 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2351 i, src, srcp, dest, destp, sp->sk_state,
2352 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2354 jiffies_to_clock_t(timer_expires - jiffies),
2359 atomic_read(&sp->sk_refcnt), sp,
2360 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2362 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2365 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2367 unsigned int dest, src;
2369 int ttd = tw->tw_ttd - jiffies;
2374 dest = tw->tw_daddr;
2375 src = tw->tw_rcv_saddr;
2376 destp = ntohs(tw->tw_dport);
2377 srcp = ntohs(tw->tw_sport);
2379 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2380 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2381 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2382 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2383 atomic_read(&tw->tw_refcnt), tw);
2388 static int tcp4_seq_show(struct seq_file *seq, void *v)
2390 struct tcp_iter_state* st;
2391 char tmpbuf[TMPSZ + 1];
2393 if (v == SEQ_START_TOKEN) {
2394 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2395 " sl local_address rem_address st tx_queue "
2396 "rx_queue tr tm->when retrnsmt uid timeout "
2402 switch (st->state) {
2403 case TCP_SEQ_STATE_LISTENING:
2404 case TCP_SEQ_STATE_ESTABLISHED:
2405 get_tcp4_sock(v, tmpbuf, st->num);
2407 case TCP_SEQ_STATE_OPENREQ:
2408 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2410 case TCP_SEQ_STATE_TIME_WAIT:
2411 get_timewait4_sock(v, tmpbuf, st->num);
2414 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2419 static struct file_operations tcp4_seq_fops;
2420 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2421 .owner = THIS_MODULE,
2424 .seq_show = tcp4_seq_show,
2425 .seq_fops = &tcp4_seq_fops,
2428 int __init tcp4_proc_init(void)
2430 return tcp_proc_register(&tcp4_seq_afinfo);
2433 void tcp4_proc_exit(void)
2435 tcp_proc_unregister(&tcp4_seq_afinfo);
2437 #endif /* CONFIG_PROC_FS */
2439 struct proto tcp_prot = {
2441 .owner = THIS_MODULE,
2443 .connect = tcp_v4_connect,
2444 .disconnect = tcp_disconnect,
2445 .accept = tcp_accept,
2447 .init = tcp_v4_init_sock,
2448 .destroy = tcp_v4_destroy_sock,
2449 .shutdown = tcp_shutdown,
2450 .setsockopt = tcp_setsockopt,
2451 .getsockopt = tcp_getsockopt,
2452 .sendmsg = tcp_sendmsg,
2453 .recvmsg = tcp_recvmsg,
2454 .backlog_rcv = tcp_v4_do_rcv,
2455 .hash = tcp_v4_hash,
2456 .unhash = tcp_unhash,
2457 .get_port = tcp_v4_get_port,
2458 .enter_memory_pressure = tcp_enter_memory_pressure,
2459 .sockets_allocated = &tcp_sockets_allocated,
2460 .memory_allocated = &tcp_memory_allocated,
2461 .memory_pressure = &tcp_memory_pressure,
2462 .sysctl_mem = sysctl_tcp_mem,
2463 .sysctl_wmem = sysctl_tcp_wmem,
2464 .sysctl_rmem = sysctl_tcp_rmem,
2465 .max_header = MAX_TCP_HEADER,
2466 .obj_size = sizeof(struct tcp_sock),
2467 .rsk_prot = &tcp_request_sock_ops,
2472 void __init tcp_v4_init(struct net_proto_family *ops)
2474 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2476 panic("Failed to create the TCP control socket.\n");
2477 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2478 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2480 /* Unhash it so that IP input processing does not even
2481 * see it; we do not wish this socket to see incoming
2484 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2487 EXPORT_SYMBOL(ipv4_specific);
2488 EXPORT_SYMBOL(tcp_bind_hash);
2489 EXPORT_SYMBOL(inet_bind_bucket_create);
2490 EXPORT_SYMBOL(tcp_hashinfo);
2491 EXPORT_SYMBOL(tcp_inherit_port);
2492 EXPORT_SYMBOL(tcp_listen_wlock);
2493 EXPORT_SYMBOL(tcp_port_rover);
2494 EXPORT_SYMBOL(tcp_prot);
2495 EXPORT_SYMBOL(tcp_put_port);
2496 EXPORT_SYMBOL(tcp_unhash);
2497 EXPORT_SYMBOL(tcp_v4_conn_request);
2498 EXPORT_SYMBOL(tcp_v4_connect);
2499 EXPORT_SYMBOL(tcp_v4_do_rcv);
2500 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2501 EXPORT_SYMBOL(tcp_v4_send_check);
2502 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2504 #ifdef CONFIG_PROC_FS
2505 EXPORT_SYMBOL(tcp_proc_register);
2506 EXPORT_SYMBOL(tcp_proc_unregister);
2508 EXPORT_SYMBOL(sysctl_local_port_range);
2509 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2510 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);