[INET]: Move the TCP ehash functions to include/net/inet_hashtables.h
[firefly-linux-kernel-4.4.55.git] / net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/xfrm.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90                        struct sk_buff *skb);
91
92 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
93         .__tcp_lhash_lock       =       RW_LOCK_UNLOCKED,
94         .__tcp_lhash_users      =       ATOMIC_INIT(0),
95         .__tcp_lhash_wait
96           = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
97         .__tcp_portalloc_lock   =       SPIN_LOCK_UNLOCKED
98 };
99
100 /*
101  * This array holds the first and last local port number.
102  * For high-usage systems, use sysctl to change this to
103  * 32768-61000
104  */
105 int sysctl_local_port_range[2] = { 1024, 4999 };
106 int tcp_port_rover = 1024 - 1;
107
108 /* Allocate and initialize a new TCP local port bind bucket.
109  * The bindhash mutex for snum's hash chain must be held here.
110  */
111 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
112                                           unsigned short snum)
113 {
114         struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
115                                                       SLAB_ATOMIC);
116         if (tb) {
117                 tb->port = snum;
118                 tb->fastreuse = 0;
119                 INIT_HLIST_HEAD(&tb->owners);
120                 hlist_add_head(&tb->node, &head->chain);
121         }
122         return tb;
123 }
124
125 /* Caller must hold hashbucket lock for this tb with local BH disabled */
126 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
127 {
128         if (hlist_empty(&tb->owners)) {
129                 __hlist_del(&tb->node);
130                 kmem_cache_free(tcp_bucket_cachep, tb);
131         }
132 }
133
134 /* Caller must disable local BH processing. */
135 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
136 {
137         struct tcp_bind_hashbucket *head =
138                                 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
139         struct tcp_bind_bucket *tb;
140
141         spin_lock(&head->lock);
142         tb = tcp_sk(sk)->bind_hash;
143         sk_add_bind_node(child, &tb->owners);
144         tcp_sk(child)->bind_hash = tb;
145         spin_unlock(&head->lock);
146 }
147
148 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
149 {
150         local_bh_disable();
151         __tcp_inherit_port(sk, child);
152         local_bh_enable();
153 }
154
155 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
156                    unsigned short snum)
157 {
158         inet_sk(sk)->num = snum;
159         sk_add_bind_node(sk, &tb->owners);
160         tcp_sk(sk)->bind_hash = tb;
161 }
162
163 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
164 {
165         const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
166         struct sock *sk2;
167         struct hlist_node *node;
168         int reuse = sk->sk_reuse;
169
170         sk_for_each_bound(sk2, node, &tb->owners) {
171                 if (sk != sk2 &&
172                     !tcp_v6_ipv6only(sk2) &&
173                     (!sk->sk_bound_dev_if ||
174                      !sk2->sk_bound_dev_if ||
175                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
176                         if (!reuse || !sk2->sk_reuse ||
177                             sk2->sk_state == TCP_LISTEN) {
178                                 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
179                                 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
180                                     sk2_rcv_saddr == sk_rcv_saddr)
181                                         break;
182                         }
183                 }
184         }
185         return node != NULL;
186 }
187
188 /* Obtain a reference to a local port for the given sock.
189  * If snum is zero it means select any available local port.
190  */
191 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
192 {
193         struct tcp_bind_hashbucket *head;
194         struct hlist_node *node;
195         struct tcp_bind_bucket *tb;
196         int ret;
197
198         local_bh_disable();
199         if (!snum) {
200                 int low = sysctl_local_port_range[0];
201                 int high = sysctl_local_port_range[1];
202                 int remaining = (high - low) + 1;
203                 int rover;
204
205                 spin_lock(&tcp_portalloc_lock);
206                 if (tcp_port_rover < low)
207                         rover = low;
208                 else
209                         rover = tcp_port_rover;
210                 do {
211                         rover++;
212                         if (rover > high)
213                                 rover = low;
214                         head = &tcp_bhash[tcp_bhashfn(rover)];
215                         spin_lock(&head->lock);
216                         tb_for_each(tb, node, &head->chain)
217                                 if (tb->port == rover)
218                                         goto next;
219                         break;
220                 next:
221                         spin_unlock(&head->lock);
222                 } while (--remaining > 0);
223                 tcp_port_rover = rover;
224                 spin_unlock(&tcp_portalloc_lock);
225
226                 /* Exhausted local port range during search?  It is not
227                  * possible for us to be holding one of the bind hash
228                  * locks if this test triggers, because if 'remaining'
229                  * drops to zero, we broke out of the do/while loop at
230                  * the top level, not from the 'break;' statement.
231                  */
232                 ret = 1;
233                 if (unlikely(remaining <= 0))
234                         goto fail;
235
236                 /* OK, here is the one we will use.  HEAD is
237                  * non-NULL and we hold its mutex.
238                  */
239                 snum = rover;
240         } else {
241                 head = &tcp_bhash[tcp_bhashfn(snum)];
242                 spin_lock(&head->lock);
243                 tb_for_each(tb, node, &head->chain)
244                         if (tb->port == snum)
245                                 goto tb_found;
246         }
247         tb = NULL;
248         goto tb_not_found;
249 tb_found:
250         if (!hlist_empty(&tb->owners)) {
251                 if (sk->sk_reuse > 1)
252                         goto success;
253                 if (tb->fastreuse > 0 &&
254                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
255                         goto success;
256                 } else {
257                         ret = 1;
258                         if (tcp_bind_conflict(sk, tb))
259                                 goto fail_unlock;
260                 }
261         }
262 tb_not_found:
263         ret = 1;
264         if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
265                 goto fail_unlock;
266         if (hlist_empty(&tb->owners)) {
267                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
268                         tb->fastreuse = 1;
269                 else
270                         tb->fastreuse = 0;
271         } else if (tb->fastreuse &&
272                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
273                 tb->fastreuse = 0;
274 success:
275         if (!tcp_sk(sk)->bind_hash)
276                 tcp_bind_hash(sk, tb, snum);
277         BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
278         ret = 0;
279
280 fail_unlock:
281         spin_unlock(&head->lock);
282 fail:
283         local_bh_enable();
284         return ret;
285 }
286
287 /* Get rid of any references to a local port held by the
288  * given sock.
289  */
290 static void __tcp_put_port(struct sock *sk)
291 {
292         struct inet_sock *inet = inet_sk(sk);
293         struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
294         struct tcp_bind_bucket *tb;
295
296         spin_lock(&head->lock);
297         tb = tcp_sk(sk)->bind_hash;
298         __sk_del_bind_node(sk);
299         tcp_sk(sk)->bind_hash = NULL;
300         inet->num = 0;
301         tcp_bucket_destroy(tb);
302         spin_unlock(&head->lock);
303 }
304
305 void tcp_put_port(struct sock *sk)
306 {
307         local_bh_disable();
308         __tcp_put_port(sk);
309         local_bh_enable();
310 }
311
312 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP, but it can be very bad on SMP.
313  * Look, when several writers sleep and a reader wakes them up, all but one
314  * immediately hit the write lock and grab all the CPUs. An exclusive sleep solves
315  * this, _but_ remember, it adds useless work on UP machines (a wake up on each
316  * exclusive lock release). It should really be ifdefed.
317  */
318
319 void tcp_listen_wlock(void)
320 {
321         write_lock(&tcp_lhash_lock);
322
323         if (atomic_read(&tcp_lhash_users)) {
324                 DEFINE_WAIT(wait);
325
326                 for (;;) {
327                         prepare_to_wait_exclusive(&tcp_lhash_wait,
328                                                 &wait, TASK_UNINTERRUPTIBLE);
329                         if (!atomic_read(&tcp_lhash_users))
330                                 break;
331                         write_unlock_bh(&tcp_lhash_lock);
332                         schedule();
333                         write_lock_bh(&tcp_lhash_lock);
334                 }
335
336                 finish_wait(&tcp_lhash_wait, &wait);
337         }
338 }
339
340 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
341 {
342         struct hlist_head *list;
343         rwlock_t *lock;
344
345         BUG_TRAP(sk_unhashed(sk));
346         if (listen_possible && sk->sk_state == TCP_LISTEN) {
347                 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
348                 lock = &tcp_lhash_lock;
349                 tcp_listen_wlock();
350         } else {
351                 sk->sk_hashent = inet_sk_ehashfn(sk, tcp_ehash_size);
352                 list = &tcp_ehash[sk->sk_hashent].chain;
353                 lock = &tcp_ehash[sk->sk_hashent].lock;
354                 write_lock(lock);
355         }
356         __sk_add_node(sk, list);
357         sock_prot_inc_use(sk->sk_prot);
358         write_unlock(lock);
359         if (listen_possible && sk->sk_state == TCP_LISTEN)
360                 wake_up(&tcp_lhash_wait);
361 }
362
363 static void tcp_v4_hash(struct sock *sk)
364 {
365         if (sk->sk_state != TCP_CLOSE) {
366                 local_bh_disable();
367                 __tcp_v4_hash(sk, 1);
368                 local_bh_enable();
369         }
370 }
371
372 void tcp_unhash(struct sock *sk)
373 {
374         rwlock_t *lock;
375
376         if (sk_unhashed(sk))
377                 goto ende;
378
379         if (sk->sk_state == TCP_LISTEN) {
380                 local_bh_disable();
381                 tcp_listen_wlock();
382                 lock = &tcp_lhash_lock;
383         } else {
384                 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
385                 lock = &head->lock;
386                 write_lock_bh(&head->lock);
387         }
388
389         if (__sk_del_node_init(sk))
390                 sock_prot_dec_use(sk->sk_prot);
391         write_unlock_bh(lock);
392
393  ende:
394         if (sk->sk_state == TCP_LISTEN)
395                 wake_up(&tcp_lhash_wait);
396 }
397
398 /* Don't inline this cruft.  There are some nice properties to
399  * exploit here.  The BSD API does not allow a listening TCP
400  * to specify the remote port nor the remote address for the
401  * connection.  So always assume those are both wildcarded
402  * during the search since they can never be otherwise.
403  */
404 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
405                                              unsigned short hnum, int dif)
406 {
407         struct sock *result = NULL, *sk;
408         struct hlist_node *node;
409         int score, hiscore;
410
411         hiscore=-1;
412         sk_for_each(sk, node, head) {
413                 struct inet_sock *inet = inet_sk(sk);
414
415                 if (inet->num == hnum && !ipv6_only_sock(sk)) {
416                         __u32 rcv_saddr = inet->rcv_saddr;
417
418                         score = (sk->sk_family == PF_INET ? 1 : 0);
419                         if (rcv_saddr) {
420                                 if (rcv_saddr != daddr)
421                                         continue;
422                                 score+=2;
423                         }
424                         if (sk->sk_bound_dev_if) {
425                                 if (sk->sk_bound_dev_if != dif)
426                                         continue;
427                                 score+=2;
428                         }
429                         if (score == 5)
430                                 return sk;
431                         if (score > hiscore) {
432                                 hiscore = score;
433                                 result = sk;
434                         }
435                 }
436         }
437         return result;
438 }
439
440 /* Optimize the common listener case. */
441 static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
442                 unsigned short hnum, int dif)
443 {
444         struct sock *sk = NULL;
445         struct hlist_head *head;
446
447         read_lock(&tcp_lhash_lock);
448         head = &tcp_listening_hash[tcp_lhashfn(hnum)];
449         if (!hlist_empty(head)) {
450                 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
451
452                 if (inet->num == hnum && !sk->sk_node.next &&
453                     (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
454                     (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
455                     !sk->sk_bound_dev_if)
456                         goto sherry_cache;
457                 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
458         }
459         if (sk) {
460 sherry_cache:
461                 sock_hold(sk);
462         }
463         read_unlock(&tcp_lhash_lock);
464         return sk;
465 }
466
467 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
468  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
469  *
470  * Local BH must be disabled here.
471  */
472
473 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
474                                                        u32 daddr, u16 hnum,
475                                                        int dif)
476 {
477         struct tcp_ehash_bucket *head;
478         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
479         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
480         struct sock *sk;
481         struct hlist_node *node;
482         /* Optimize here for direct hit, only listening connections can
483          * have wildcards anyway.
484          */
485         const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_ehash_size);
486         head = &tcp_ehash[hash];
487         read_lock(&head->lock);
488         sk_for_each(sk, node, &head->chain) {
489                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
490                         goto hit; /* You sunk my battleship! */
491         }
492
493         /* Must check for a TIME_WAIT'er before going to listener hash. */
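        /* (TIME_WAIT sockets live in the second half of the ehash table,
         *  i.e. tcp_ehash_size buckets past the established chains.) */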
494         sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
495                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
496                         goto hit;
497         }
498         sk = NULL;
499 out:
500         read_unlock(&head->lock);
501         return sk;
502 hit:
503         sock_hold(sk);
504         goto out;
505 }
506
507 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
508                                            u32 daddr, u16 hnum, int dif)
509 {
510         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
511                                                       daddr, hnum, dif);
512
513         return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
514 }
515
516 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
517                                   u16 dport, int dif)
518 {
519         struct sock *sk;
520
521         local_bh_disable();
522         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
523         local_bh_enable();
524
525         return sk;
526 }
527
528 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
529
530 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
531 {
532         return secure_tcp_sequence_number(skb->nh.iph->daddr,
533                                           skb->nh.iph->saddr,
534                                           skb->h.th->dest,
535                                           skb->h.th->source);
536 }
537
538 /* called with local bh disabled */
539 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
540                                       struct tcp_tw_bucket **twp)
541 {
542         struct inet_sock *inet = inet_sk(sk);
543         u32 daddr = inet->rcv_saddr;
544         u32 saddr = inet->daddr;
545         int dif = sk->sk_bound_dev_if;
546         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
547         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
548         const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_ehash_size);
549         struct tcp_ehash_bucket *head = &tcp_ehash[hash];
550         struct sock *sk2;
551         struct hlist_node *node;
552         struct tcp_tw_bucket *tw;
553
554         write_lock(&head->lock);
555
556         /* Check TIME-WAIT sockets first. */
557         sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
558                 tw = (struct tcp_tw_bucket *)sk2;
559
560                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
561                         struct tcp_sock *tp = tcp_sk(sk);
562
563                         /* With PAWS, it is safe from the viewpoint
564                            of data integrity. Even without PAWS it
565                            is safe provided sequence spaces do not
566                            overlap i.e. at data rates <= 80Mbit/sec.
567
568                            Actually, the idea is close to VJ's one,
569                            only timestamp cache is held not per host,
570                            but per port pair and TW bucket is used
571                            as state holder.
572
573                            If TW bucket has been already destroyed we
574                            fall back to VJ's scheme and use initial
575                            timestamp retrieved from peer table.
576                          */
577                         if (tw->tw_ts_recent_stamp &&
578                             (!twp || (sysctl_tcp_tw_reuse &&
579                                       xtime.tv_sec -
580                                       tw->tw_ts_recent_stamp > 1))) {
581                                 if ((tp->write_seq =
582                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
583                                         tp->write_seq = 1;
584                                 tp->rx_opt.ts_recent       = tw->tw_ts_recent;
585                                 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
586                                 sock_hold(sk2);
587                                 goto unique;
588                         } else
589                                 goto not_unique;
590                 }
591         }
592         tw = NULL;
593
594         /* And established part... */
595         sk_for_each(sk2, node, &head->chain) {
596                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
597                         goto not_unique;
598         }
599
600 unique:
601         /* Must record num and sport now. Otherwise we will see
602          * a socket with a funny identity in the hash table. */
603         inet->num = lport;
604         inet->sport = htons(lport);
605         sk->sk_hashent = hash;
606         BUG_TRAP(sk_unhashed(sk));
607         __sk_add_node(sk, &head->chain);
608         sock_prot_inc_use(sk->sk_prot);
609         write_unlock(&head->lock);
610
611         if (twp) {
612                 *twp = tw;
613                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
614         } else if (tw) {
615                 /* Silly. Should hash-dance instead... */
616                 tcp_tw_deschedule(tw);
617                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
618
619                 tcp_tw_put(tw);
620         }
621
622         return 0;
623
624 not_unique:
625         write_unlock(&head->lock);
626         return -EADDRNOTAVAIL;
627 }
628
629 static inline u32 connect_port_offset(const struct sock *sk)
630 {
631         const struct inet_sock *inet = inet_sk(sk);
632
633         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
634                                          inet->dport);
635 }
636
637 /*
638  * Bind a port for a connect operation and hash it.
639  */
640 static inline int tcp_v4_hash_connect(struct sock *sk)
641 {
642         unsigned short snum = inet_sk(sk)->num;
643         struct tcp_bind_hashbucket *head;
644         struct tcp_bind_bucket *tb;
645         int ret;
646
647         if (!snum) {
648                 int low = sysctl_local_port_range[0];
649                 int high = sysctl_local_port_range[1];
650                 int range = high - low;
651                 int i;
652                 int port;
653                 static u32 hint;
654                 u32 offset = hint + connect_port_offset(sk);
655                 struct hlist_node *node;
656                 struct tcp_tw_bucket *tw = NULL;
657
658                 local_bh_disable();
659                 for (i = 1; i <= range; i++) {
660                         port = low + (i + offset) % range;
661                         head = &tcp_bhash[tcp_bhashfn(port)];
662                         spin_lock(&head->lock);
663
664                         /* We do not bother with rcv_saddr checks,
665                          * because the established check is already
666                          * unique enough.
667                          */
668                         tb_for_each(tb, node, &head->chain) {
669                                 if (tb->port == port) {
670                                         BUG_TRAP(!hlist_empty(&tb->owners));
671                                         if (tb->fastreuse >= 0)
672                                                 goto next_port;
673                                         if (!__tcp_v4_check_established(sk,
674                                                                         port,
675                                                                         &tw))
676                                                 goto ok;
677                                         goto next_port;
678                                 }
679                         }
680
681                         tb = tcp_bucket_create(head, port);
682                         if (!tb) {
683                                 spin_unlock(&head->lock);
684                                 break;
685                         }
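                        /* fastreuse == -1 marks a bucket created by this
                         * connect()-time search: bind() never fast-reuses it,
                         * and later automatic searches may share the port only
                         * after passing the established-hash uniqueness check.
                         */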
686                         tb->fastreuse = -1;
687                         goto ok;
688
689                 next_port:
690                         spin_unlock(&head->lock);
691                 }
692                 local_bh_enable();
693
694                 return -EADDRNOTAVAIL;
695
696 ok:
697                 hint += i;
698
699                 /* Head lock still held and bh's disabled */
700                 tcp_bind_hash(sk, tb, port);
701                 if (sk_unhashed(sk)) {
702                         inet_sk(sk)->sport = htons(port);
703                         __tcp_v4_hash(sk, 0);
704                 }
705                 spin_unlock(&head->lock);
706
707                 if (tw) {
708                         tcp_tw_deschedule(tw);
709                         tcp_tw_put(tw);
710                 }
711
712                 ret = 0;
713                 goto out;
714         }
715
716         head  = &tcp_bhash[tcp_bhashfn(snum)];
717         tb  = tcp_sk(sk)->bind_hash;
718         spin_lock_bh(&head->lock);
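        /* If this socket is the only owner of its bound port, its four-tuple
         * cannot clash with any existing connection, so hash it directly;
         * otherwise walk the established hash to verify uniqueness.
         */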
719         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
720                 __tcp_v4_hash(sk, 0);
721                 spin_unlock_bh(&head->lock);
722                 return 0;
723         } else {
724                 spin_unlock(&head->lock);
725                 /* No definite answer... Walk to established hash table */
726                 ret = __tcp_v4_check_established(sk, snum, NULL);
727 out:
728                 local_bh_enable();
729                 return ret;
730         }
731 }
732
733 /* This will initiate an outgoing connection. */
734 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
735 {
736         struct inet_sock *inet = inet_sk(sk);
737         struct tcp_sock *tp = tcp_sk(sk);
738         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
739         struct rtable *rt;
740         u32 daddr, nexthop;
741         int tmp;
742         int err;
743
744         if (addr_len < sizeof(struct sockaddr_in))
745                 return -EINVAL;
746
747         if (usin->sin_family != AF_INET)
748                 return -EAFNOSUPPORT;
749
750         nexthop = daddr = usin->sin_addr.s_addr;
751         if (inet->opt && inet->opt->srr) {
752                 if (!daddr)
753                         return -EINVAL;
754                 nexthop = inet->opt->faddr;
755         }
756
757         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
758                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
759                                IPPROTO_TCP,
760                                inet->sport, usin->sin_port, sk);
761         if (tmp < 0)
762                 return tmp;
763
764         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
765                 ip_rt_put(rt);
766                 return -ENETUNREACH;
767         }
768
769         if (!inet->opt || !inet->opt->srr)
770                 daddr = rt->rt_dst;
771
772         if (!inet->saddr)
773                 inet->saddr = rt->rt_src;
774         inet->rcv_saddr = inet->saddr;
775
776         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
777                 /* Reset inherited state */
778                 tp->rx_opt.ts_recent       = 0;
779                 tp->rx_opt.ts_recent_stamp = 0;
780                 tp->write_seq              = 0;
781         }
782
783         if (sysctl_tcp_tw_recycle &&
784             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
785                 struct inet_peer *peer = rt_get_peer(rt);
786
787                 /* VJ's idea. We save the last timestamp seen from
788                  * the destination in the peer table when entering state TIME-WAIT,
789                  * and initialize rx_opt.ts_recent from it when trying a new connection.
790                  */
791
792                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
793                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
794                         tp->rx_opt.ts_recent = peer->tcp_ts;
795                 }
796         }
797
798         inet->dport = usin->sin_port;
799         inet->daddr = daddr;
800
801         tp->ext_header_len = 0;
802         if (inet->opt)
803                 tp->ext_header_len = inet->opt->optlen;
804
805         tp->rx_opt.mss_clamp = 536;
806
807         /* Socket identity is still unknown (sport may be zero).
808          * However we set the state to SYN-SENT and, without releasing the socket
809          * lock, select a source port, enter ourselves into the hash tables and
810          * complete initialization after this.
811          */
812         tcp_set_state(sk, TCP_SYN_SENT);
813         err = tcp_v4_hash_connect(sk);
814         if (err)
815                 goto failure;
816
817         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
818         if (err)
819                 goto failure;
820
821         /* OK, now commit destination to socket.  */
822         sk_setup_caps(sk, &rt->u.dst);
823
824         if (!tp->write_seq)
825                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
826                                                            inet->daddr,
827                                                            inet->sport,
828                                                            usin->sin_port);
829
830         inet->id = tp->write_seq ^ jiffies;
831
832         err = tcp_connect(sk);
833         rt = NULL;
834         if (err)
835                 goto failure;
836
837         return 0;
838
839 failure:
840         /* This unhashes the socket and releases the local port, if necessary. */
841         tcp_set_state(sk, TCP_CLOSE);
842         ip_rt_put(rt);
843         sk->sk_route_caps = 0;
844         inet->dport = 0;
845         return err;
846 }
847
848 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
849 {
850         return ((struct rtable *)skb->dst)->rt_iif;
851 }
852
853 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
854 {
855         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
856 }
857
858 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
859                                               struct request_sock ***prevp,
860                                               __u16 rport,
861                                               __u32 raddr, __u32 laddr)
862 {
863         struct listen_sock *lopt = tp->accept_queue.listen_opt;
864         struct request_sock *req, **prev;
865
866         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
867              (req = *prev) != NULL;
868              prev = &req->dl_next) {
869                 const struct inet_request_sock *ireq = inet_rsk(req);
870
871                 if (ireq->rmt_port == rport &&
872                     ireq->rmt_addr == raddr &&
873                     ireq->loc_addr == laddr &&
874                     TCP_INET_FAMILY(req->rsk_ops->family)) {
875                         BUG_TRAP(!req->sk);
876                         *prevp = prev;
877                         break;
878                 }
879         }
880
881         return req;
882 }
883
884 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
885 {
886         struct tcp_sock *tp = tcp_sk(sk);
887         struct listen_sock *lopt = tp->accept_queue.listen_opt;
888         u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
889
890         reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
891         tcp_synq_added(sk);
892 }
893
894
895 /*
896  * This routine does path mtu discovery as defined in RFC1191.
897  */
898 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
899                                      u32 mtu)
900 {
901         struct dst_entry *dst;
902         struct inet_sock *inet = inet_sk(sk);
903         struct tcp_sock *tp = tcp_sk(sk);
904
905         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
906          * sent out by Linux are always < 576 bytes so they should go through
907          * unfragmented).
908          */
909         if (sk->sk_state == TCP_LISTEN)
910                 return;
911
912         /* We don't check in the dst entry if pmtu discovery is forbidden
913          * on this route. We just assume that no packet-too-big packets
914          * are sent back when pmtu discovery is not active.
915          * There is a small race when the user changes this flag in the
916          * route, but I think that's acceptable.
917          */
918         if ((dst = __sk_dst_check(sk, 0)) == NULL)
919                 return;
920
921         dst->ops->update_pmtu(dst, mtu);
922
923         /* Something is about to go wrong... Remember the soft error
924          * in case this connection is not able to recover.
925          */
926         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
927                 sk->sk_err_soft = EMSGSIZE;
928
929         mtu = dst_mtu(dst);
930
931         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
932             tp->pmtu_cookie > mtu) {
933                 tcp_sync_mss(sk, mtu);
934
935                 /* Resend the TCP packet because it's
936                  * clear that the old packet has been
937                  * dropped. This is the new "fast" path mtu
938                  * discovery.
939                  */
940                 tcp_simple_retransmit(sk);
941         } /* else let the usual retransmit timer handle it */
942 }
943
944 /*
945  * This routine is called by the ICMP module when it gets some
946  * sort of error condition.  If err < 0 then the socket should
947  * be closed and the error returned to the user.  If err > 0
948  * it's just the icmp type << 8 | icmp code.  After adjustment
949  * header points to the first 8 bytes of the tcp header.  We need
950  * to find the appropriate port.
951  *
952  * The locking strategy used here is very "optimistic". When
953  * someone else accesses the socket the ICMP is just dropped
954  * and for some paths there is no check at all.
955  * A more general error queue to queue errors for later handling
956  * is probably better.
957  *
958  */
959
960 void tcp_v4_err(struct sk_buff *skb, u32 info)
961 {
962         struct iphdr *iph = (struct iphdr *)skb->data;
963         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
964         struct tcp_sock *tp;
965         struct inet_sock *inet;
966         int type = skb->h.icmph->type;
967         int code = skb->h.icmph->code;
968         struct sock *sk;
969         __u32 seq;
970         int err;
971
972         if (skb->len < (iph->ihl << 2) + 8) {
973                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
974                 return;
975         }
976
977         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
978                            th->source, tcp_v4_iif(skb));
979         if (!sk) {
980                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
981                 return;
982         }
983         if (sk->sk_state == TCP_TIME_WAIT) {
984                 tcp_tw_put((struct tcp_tw_bucket *)sk);
985                 return;
986         }
987
988         bh_lock_sock(sk);
989         /* If too many ICMPs get dropped on busy
990          * servers this needs to be solved differently.
991          */
992         if (sock_owned_by_user(sk))
993                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
994
995         if (sk->sk_state == TCP_CLOSE)
996                 goto out;
997
998         tp = tcp_sk(sk);
999         seq = ntohl(th->seq);
1000         if (sk->sk_state != TCP_LISTEN &&
1001             !between(seq, tp->snd_una, tp->snd_nxt)) {
1002                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1003                 goto out;
1004         }
1005
1006         switch (type) {
1007         case ICMP_SOURCE_QUENCH:
1008                 /* Just silently ignore these. */
1009                 goto out;
1010         case ICMP_PARAMETERPROB:
1011                 err = EPROTO;
1012                 break;
1013         case ICMP_DEST_UNREACH:
1014                 if (code > NR_ICMP_UNREACH)
1015                         goto out;
1016
1017                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1018                         if (!sock_owned_by_user(sk))
1019                                 do_pmtu_discovery(sk, iph, info);
1020                         goto out;
1021                 }
1022
1023                 err = icmp_err_convert[code].errno;
1024                 break;
1025         case ICMP_TIME_EXCEEDED:
1026                 err = EHOSTUNREACH;
1027                 break;
1028         default:
1029                 goto out;
1030         }
1031
1032         switch (sk->sk_state) {
1033                 struct request_sock *req, **prev;
1034         case TCP_LISTEN:
1035                 if (sock_owned_by_user(sk))
1036                         goto out;
1037
1038                 req = tcp_v4_search_req(tp, &prev, th->dest,
1039                                         iph->daddr, iph->saddr);
1040                 if (!req)
1041                         goto out;
1042
1043                 /* ICMPs are not backlogged, hence we cannot get
1044                    an established socket here.
1045                  */
1046                 BUG_TRAP(!req->sk);
1047
1048                 if (seq != tcp_rsk(req)->snt_isn) {
1049                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1050                         goto out;
1051                 }
1052
1053                 /*
1054                  * Still in SYN_RECV, just remove it silently.
1055                  * There is no good way to pass the error to the newly
1056                  * created socket, and POSIX does not want network
1057                  * errors returned from accept().
1058                  */
1059                 tcp_synq_drop(sk, req, prev);
1060                 goto out;
1061
1062         case TCP_SYN_SENT:
1063         case TCP_SYN_RECV:  /* Cannot happen normally.
1064                                It can, e.g., if SYNs crossed.
1065                              */
1066                 if (!sock_owned_by_user(sk)) {
1067                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1068                         sk->sk_err = err;
1069
1070                         sk->sk_error_report(sk);
1071
1072                         tcp_done(sk);
1073                 } else {
1074                         sk->sk_err_soft = err;
1075                 }
1076                 goto out;
1077         }
1078
1079         /* If we've already connected we will keep trying
1080          * until we time out, or the user gives up.
1081          *
1082          * rfc1122 4.2.3.9 allows us to consider as hard errors
1083          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1084          * but it is obsoleted by pmtu discovery).
1085          *
1086          * Note that in the modern internet, where routing is unreliable
1087          * and broken firewalls sit in each dark corner, sending random
1088          * errors ordered by their masters, even these two messages finally lose
1089          * their original sense (even Linux sends invalid PORT_UNREACHs).
1090          *
1091          * Now we are in compliance with RFCs.
1092          *                                                      --ANK (980905)
1093          */
1094
1095         inet = inet_sk(sk);
1096         if (!sock_owned_by_user(sk) && inet->recverr) {
1097                 sk->sk_err = err;
1098                 sk->sk_error_report(sk);
1099         } else  { /* Only an error on timeout */
1100                 sk->sk_err_soft = err;
1101         }
1102
1103 out:
1104         bh_unlock_sock(sk);
1105         sock_put(sk);
1106 }
1107
1108 /* This routine computes an IPv4 TCP checksum. */
1109 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1110                        struct sk_buff *skb)
1111 {
1112         struct inet_sock *inet = inet_sk(sk);
1113
1114         if (skb->ip_summed == CHECKSUM_HW) {
1115                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1116                 skb->csum = offsetof(struct tcphdr, check);
1117         } else {
1118                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1119                                          csum_partial((char *)th,
1120                                                       th->doff << 2,
1121                                                       skb->csum));
1122         }
1123 }
1124
1125 /*
1126  *      This routine will send an RST to the other tcp.
1127  *
1128  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1129  *                    for the reset?
1130  *      Answer: if a packet caused an RST, it is not for a socket
1131  *              existing in our system; if it is matched to a socket,
1132  *              it is just a duplicate segment or a bug in the other side's TCP.
1133  *              So we build the reply based only on the parameters
1134  *              that arrived with the segment.
1135  *      Exception: precedence violation. We do not implement it in any case.
1136  */
1137
1138 static void tcp_v4_send_reset(struct sk_buff *skb)
1139 {
1140         struct tcphdr *th = skb->h.th;
1141         struct tcphdr rth;
1142         struct ip_reply_arg arg;
1143
1144         /* Never send a reset in response to a reset. */
1145         if (th->rst)
1146                 return;
1147
1148         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1149                 return;
1150
1151         /* Swap the send and the receive. */
1152         memset(&rth, 0, sizeof(struct tcphdr));
1153         rth.dest   = th->source;
1154         rth.source = th->dest;
1155         rth.doff   = sizeof(struct tcphdr) / 4;
1156         rth.rst    = 1;
1157
1158         if (th->ack) {
1159                 rth.seq = th->ack_seq;
1160         } else {
1161                 rth.ack = 1;
1162                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1163                                     skb->len - (th->doff << 2));
1164         }
1165
1166         memset(&arg, 0, sizeof arg);
1167         arg.iov[0].iov_base = (unsigned char *)&rth;
1168         arg.iov[0].iov_len  = sizeof rth;
1169         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1170                                       skb->nh.iph->saddr, /*XXX*/
1171                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
1172         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1173
1174         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1175
1176         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1177         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1178 }
1179
1180 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1181    outside of socket context, is certainly ugly. What can I do?
1182  */
1183
1184 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1185                             u32 win, u32 ts)
1186 {
1187         struct tcphdr *th = skb->h.th;
1188         struct {
1189                 struct tcphdr th;
1190                 u32 tsopt[3];
1191         } rep;
1192         struct ip_reply_arg arg;
1193
1194         memset(&rep.th, 0, sizeof(struct tcphdr));
1195         memset(&arg, 0, sizeof arg);
1196
1197         arg.iov[0].iov_base = (unsigned char *)&rep;
1198         arg.iov[0].iov_len  = sizeof(rep.th);
1199         if (ts) {
1200                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1201                                      (TCPOPT_TIMESTAMP << 8) |
1202                                      TCPOLEN_TIMESTAMP);
1203                 rep.tsopt[1] = htonl(tcp_time_stamp);
1204                 rep.tsopt[2] = htonl(ts);
1205                 arg.iov[0].iov_len = sizeof(rep);
1206         }
1207
1208         /* Swap the send and the receive. */
1209         rep.th.dest    = th->source;
1210         rep.th.source  = th->dest;
1211         rep.th.doff    = arg.iov[0].iov_len / 4;
1212         rep.th.seq     = htonl(seq);
1213         rep.th.ack_seq = htonl(ack);
1214         rep.th.ack     = 1;
1215         rep.th.window  = htons(win);
1216
1217         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1218                                       skb->nh.iph->saddr, /*XXX*/
1219                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1220         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1221
1222         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1223
1224         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1225 }
1226
1227 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1228 {
1229         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1230
1231         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1232                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1233
1234         tcp_tw_put(tw);
1235 }
1236
1237 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1238 {
1239         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1240                         req->ts_recent);
1241 }
1242
1243 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1244                                           struct request_sock *req)
1245 {
1246         struct rtable *rt;
1247         const struct inet_request_sock *ireq = inet_rsk(req);
1248         struct ip_options *opt = inet_rsk(req)->opt;
1249         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1250                             .nl_u = { .ip4_u =
1251                                       { .daddr = ((opt && opt->srr) ?
1252                                                   opt->faddr :
1253                                                   ireq->rmt_addr),
1254                                         .saddr = ireq->loc_addr,
1255                                         .tos = RT_CONN_FLAGS(sk) } },
1256                             .proto = IPPROTO_TCP,
1257                             .uli_u = { .ports =
1258                                        { .sport = inet_sk(sk)->sport,
1259                                          .dport = ireq->rmt_port } } };
1260
1261         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1262                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1263                 return NULL;
1264         }
1265         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1266                 ip_rt_put(rt);
1267                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1268                 return NULL;
1269         }
1270         return &rt->u.dst;
1271 }
1272
1273 /*
1274  *      Send a SYN-ACK after having received an ACK.
1275  *      This still operates on a request_sock only, not on a big
1276  *      socket.
1277  */
1278 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1279                               struct dst_entry *dst)
1280 {
1281         const struct inet_request_sock *ireq = inet_rsk(req);
1282         int err = -1;
1283         struct sk_buff * skb;
1284
1285         /* First, grab a route. */
1286         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1287                 goto out;
1288
1289         skb = tcp_make_synack(sk, dst, req);
1290
1291         if (skb) {
1292                 struct tcphdr *th = skb->h.th;
1293
1294                 th->check = tcp_v4_check(th, skb->len,
1295                                          ireq->loc_addr,
1296                                          ireq->rmt_addr,
1297                                          csum_partial((char *)th, skb->len,
1298                                                       skb->csum));
1299
1300                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1301                                             ireq->rmt_addr,
1302                                             ireq->opt);
1303                 if (err == NET_XMIT_CN)
1304                         err = 0;
1305         }
1306
1307 out:
1308         dst_release(dst);
1309         return err;
1310 }
1311
1312 /*
1313  *      IPv4 request_sock destructor.
1314  */
1315 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1316 {
1317         if (inet_rsk(req)->opt)
1318                 kfree(inet_rsk(req)->opt);
1319 }
1320
1321 static inline void syn_flood_warning(struct sk_buff *skb)
1322 {
1323         static unsigned long warntime;
1324
1325         if (time_after(jiffies, (warntime + HZ * 60))) {
1326                 warntime = jiffies;
1327                 printk(KERN_INFO
1328                        "possible SYN flooding on port %d. Sending cookies.\n",
1329                        ntohs(skb->h.th->dest));
1330         }
1331 }
1332
1333 /*
1334  * Save and compile IPv4 options into the request_sock if needed.
1335  */
1336 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1337                                                      struct sk_buff *skb)
1338 {
1339         struct ip_options *opt = &(IPCB(skb)->opt);
1340         struct ip_options *dopt = NULL;
1341
1342         if (opt && opt->optlen) {
1343                 int opt_size = optlength(opt);
1344                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1345                 if (dopt) {
1346                         if (ip_options_echo(dopt, skb)) {
1347                                 kfree(dopt);
1348                                 dopt = NULL;
1349                         }
1350                 }
1351         }
1352         return dopt;
1353 }
1354
1355 struct request_sock_ops tcp_request_sock_ops = {
1356         .family         =       PF_INET,
1357         .obj_size       =       sizeof(struct tcp_request_sock),
1358         .rtx_syn_ack    =       tcp_v4_send_synack,
1359         .send_ack       =       tcp_v4_reqsk_send_ack,
1360         .destructor     =       tcp_v4_reqsk_destructor,
1361         .send_reset     =       tcp_v4_send_reset,
1362 };
1363
1364 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1365 {
1366         struct inet_request_sock *ireq;
1367         struct tcp_options_received tmp_opt;
1368         struct request_sock *req;
1369         __u32 saddr = skb->nh.iph->saddr;
1370         __u32 daddr = skb->nh.iph->daddr;
1371         __u32 isn = TCP_SKB_CB(skb)->when;
1372         struct dst_entry *dst = NULL;
1373 #ifdef CONFIG_SYN_COOKIES
1374         int want_cookie = 0;
1375 #else
1376 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1377 #endif
1378
1379         /* Never answer SYNs sent to broadcast or multicast */
1380         if (((struct rtable *)skb->dst)->rt_flags &
1381             (RTCF_BROADCAST | RTCF_MULTICAST))
1382                 goto drop;
1383
1384         /* TW buckets are converted to open requests without
1385          * limitation; they conserve resources and the peer is
1386          * evidently a real one.
1387          */
1388         if (tcp_synq_is_full(sk) && !isn) {
1389 #ifdef CONFIG_SYN_COOKIES
1390                 if (sysctl_tcp_syncookies) {
1391                         want_cookie = 1;
1392                 } else
1393 #endif
1394                 goto drop;
1395         }
1396
1397         /* The accept backlog is full. If we have already queued enough
1398          * warm entries in the syn queue, drop the request. It is better than
1399          * clogging the syn queue with openreqs with exponentially increasing
1400          * timeout.
1401          */
1402         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1403                 goto drop;
1404
1405         req = reqsk_alloc(&tcp_request_sock_ops);
1406         if (!req)
1407                 goto drop;
1408
1409         tcp_clear_options(&tmp_opt);
1410         tmp_opt.mss_clamp = 536;
1411         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1412
1413         tcp_parse_options(skb, &tmp_opt, 0);
1414
1415         if (want_cookie) {
1416                 tcp_clear_options(&tmp_opt);
1417                 tmp_opt.saw_tstamp = 0;
1418         }
1419
1420         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1421                 /* Some OSes (unknown ones, but I see them on a web server which
1422                  * contains information interesting only for Windows
1423                  * users) do not send their timestamp in the SYN. It is an easy case.
1424                  * We simply do not advertise TS support.
1425                  */
1426                 tmp_opt.saw_tstamp = 0;
1427                 tmp_opt.tstamp_ok  = 0;
1428         }
1429         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1430
1431         tcp_openreq_init(req, &tmp_opt, skb);
1432
1433         ireq = inet_rsk(req);
1434         ireq->loc_addr = daddr;
1435         ireq->rmt_addr = saddr;
1436         ireq->opt = tcp_v4_save_options(sk, skb);
1437         if (!want_cookie)
1438                 TCP_ECN_create_request(req, skb->h.th);
1439
1440         if (want_cookie) {
1441 #ifdef CONFIG_SYN_COOKIES
1442                 syn_flood_warning(skb);
1443 #endif
1444                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1445         } else if (!isn) {
1446                 struct inet_peer *peer = NULL;
1447
1448                 /* VJ's idea. We save the last timestamp seen
1449                  * from the destination in the peer table, when entering
1450                  * state TIME-WAIT, and check against it before
1451                  * accepting a new connection request.
1452                  *
1453                  * If "isn" is not zero, this request hit an alive
1454                  * timewait bucket, so all the necessary checks
1455                  * are made in the function processing the timewait state.
1456                  */
1457                 if (tmp_opt.saw_tstamp &&
1458                     sysctl_tcp_tw_recycle &&
1459                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1460                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1461                     peer->v4daddr == saddr) {
1462                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1463                             (s32)(peer->tcp_ts - req->ts_recent) >
1464                                                         TCP_PAWS_WINDOW) {
1465                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1466                                 dst_release(dst);
1467                                 goto drop_and_free;
1468                         }
1469                 }
1470                 /* Kill the following clause if you dislike this approach. */
1471                 else if (!sysctl_tcp_syncookies &&
1472                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1473                           (sysctl_max_syn_backlog >> 2)) &&
1474                          (!peer || !peer->tcp_ts_stamp) &&
1475                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1476                         /* Without syncookies, the last quarter of the
1477                          * backlog is reserved for destinations proven
1478                          * to be alive.
1479                          * This means we keep communicating with
1480                          * destinations that were already remembered
1481                          * before the SYN flood began.
1482                          */
1483                         LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1484                                               "request from %u.%u."
1485                                               "%u.%u/%u\n",
1486                                               NIPQUAD(saddr),
1487                                               ntohs(skb->h.th->source)));
1488                         dst_release(dst);
1489                         goto drop_and_free;
1490                 }
1491
1492                 isn = tcp_v4_init_sequence(sk, skb);
1493         }
1494         tcp_rsk(req)->snt_isn = isn;
1495
1496         if (tcp_v4_send_synack(sk, req, dst))
1497                 goto drop_and_free;
1498
1499         if (want_cookie) {
1500                 reqsk_free(req);
1501         } else {
1502                 tcp_v4_synq_add(sk, req);
1503         }
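             /* When a cookie was sent, the request was freed above rather than
              * queued; the connection state will be reconstructed from the
              * returning ACK by cookie_v4_check().
              */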
1504         return 0;
1505
1506 drop_and_free:
1507         reqsk_free(req);
1508 drop:
1509         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1510         return 0;
1511 }
1512
1513
1514 /*
1515  * The three-way handshake has completed - we received a valid ACK
1516  * for our SYN-ACK - now create the new socket.
1517  */
1518 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1519                                   struct request_sock *req,
1520                                   struct dst_entry *dst)
1521 {
1522         struct inet_request_sock *ireq;
1523         struct inet_sock *newinet;
1524         struct tcp_sock *newtp;
1525         struct sock *newsk;
1526
1527         if (sk_acceptq_is_full(sk))
1528                 goto exit_overflow;
1529
1530         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1531                 goto exit;
1532
1533         newsk = tcp_create_openreq_child(sk, req, skb);
1534         if (!newsk)
1535                 goto exit;
1536
1537         sk_setup_caps(newsk, dst);
1538
1539         newtp                 = tcp_sk(newsk);
1540         newinet               = inet_sk(newsk);
1541         ireq                  = inet_rsk(req);
1542         newinet->daddr        = ireq->rmt_addr;
1543         newinet->rcv_saddr    = ireq->loc_addr;
1544         newinet->saddr        = ireq->loc_addr;
1545         newinet->opt          = ireq->opt;
1546         ireq->opt             = NULL;
1547         newinet->mc_index     = tcp_v4_iif(skb);
1548         newinet->mc_ttl       = skb->nh.iph->ttl;
1549         newtp->ext_header_len = 0;
1550         if (newinet->opt)
1551                 newtp->ext_header_len = newinet->opt->optlen;
1552         newinet->id = newtp->write_seq ^ jiffies;
1553
1554         tcp_sync_mss(newsk, dst_mtu(dst));
1555         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1556         tcp_initialize_rcv_mss(newsk);
1557
1558         __tcp_v4_hash(newsk, 0);
1559         __tcp_inherit_port(sk, newsk);
1560
1561         return newsk;
1562
1563 exit_overflow:
1564         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1565 exit:
1566         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1567         dst_release(dst);
1568         return NULL;
1569 }
1570
1571 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1572 {
1573         struct tcphdr *th = skb->h.th;
1574         struct iphdr *iph = skb->nh.iph;
1575         struct tcp_sock *tp = tcp_sk(sk);
1576         struct sock *nsk;
1577         struct request_sock **prev;
1578         /* Find possible connection requests. */
1579         struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1580                                                      iph->saddr, iph->daddr);
1581         if (req)
1582                 return tcp_check_req(sk, skb, req, prev);
1583
1584         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1585                                           th->source,
1586                                           skb->nh.iph->daddr,
1587                                           ntohs(th->dest),
1588                                           tcp_v4_iif(skb));
1589
1590         if (nsk) {
1591                 if (nsk->sk_state != TCP_TIME_WAIT) {
1592                         bh_lock_sock(nsk);
1593                         return nsk;
1594                 }
1595                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1596                 return NULL;
1597         }
1598
1599 #ifdef CONFIG_SYN_COOKIES
1600         if (!th->rst && !th->syn && th->ack)
1601                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1602 #endif
1603         return sk;
1604 }
1605
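     /*
      * Receive checksum strategy: trust a checksum already verified by the
      * hardware, verify short segments (<= 76 bytes) in full right away, and
      * for everything else only record the pseudo-header sum in skb->csum so
      * that full verification can be deferred (e.g. to the copy to user
      * space).
      */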
1606 static int tcp_v4_checksum_init(struct sk_buff *skb)
1607 {
1608         if (skb->ip_summed == CHECKSUM_HW) {
1609                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1610                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1611                                   skb->nh.iph->daddr, skb->csum))
1612                         return 0;
1613
1614                 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1615                 skb->ip_summed = CHECKSUM_NONE;
1616         }
1617         if (skb->len <= 76) {
1618                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1619                                  skb->nh.iph->daddr,
1620                                  skb_checksum(skb, 0, skb->len, 0)))
1621                         return -1;
1622                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1623         } else {
1624                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1625                                           skb->nh.iph->saddr,
1626                                           skb->nh.iph->daddr, 0);
1627         }
1628         return 0;
1629 }
1630
1631
1632 /* The socket must have its spinlock held when we get
1633  * here.
1634  *
1635  * We have a potential double-lock case here, so even when
1636  * doing backlog processing we use the BH locking scheme.
1637  * This is because we cannot sleep with the original spinlock
1638  * held.
1639  */
1640 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1641 {
1642         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1643                 TCP_CHECK_TIMER(sk);
1644                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1645                         goto reset;
1646                 TCP_CHECK_TIMER(sk);
1647                 return 0;
1648         }
1649
1650         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1651                 goto csum_err;
1652
1653         if (sk->sk_state == TCP_LISTEN) {
1654                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1655                 if (!nsk)
1656                         goto discard;
1657
1658                 if (nsk != sk) {
1659                         if (tcp_child_process(sk, nsk, skb))
1660                                 goto reset;
1661                         return 0;
1662                 }
1663         }
1664
1665         TCP_CHECK_TIMER(sk);
1666         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1667                 goto reset;
1668         TCP_CHECK_TIMER(sk);
1669         return 0;
1670
1671 reset:
1672         tcp_v4_send_reset(skb);
1673 discard:
1674         kfree_skb(skb);
1675         /* Be careful here. If this function gets more complicated and
1676          * gcc suffers from register pressure on the x86, sk (in %ebx)
1677          * might be destroyed here. This current version compiles correctly,
1678          * but you have been warned.
1679          */
1680         return 0;
1681
1682 csum_err:
1683         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1684         goto discard;
1685 }
1686
1687 /*
1688  *      From tcp_input.c
1689  */
1690
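     /*
      * Protocol entry point for incoming IPv4 TCP segments: validate the
      * header, fill in the skb control block, look up the owning socket,
      * apply xfrm policy and socket filter checks, and then either process
      * the segment directly, prequeue it, or append it to the backlog when
      * the socket is currently owned by user context.
      */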
1691 int tcp_v4_rcv(struct sk_buff *skb)
1692 {
1693         struct tcphdr *th;
1694         struct sock *sk;
1695         int ret;
1696
1697         if (skb->pkt_type != PACKET_HOST)
1698                 goto discard_it;
1699
1700         /* Count it even if it's bad */
1701         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1702
1703         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1704                 goto discard_it;
1705
1706         th = skb->h.th;
1707
1708         if (th->doff < sizeof(struct tcphdr) / 4)
1709                 goto bad_packet;
1710         if (!pskb_may_pull(skb, th->doff * 4))
1711                 goto discard_it;
1712
1713         /* An explanation is required here, I think.
1714          * Packet length and doff are validated by header prediction,
1715          * provided the case of th->doff == 0 is eliminated.
1716          * So, we defer the checks. */
1717         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1718              tcp_v4_checksum_init(skb) < 0))
1719                 goto bad_packet;
1720
1721         th = skb->h.th;
1722         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1723         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1724                                     skb->len - th->doff * 4);
1725         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1726         TCP_SKB_CB(skb)->when    = 0;
1727         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1728         TCP_SKB_CB(skb)->sacked  = 0;
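             /* SYN and FIN each consume one unit of sequence space, which is
              * why they are added into end_seq above.
              */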
1729
1730         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1731                              skb->nh.iph->daddr, ntohs(th->dest),
1732                              tcp_v4_iif(skb));
1733
1734         if (!sk)
1735                 goto no_tcp_socket;
1736
1737 process:
1738         if (sk->sk_state == TCP_TIME_WAIT)
1739                 goto do_time_wait;
1740
1741         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1742                 goto discard_and_relse;
1743
1744         if (sk_filter(sk, skb, 0))
1745                 goto discard_and_relse;
1746
1747         skb->dev = NULL;
1748
1749         bh_lock_sock(sk);
1750         ret = 0;
1751         if (!sock_owned_by_user(sk)) {
1752                 if (!tcp_prequeue(sk, skb))
1753                         ret = tcp_v4_do_rcv(sk, skb);
1754         } else
1755                 sk_add_backlog(sk, skb);
1756         bh_unlock_sock(sk);
1757
1758         sock_put(sk);
1759
1760         return ret;
1761
1762 no_tcp_socket:
1763         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1764                 goto discard_it;
1765
1766         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1767 bad_packet:
1768                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1769         } else {
1770                 tcp_v4_send_reset(skb);
1771         }
1772
1773 discard_it:
1774         /* Discard frame. */
1775         kfree_skb(skb);
1776         return 0;
1777
1778 discard_and_relse:
1779         sock_put(sk);
1780         goto discard_it;
1781
1782 do_time_wait:
1783         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1784                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1785                 goto discard_it;
1786         }
1787
1788         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1789                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1790                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1791                 goto discard_it;
1792         }
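             /* A segment arriving for a TIME-WAIT socket may restart the
              * connection (TCP_TW_SYN, when a matching listener exists),
              * elicit an ACK, elicit a RST, or be consumed silently.
              */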
1793         switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1794                                            skb, th, skb->len)) {
1795         case TCP_TW_SYN: {
1796                 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1797                                                           ntohs(th->dest),
1798                                                           tcp_v4_iif(skb));
1799                 if (sk2) {
1800                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1801                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1802                         sk = sk2;
1803                         goto process;
1804                 }
1805                 /* Fall through to ACK */
1806         }
1807         case TCP_TW_ACK:
1808                 tcp_v4_timewait_ack(sk, skb);
1809                 break;
1810         case TCP_TW_RST:
1811                 goto no_tcp_socket;
1812         case TCP_TW_SUCCESS:;
1813         }
1814         goto discard_it;
1815 }
1816
1817 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1818 {
1819         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1820         struct inet_sock *inet = inet_sk(sk);
1821
1822         sin->sin_family         = AF_INET;
1823         sin->sin_addr.s_addr    = inet->daddr;
1824         sin->sin_port           = inet->dport;
1825 }
1826
1827 /* VJ's idea. Save the last timestamp seen from this destination
1828  * and hold it for at least the normal TIME-WAIT interval, to use for
1829  * duplicate segment detection in subsequent connections before they
1830  * enter the synchronized state.
1831  */
1832
1833 int tcp_v4_remember_stamp(struct sock *sk)
1834 {
1835         struct inet_sock *inet = inet_sk(sk);
1836         struct tcp_sock *tp = tcp_sk(sk);
1837         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1838         struct inet_peer *peer = NULL;
1839         int release_it = 0;
1840
1841         if (!rt || rt->rt_dst != inet->daddr) {
1842                 peer = inet_getpeer(inet->daddr, 1);
1843                 release_it = 1;
1844         } else {
1845                 if (!rt->peer)
1846                         rt_bind_peer(rt, 1);
1847                 peer = rt->peer;
1848         }
1849
1850         if (peer) {
1851                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1852                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1853                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1854                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1855                         peer->tcp_ts = tp->rx_opt.ts_recent;
1856                 }
1857                 if (release_it)
1858                         inet_putpeer(peer);
1859                 return 1;
1860         }
1861
1862         return 0;
1863 }
1864
1865 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1866 {
1867         struct inet_peer *peer = NULL;
1868
1869         peer = inet_getpeer(tw->tw_daddr, 1);
1870
1871         if (peer) {
1872                 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1873                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1874                      peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1875                         peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1876                         peer->tcp_ts = tw->tw_ts_recent;
1877                 }
1878                 inet_putpeer(peer);
1879                 return 1;
1880         }
1881
1882         return 0;
1883 }
1884
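     /* Address-family-specific operations used by the AF-independent
      * TCP code.
      */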
1885 struct tcp_func ipv4_specific = {
1886         .queue_xmit     =       ip_queue_xmit,
1887         .send_check     =       tcp_v4_send_check,
1888         .rebuild_header =       inet_sk_rebuild_header,
1889         .conn_request   =       tcp_v4_conn_request,
1890         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
1891         .remember_stamp =       tcp_v4_remember_stamp,
1892         .net_header_len =       sizeof(struct iphdr),
1893         .setsockopt     =       ip_setsockopt,
1894         .getsockopt     =       ip_getsockopt,
1895         .addr2sockaddr  =       v4_addr2sockaddr,
1896         .sockaddr_len   =       sizeof(struct sockaddr_in),
1897 };
1898
1899 /* NOTE: A lot of fields are set to zero explicitly by the call to
1900  *       sk_alloc(), so they need not be initialized here.
1901  */
1902 static int tcp_v4_init_sock(struct sock *sk)
1903 {
1904         struct tcp_sock *tp = tcp_sk(sk);
1905
1906         skb_queue_head_init(&tp->out_of_order_queue);
1907         tcp_init_xmit_timers(sk);
1908         tcp_prequeue_init(tp);
1909
1910         tp->rto  = TCP_TIMEOUT_INIT;
1911         tp->mdev = TCP_TIMEOUT_INIT;
1912
1913         /* So many TCP implementations out there (incorrectly) count the
1914          * initial SYN frame in their delayed-ACK and congestion control
1915          * algorithms that we must have the following bandaid to talk
1916          * efficiently to them.  -DaveM
1917          */
1918         tp->snd_cwnd = 2;
1919
1920         /* See draft-stevens-tcpca-spec-01 for discussion of the
1921          * initialization of these values.
1922          */
1923         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1924         tp->snd_cwnd_clamp = ~0;
1925         tp->mss_cache = 536;
1926
1927         tp->reordering = sysctl_tcp_reordering;
1928         tp->ca_ops = &tcp_init_congestion_ops;
1929
1930         sk->sk_state = TCP_CLOSE;
1931
1932         sk->sk_write_space = sk_stream_write_space;
1933         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1934
1935         tp->af_specific = &ipv4_specific;
1936
1937         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1938         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1939
1940         atomic_inc(&tcp_sockets_allocated);
1941
1942         return 0;
1943 }
1944
1945 int tcp_v4_destroy_sock(struct sock *sk)
1946 {
1947         struct tcp_sock *tp = tcp_sk(sk);
1948
1949         tcp_clear_xmit_timers(sk);
1950
1951         tcp_cleanup_congestion_control(tp);
1952
1953         /* Clean up the write buffer. */
1954         sk_stream_writequeue_purge(sk);
1955
1956         /* Clean up our, hopefully empty, out_of_order_queue. */
1957         __skb_queue_purge(&tp->out_of_order_queue);
1958
1959         /* Clean the prequeue; it really should be empty. */
1960         __skb_queue_purge(&tp->ucopy.prequeue);
1961
1962         /* Clean up a referenced TCP bind bucket. */
1963         if (tp->bind_hash)
1964                 tcp_put_port(sk);
1965
1966         /*
1967          * If sendmsg cached page exists, toss it.
1968          */
1969         if (sk->sk_sndmsg_page) {
1970                 __free_page(sk->sk_sndmsg_page);
1971                 sk->sk_sndmsg_page = NULL;
1972         }
1973
1974         atomic_dec(&tcp_sockets_allocated);
1975
1976         return 0;
1977 }
1978
1979 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1980
1981 #ifdef CONFIG_PROC_FS
1982 /* Proc filesystem TCP sock list dumping. */
1983
1984 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1985 {
1986         return hlist_empty(head) ? NULL :
1987                 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1988 }
1989
1990 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1991 {
1992         return tw->tw_node.next ?
1993                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1994 }
1995
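     /*
      * The /proc/net/tcp iterator walks the listening hash first, descending
      * into each listener's SYN queue (TCP_SEQ_STATE_OPENREQ), and then each
      * established-hash bucket followed by the corresponding TIME-WAIT chain
      * in the upper half of the ehash (TCP_SEQ_STATE_TIME_WAIT).
      */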
1996 static void *listening_get_next(struct seq_file *seq, void *cur)
1997 {
1998         struct tcp_sock *tp;
1999         struct hlist_node *node;
2000         struct sock *sk = cur;
2001         struct tcp_iter_state* st = seq->private;
2002
2003         if (!sk) {
2004                 st->bucket = 0;
2005                 sk = sk_head(&tcp_listening_hash[0]);
2006                 goto get_sk;
2007         }
2008
2009         ++st->num;
2010
2011         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2012                 struct request_sock *req = cur;
2013
2014                 tp = tcp_sk(st->syn_wait_sk);
2015                 req = req->dl_next;
2016                 while (1) {
2017                         while (req) {
2018                                 if (req->rsk_ops->family == st->family) {
2019                                         cur = req;
2020                                         goto out;
2021                                 }
2022                                 req = req->dl_next;
2023                         }
2024                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
2025                                 break;
2026 get_req:
2027                         req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2028                 }
2029                 sk        = sk_next(st->syn_wait_sk);
2030                 st->state = TCP_SEQ_STATE_LISTENING;
2031                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2032         } else {
2033                 tp = tcp_sk(sk);
2034                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2035                 if (reqsk_queue_len(&tp->accept_queue))
2036                         goto start_req;
2037                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2038                 sk = sk_next(sk);
2039         }
2040 get_sk:
2041         sk_for_each_from(sk, node) {
2042                 if (sk->sk_family == st->family) {
2043                         cur = sk;
2044                         goto out;
2045                 }
2046                 tp = tcp_sk(sk);
2047                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2048                 if (reqsk_queue_len(&tp->accept_queue)) {
2049 start_req:
2050                         st->uid         = sock_i_uid(sk);
2051                         st->syn_wait_sk = sk;
2052                         st->state       = TCP_SEQ_STATE_OPENREQ;
2053                         st->sbucket     = 0;
2054                         goto get_req;
2055                 }
2056                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2057         }
2058         if (++st->bucket < TCP_LHTABLE_SIZE) {
2059                 sk = sk_head(&tcp_listening_hash[st->bucket]);
2060                 goto get_sk;
2061         }
2062         cur = NULL;
2063 out:
2064         return cur;
2065 }
2066
2067 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2068 {
2069         void *rc = listening_get_next(seq, NULL);
2070
2071         while (rc && *pos) {
2072                 rc = listening_get_next(seq, rc);
2073                 --*pos;
2074         }
2075         return rc;
2076 }
2077
2078 static void *established_get_first(struct seq_file *seq)
2079 {
2080         struct tcp_iter_state* st = seq->private;
2081         void *rc = NULL;
2082
2083         for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2084                 struct sock *sk;
2085                 struct hlist_node *node;
2086                 struct tcp_tw_bucket *tw;
2087
2088                 /* We can reschedule _before_ having picked the target: */
2089                 cond_resched_softirq();
2090
2091                 read_lock(&tcp_ehash[st->bucket].lock);
2092                 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2093                         if (sk->sk_family != st->family) {
2094                                 continue;
2095                         }
2096                         rc = sk;
2097                         goto out;
2098                 }
2099                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2100                 tw_for_each(tw, node,
2101                             &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2102                         if (tw->tw_family != st->family) {
2103                                 continue;
2104                         }
2105                         rc = tw;
2106                         goto out;
2107                 }
2108                 read_unlock(&tcp_ehash[st->bucket].lock);
2109                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2110         }
2111 out:
2112         return rc;
2113 }
2114
2115 static void *established_get_next(struct seq_file *seq, void *cur)
2116 {
2117         struct sock *sk = cur;
2118         struct tcp_tw_bucket *tw;
2119         struct hlist_node *node;
2120         struct tcp_iter_state* st = seq->private;
2121
2122         ++st->num;
2123
2124         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2125                 tw = cur;
2126                 tw = tw_next(tw);
2127 get_tw:
2128                 while (tw && tw->tw_family != st->family) {
2129                         tw = tw_next(tw);
2130                 }
2131                 if (tw) {
2132                         cur = tw;
2133                         goto out;
2134                 }
2135                 read_unlock(&tcp_ehash[st->bucket].lock);
2136                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2137
2138                 /* We can reschedule between buckets: */
2139                 cond_resched_softirq();
2140
2141                 if (++st->bucket < tcp_ehash_size) {
2142                         read_lock(&tcp_ehash[st->bucket].lock);
2143                         sk = sk_head(&tcp_ehash[st->bucket].chain);
2144                 } else {
2145                         cur = NULL;
2146                         goto out;
2147                 }
2148         } else
2149                 sk = sk_next(sk);
2150
2151         sk_for_each_from(sk, node) {
2152                 if (sk->sk_family == st->family)
2153                         goto found;
2154         }
2155
2156         st->state = TCP_SEQ_STATE_TIME_WAIT;
2157         tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2158         goto get_tw;
2159 found:
2160         cur = sk;
2161 out:
2162         return cur;
2163 }
2164
2165 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2166 {
2167         void *rc = established_get_first(seq);
2168
2169         while (rc && pos) {
2170                 rc = established_get_next(seq, rc);
2171                 --pos;
2172         }               
2173         return rc;
2174 }
2175
2176 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2177 {
2178         void *rc;
2179         struct tcp_iter_state* st = seq->private;
2180
2181         tcp_listen_lock();
2182         st->state = TCP_SEQ_STATE_LISTENING;
2183         rc        = listening_get_idx(seq, &pos);
2184
2185         if (!rc) {
2186                 tcp_listen_unlock();
2187                 local_bh_disable();
2188                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2189                 rc        = established_get_idx(seq, pos);
2190         }
2191
2192         return rc;
2193 }
2194
2195 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2196 {
2197         struct tcp_iter_state* st = seq->private;
2198         st->state = TCP_SEQ_STATE_LISTENING;
2199         st->num = 0;
2200         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2201 }
2202
2203 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2204 {
2205         void *rc = NULL;
2206         struct tcp_iter_state* st;
2207
2208         if (v == SEQ_START_TOKEN) {
2209                 rc = tcp_get_idx(seq, 0);
2210                 goto out;
2211         }
2212         st = seq->private;
2213
2214         switch (st->state) {
2215         case TCP_SEQ_STATE_OPENREQ:
2216         case TCP_SEQ_STATE_LISTENING:
2217                 rc = listening_get_next(seq, v);
2218                 if (!rc) {
2219                         tcp_listen_unlock();
2220                         local_bh_disable();
2221                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2222                         rc        = established_get_first(seq);
2223                 }
2224                 break;
2225         case TCP_SEQ_STATE_ESTABLISHED:
2226         case TCP_SEQ_STATE_TIME_WAIT:
2227                 rc = established_get_next(seq, v);
2228                 break;
2229         }
2230 out:
2231         ++*pos;
2232         return rc;
2233 }
2234
2235 static void tcp_seq_stop(struct seq_file *seq, void *v)
2236 {
2237         struct tcp_iter_state* st = seq->private;
2238
2239         switch (st->state) {
2240         case TCP_SEQ_STATE_OPENREQ:
2241                 if (v) {
2242                         struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2243                         read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2244                 }
2245         case TCP_SEQ_STATE_LISTENING:
2246                 if (v != SEQ_START_TOKEN)
2247                         tcp_listen_unlock();
2248                 break;
2249         case TCP_SEQ_STATE_TIME_WAIT:
2250         case TCP_SEQ_STATE_ESTABLISHED:
2251                 if (v)
2252                         read_unlock(&tcp_ehash[st->bucket].lock);
2253                 local_bh_enable();
2254                 break;
2255         }
2256 }
2257
2258 static int tcp_seq_open(struct inode *inode, struct file *file)
2259 {
2260         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2261         struct seq_file *seq;
2262         struct tcp_iter_state *s;
2263         int rc;
2264
2265         if (unlikely(afinfo == NULL))
2266                 return -EINVAL;
2267
2268         s = kmalloc(sizeof(*s), GFP_KERNEL);
2269         if (!s)
2270                 return -ENOMEM;
2271         memset(s, 0, sizeof(*s));
2272         s->family               = afinfo->family;
2273         s->seq_ops.start        = tcp_seq_start;
2274         s->seq_ops.next         = tcp_seq_next;
2275         s->seq_ops.show         = afinfo->seq_show;
2276         s->seq_ops.stop         = tcp_seq_stop;
2277
2278         rc = seq_open(file, &s->seq_ops);
2279         if (rc)
2280                 goto out_kfree;
2281         seq          = file->private_data;
2282         seq->private = s;
2283 out:
2284         return rc;
2285 out_kfree:
2286         kfree(s);
2287         goto out;
2288 }
2289
2290 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2291 {
2292         int rc = 0;
2293         struct proc_dir_entry *p;
2294
2295         if (!afinfo)
2296                 return -EINVAL;
2297         afinfo->seq_fops->owner         = afinfo->owner;
2298         afinfo->seq_fops->open          = tcp_seq_open;
2299         afinfo->seq_fops->read          = seq_read;
2300         afinfo->seq_fops->llseek        = seq_lseek;
2301         afinfo->seq_fops->release       = seq_release_private;
2302         
2303         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2304         if (p)
2305                 p->data = afinfo;
2306         else
2307                 rc = -ENOMEM;
2308         return rc;
2309 }
2310
2311 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2312 {
2313         if (!afinfo)
2314                 return;
2315         proc_net_remove(afinfo->name);
2316         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
2317 }
2318
2319 static void get_openreq4(struct sock *sk, struct request_sock *req,
2320                          char *tmpbuf, int i, int uid)
2321 {
2322         const struct inet_request_sock *ireq = inet_rsk(req);
2323         int ttd = req->expires - jiffies;
2324
2325         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2326                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2327                 i,
2328                 ireq->loc_addr,
2329                 ntohs(inet_sk(sk)->sport),
2330                 ireq->rmt_addr,
2331                 ntohs(ireq->rmt_port),
2332                 TCP_SYN_RECV,
2333                 0, 0, /* could print option size, but that is af dependent. */
2334                 1,    /* timers active (only the expire timer) */
2335                 jiffies_to_clock_t(ttd),
2336                 req->retrans,
2337                 uid,
2338                 0,  /* non standard timer */
2339                 0, /* open_requests have no inode */
2340                 atomic_read(&sk->sk_refcnt),
2341                 req);
2342 }
2343
2344 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2345 {
2346         int timer_active;
2347         unsigned long timer_expires;
2348         struct tcp_sock *tp = tcp_sk(sp);
2349         struct inet_sock *inet = inet_sk(sp);
2350         unsigned int dest = inet->daddr;
2351         unsigned int src = inet->rcv_saddr;
2352         __u16 destp = ntohs(inet->dport);
2353         __u16 srcp = ntohs(inet->sport);
2354
2355         if (tp->pending == TCP_TIME_RETRANS) {
2356                 timer_active    = 1;
2357                 timer_expires   = tp->timeout;
2358         } else if (tp->pending == TCP_TIME_PROBE0) {
2359                 timer_active    = 4;
2360                 timer_expires   = tp->timeout;
2361         } else if (timer_pending(&sp->sk_timer)) {
2362                 timer_active    = 2;
2363                 timer_expires   = sp->sk_timer.expires;
2364         } else {
2365                 timer_active    = 0;
2366                 timer_expires = jiffies;
2367         }
2368
2369         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2370                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2371                 i, src, srcp, dest, destp, sp->sk_state,
2372                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2373                 timer_active,
2374                 jiffies_to_clock_t(timer_expires - jiffies),
2375                 tp->retransmits,
2376                 sock_i_uid(sp),
2377                 tp->probes_out,
2378                 sock_i_ino(sp),
2379                 atomic_read(&sp->sk_refcnt), sp,
2380                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2381                 tp->snd_cwnd,
2382                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2383 }
2384
2385 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2386 {
2387         unsigned int dest, src;
2388         __u16 destp, srcp;
2389         int ttd = tw->tw_ttd - jiffies;
2390
2391         if (ttd < 0)
2392                 ttd = 0;
2393
2394         dest  = tw->tw_daddr;
2395         src   = tw->tw_rcv_saddr;
2396         destp = ntohs(tw->tw_dport);
2397         srcp  = ntohs(tw->tw_sport);
2398
2399         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2400                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2401                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2402                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2403                 atomic_read(&tw->tw_refcnt), tw);
2404 }
2405
2406 #define TMPSZ 150
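     /* Each record emitted below is padded to TMPSZ - 1 characters so that
      * /proc/net/tcp output has a fixed row width.
      */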
2407
2408 static int tcp4_seq_show(struct seq_file *seq, void *v)
2409 {
2410         struct tcp_iter_state* st;
2411         char tmpbuf[TMPSZ + 1];
2412
2413         if (v == SEQ_START_TOKEN) {
2414                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2415                            "  sl  local_address rem_address   st tx_queue "
2416                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2417                            "inode");
2418                 goto out;
2419         }
2420         st = seq->private;
2421
2422         switch (st->state) {
2423         case TCP_SEQ_STATE_LISTENING:
2424         case TCP_SEQ_STATE_ESTABLISHED:
2425                 get_tcp4_sock(v, tmpbuf, st->num);
2426                 break;
2427         case TCP_SEQ_STATE_OPENREQ:
2428                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2429                 break;
2430         case TCP_SEQ_STATE_TIME_WAIT:
2431                 get_timewait4_sock(v, tmpbuf, st->num);
2432                 break;
2433         }
2434         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2435 out:
2436         return 0;
2437 }
2438
2439 static struct file_operations tcp4_seq_fops;
2440 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2441         .owner          = THIS_MODULE,
2442         .name           = "tcp",
2443         .family         = AF_INET,
2444         .seq_show       = tcp4_seq_show,
2445         .seq_fops       = &tcp4_seq_fops,
2446 };
2447
2448 int __init tcp4_proc_init(void)
2449 {
2450         return tcp_proc_register(&tcp4_seq_afinfo);
2451 }
2452
2453 void tcp4_proc_exit(void)
2454 {
2455         tcp_proc_unregister(&tcp4_seq_afinfo);
2456 }
2457 #endif /* CONFIG_PROC_FS */
2458
2459 struct proto tcp_prot = {
2460         .name                   = "TCP",
2461         .owner                  = THIS_MODULE,
2462         .close                  = tcp_close,
2463         .connect                = tcp_v4_connect,
2464         .disconnect             = tcp_disconnect,
2465         .accept                 = tcp_accept,
2466         .ioctl                  = tcp_ioctl,
2467         .init                   = tcp_v4_init_sock,
2468         .destroy                = tcp_v4_destroy_sock,
2469         .shutdown               = tcp_shutdown,
2470         .setsockopt             = tcp_setsockopt,
2471         .getsockopt             = tcp_getsockopt,
2472         .sendmsg                = tcp_sendmsg,
2473         .recvmsg                = tcp_recvmsg,
2474         .backlog_rcv            = tcp_v4_do_rcv,
2475         .hash                   = tcp_v4_hash,
2476         .unhash                 = tcp_unhash,
2477         .get_port               = tcp_v4_get_port,
2478         .enter_memory_pressure  = tcp_enter_memory_pressure,
2479         .sockets_allocated      = &tcp_sockets_allocated,
2480         .memory_allocated       = &tcp_memory_allocated,
2481         .memory_pressure        = &tcp_memory_pressure,
2482         .sysctl_mem             = sysctl_tcp_mem,
2483         .sysctl_wmem            = sysctl_tcp_wmem,
2484         .sysctl_rmem            = sysctl_tcp_rmem,
2485         .max_header             = MAX_TCP_HEADER,
2486         .obj_size               = sizeof(struct tcp_sock),
2487         .rsk_prot               = &tcp_request_sock_ops,
2488 };
2489
2490
2491
2492 void __init tcp_v4_init(struct net_proto_family *ops)
2493 {
2494         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2495         if (err < 0)
2496                 panic("Failed to create the TCP control socket.\n");
2497         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2498         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2499
2500         /* Unhash it so that IP input processing does not even
2501          * see it; we do not wish this socket to see incoming
2502          * packets.
2503          */
2504         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2505 }
2506
2507 EXPORT_SYMBOL(ipv4_specific);
2508 EXPORT_SYMBOL(tcp_bind_hash);
2509 EXPORT_SYMBOL(tcp_bucket_create);
2510 EXPORT_SYMBOL(tcp_hashinfo);
2511 EXPORT_SYMBOL(tcp_inherit_port);
2512 EXPORT_SYMBOL(tcp_listen_wlock);
2513 EXPORT_SYMBOL(tcp_port_rover);
2514 EXPORT_SYMBOL(tcp_prot);
2515 EXPORT_SYMBOL(tcp_put_port);
2516 EXPORT_SYMBOL(tcp_unhash);
2517 EXPORT_SYMBOL(tcp_v4_conn_request);
2518 EXPORT_SYMBOL(tcp_v4_connect);
2519 EXPORT_SYMBOL(tcp_v4_do_rcv);
2520 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2521 EXPORT_SYMBOL(tcp_v4_send_check);
2522 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2523
2524 #ifdef CONFIG_PROC_FS
2525 EXPORT_SYMBOL(tcp_proc_register);
2526 EXPORT_SYMBOL(tcp_proc_unregister);
2527 #endif
2528 EXPORT_SYMBOL(sysctl_local_port_range);
2529 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2530 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2531