ipvs: Fix faulty IPv6 extension header handling in IPVS
[firefly-linux-kernel-4.4.55.git] / net / netfilter / ipvs / ip_vs_proto_tcp.c
1 /*
2  * ip_vs_proto_tcp.c:   TCP load balancing support for IPVS
3  *
4  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
5  *              Julian Anastasov <ja@ssi.bg>
6  *
7  *              This program is free software; you can redistribute it and/or
8  *              modify it under the terms of the GNU General Public License
9  *              as published by the Free Software Foundation; either version
10  *              2 of the License, or (at your option) any later version.
11  *
12  * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
13  *
14  *              Network name space (netns) aware.
15  *              Global data moved to netns i.e struct netns_ipvs
16  *              tcp_timeouts table has copy per netns in a hash table per
17  *              protocol ip_vs_proto_data and is handled by netns
18  */
19
20 #define KMSG_COMPONENT "IPVS"
21 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
22
23 #include <linux/kernel.h>
24 #include <linux/ip.h>
25 #include <linux/tcp.h>                  /* for tcphdr */
26 #include <net/ip.h>
27 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
28 #include <net/ip6_checksum.h>
29 #include <linux/netfilter.h>
30 #include <linux/netfilter_ipv4.h>
31
32 #include <net/ip_vs.h>
33
34 static int
35 tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
36                   int *verdict, struct ip_vs_conn **cpp)
37 {
38         struct net *net;
39         struct ip_vs_service *svc;
40         struct tcphdr _tcph, *th;
41         struct ip_vs_iphdr iph;
42
43         ip_vs_fill_iph_skb(af, skb, &iph);
44
45         th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
46         if (th == NULL) {
47                 *verdict = NF_DROP;
48                 return 0;
49         }
50         net = skb_net(skb);
51         /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
52         if (th->syn &&
53             (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
54                                      &iph.daddr, th->dest))) {
55                 int ignored;
56
57                 if (ip_vs_todrop(net_ipvs(net))) {
58                         /*
59                          * It seems that we are very loaded.
60                          * We have to drop this packet :(
61                          */
62                         ip_vs_service_put(svc);
63                         *verdict = NF_DROP;
64                         return 0;
65                 }
66
67                 /*
68                  * Let the virtual server select a real server for the
69                  * incoming connection, and create a connection entry.
70                  */
71                 *cpp = ip_vs_schedule(svc, skb, pd, &ignored);
72                 if (!*cpp && ignored <= 0) {
73                         if (!ignored)
74                                 *verdict = ip_vs_leave(svc, skb, pd);
75                         else {
76                                 ip_vs_service_put(svc);
77                                 *verdict = NF_DROP;
78                         }
79                         return 0;
80                 }
81                 ip_vs_service_put(svc);
82         }
83         /* NF_ACCEPT */
84         return 1;
85 }
86
87
88 static inline void
89 tcp_fast_csum_update(int af, struct tcphdr *tcph,
90                      const union nf_inet_addr *oldip,
91                      const union nf_inet_addr *newip,
92                      __be16 oldport, __be16 newport)
93 {
94 #ifdef CONFIG_IP_VS_IPV6
95         if (af == AF_INET6)
96                 tcph->check =
97                         csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
98                                          ip_vs_check_diff2(oldport, newport,
99                                                 ~csum_unfold(tcph->check))));
100         else
101 #endif
102         tcph->check =
103                 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
104                                  ip_vs_check_diff2(oldport, newport,
105                                                 ~csum_unfold(tcph->check))));
106 }
107
108
109 static inline void
110 tcp_partial_csum_update(int af, struct tcphdr *tcph,
111                      const union nf_inet_addr *oldip,
112                      const union nf_inet_addr *newip,
113                      __be16 oldlen, __be16 newlen)
114 {
115 #ifdef CONFIG_IP_VS_IPV6
116         if (af == AF_INET6)
117                 tcph->check =
118                         ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
119                                          ip_vs_check_diff2(oldlen, newlen,
120                                                 csum_unfold(tcph->check))));
121         else
122 #endif
123         tcph->check =
124                 ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
125                                 ip_vs_check_diff2(oldlen, newlen,
126                                                 csum_unfold(tcph->check))));
127 }
128
129
130 static int
131 tcp_snat_handler(struct sk_buff *skb,
132                  struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
133 {
134         struct tcphdr *tcph;
135         unsigned int tcphoff;
136         int oldlen;
137         int payload_csum = 0;
138
139         struct ip_vs_iphdr iph;
140         ip_vs_fill_iph_skb(cp->af, skb, &iph);
141         tcphoff = iph.len;
142
143 #ifdef CONFIG_IP_VS_IPV6
144         if (cp->af == AF_INET6 && iph.fragoffs)
145                 return 1;
146 #endif
147         oldlen = skb->len - tcphoff;
148
149         /* csum_check requires unshared skb */
150         if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
151                 return 0;
152
153         if (unlikely(cp->app != NULL)) {
154                 int ret;
155
156                 /* Some checks before mangling */
157                 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
158                         return 0;
159
160                 /* Call application helper if needed */
161                 if (!(ret = ip_vs_app_pkt_out(cp, skb)))
162                         return 0;
163                 /* ret=2: csum update is needed after payload mangling */
164                 if (ret == 1)
165                         oldlen = skb->len - tcphoff;
166                 else
167                         payload_csum = 1;
168         }
169
170         tcph = (void *)skb_network_header(skb) + tcphoff;
171         tcph->source = cp->vport;
172
173         /* Adjust TCP checksums */
174         if (skb->ip_summed == CHECKSUM_PARTIAL) {
175                 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
176                                         htons(oldlen),
177                                         htons(skb->len - tcphoff));
178         } else if (!payload_csum) {
179                 /* Only port and addr are changed, do fast csum update */
180                 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
181                                      cp->dport, cp->vport);
182                 if (skb->ip_summed == CHECKSUM_COMPLETE)
183                         skb->ip_summed = (cp->app && pp->csum_check) ?
184                                          CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
185         } else {
186                 /* full checksum calculation */
187                 tcph->check = 0;
188                 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
189 #ifdef CONFIG_IP_VS_IPV6
190                 if (cp->af == AF_INET6)
191                         tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
192                                                       &cp->caddr.in6,
193                                                       skb->len - tcphoff,
194                                                       cp->protocol, skb->csum);
195                 else
196 #endif
197                         tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
198                                                         cp->caddr.ip,
199                                                         skb->len - tcphoff,
200                                                         cp->protocol,
201                                                         skb->csum);
202                 skb->ip_summed = CHECKSUM_UNNECESSARY;
203
204                 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
205                           pp->name, tcph->check,
206                           (char*)&(tcph->check) - (char*)tcph);
207         }
208         return 1;
209 }
210
211
212 static int
213 tcp_dnat_handler(struct sk_buff *skb,
214                  struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
215 {
216         struct tcphdr *tcph;
217         unsigned int tcphoff;
218         int oldlen;
219         int payload_csum = 0;
220
221         struct ip_vs_iphdr iph;
222         ip_vs_fill_iph_skb(cp->af, skb, &iph);
223         tcphoff = iph.len;
224
225 #ifdef CONFIG_IP_VS_IPV6
226         if (cp->af == AF_INET6 && iph.fragoffs)
227                 return 1;
228 #endif
229         oldlen = skb->len - tcphoff;
230
231         /* csum_check requires unshared skb */
232         if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
233                 return 0;
234
235         if (unlikely(cp->app != NULL)) {
236                 int ret;
237
238                 /* Some checks before mangling */
239                 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
240                         return 0;
241
242                 /*
243                  *      Attempt ip_vs_app call.
244                  *      It will fix ip_vs_conn and iph ack_seq stuff
245                  */
246                 if (!(ret = ip_vs_app_pkt_in(cp, skb)))
247                         return 0;
248                 /* ret=2: csum update is needed after payload mangling */
249                 if (ret == 1)
250                         oldlen = skb->len - tcphoff;
251                 else
252                         payload_csum = 1;
253         }
254
255         tcph = (void *)skb_network_header(skb) + tcphoff;
256         tcph->dest = cp->dport;
257
258         /*
259          *      Adjust TCP checksums
260          */
261         if (skb->ip_summed == CHECKSUM_PARTIAL) {
262                 tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
263                                         htons(oldlen),
264                                         htons(skb->len - tcphoff));
265         } else if (!payload_csum) {
266                 /* Only port and addr are changed, do fast csum update */
267                 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
268                                      cp->vport, cp->dport);
269                 if (skb->ip_summed == CHECKSUM_COMPLETE)
270                         skb->ip_summed = (cp->app && pp->csum_check) ?
271                                          CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
272         } else {
273                 /* full checksum calculation */
274                 tcph->check = 0;
275                 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
276 #ifdef CONFIG_IP_VS_IPV6
277                 if (cp->af == AF_INET6)
278                         tcph->check = csum_ipv6_magic(&cp->caddr.in6,
279                                                       &cp->daddr.in6,
280                                                       skb->len - tcphoff,
281                                                       cp->protocol, skb->csum);
282                 else
283 #endif
284                         tcph->check = csum_tcpudp_magic(cp->caddr.ip,
285                                                         cp->daddr.ip,
286                                                         skb->len - tcphoff,
287                                                         cp->protocol,
288                                                         skb->csum);
289                 skb->ip_summed = CHECKSUM_UNNECESSARY;
290         }
291         return 1;
292 }
293
294
295 static int
296 tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
297 {
298         unsigned int tcphoff;
299
300 #ifdef CONFIG_IP_VS_IPV6
301         if (af == AF_INET6)
302                 tcphoff = sizeof(struct ipv6hdr);
303         else
304 #endif
305                 tcphoff = ip_hdrlen(skb);
306
307         switch (skb->ip_summed) {
308         case CHECKSUM_NONE:
309                 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
310         case CHECKSUM_COMPLETE:
311 #ifdef CONFIG_IP_VS_IPV6
312                 if (af == AF_INET6) {
313                         if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
314                                             &ipv6_hdr(skb)->daddr,
315                                             skb->len - tcphoff,
316                                             ipv6_hdr(skb)->nexthdr,
317                                             skb->csum)) {
318                                 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
319                                                  "Failed checksum for");
320                                 return 0;
321                         }
322                 } else
323 #endif
324                         if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
325                                               ip_hdr(skb)->daddr,
326                                               skb->len - tcphoff,
327                                               ip_hdr(skb)->protocol,
328                                               skb->csum)) {
329                                 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
330                                                  "Failed checksum for");
331                                 return 0;
332                         }
333                 break;
334         default:
335                 /* No need to checksum. */
336                 break;
337         }
338
339         return 1;
340 }
341
342
343 #define TCP_DIR_INPUT           0
344 #define TCP_DIR_OUTPUT          4
345 #define TCP_DIR_INPUT_ONLY      8
346
347 static const int tcp_state_off[IP_VS_DIR_LAST] = {
348         [IP_VS_DIR_INPUT]               =       TCP_DIR_INPUT,
349         [IP_VS_DIR_OUTPUT]              =       TCP_DIR_OUTPUT,
350         [IP_VS_DIR_INPUT_ONLY]          =       TCP_DIR_INPUT_ONLY,
351 };
352
353 /*
354  *      Timeout table[state]
355  */
356 static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
357         [IP_VS_TCP_S_NONE]              =       2*HZ,
358         [IP_VS_TCP_S_ESTABLISHED]       =       15*60*HZ,
359         [IP_VS_TCP_S_SYN_SENT]          =       2*60*HZ,
360         [IP_VS_TCP_S_SYN_RECV]          =       1*60*HZ,
361         [IP_VS_TCP_S_FIN_WAIT]          =       2*60*HZ,
362         [IP_VS_TCP_S_TIME_WAIT]         =       2*60*HZ,
363         [IP_VS_TCP_S_CLOSE]             =       10*HZ,
364         [IP_VS_TCP_S_CLOSE_WAIT]        =       60*HZ,
365         [IP_VS_TCP_S_LAST_ACK]          =       30*HZ,
366         [IP_VS_TCP_S_LISTEN]            =       2*60*HZ,
367         [IP_VS_TCP_S_SYNACK]            =       120*HZ,
368         [IP_VS_TCP_S_LAST]              =       2*HZ,
369 };
370
371 static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
372         [IP_VS_TCP_S_NONE]              =       "NONE",
373         [IP_VS_TCP_S_ESTABLISHED]       =       "ESTABLISHED",
374         [IP_VS_TCP_S_SYN_SENT]          =       "SYN_SENT",
375         [IP_VS_TCP_S_SYN_RECV]          =       "SYN_RECV",
376         [IP_VS_TCP_S_FIN_WAIT]          =       "FIN_WAIT",
377         [IP_VS_TCP_S_TIME_WAIT]         =       "TIME_WAIT",
378         [IP_VS_TCP_S_CLOSE]             =       "CLOSE",
379         [IP_VS_TCP_S_CLOSE_WAIT]        =       "CLOSE_WAIT",
380         [IP_VS_TCP_S_LAST_ACK]          =       "LAST_ACK",
381         [IP_VS_TCP_S_LISTEN]            =       "LISTEN",
382         [IP_VS_TCP_S_SYNACK]            =       "SYNACK",
383         [IP_VS_TCP_S_LAST]              =       "BUG!",
384 };
385
/* Short aliases for the TCP states, used to keep the state tables compact */
#define sNO	IP_VS_TCP_S_NONE
#define sES	IP_VS_TCP_S_ESTABLISHED
#define sSS	IP_VS_TCP_S_SYN_SENT
#define sSR	IP_VS_TCP_S_SYN_RECV
#define sFW	IP_VS_TCP_S_FIN_WAIT
#define sTW	IP_VS_TCP_S_TIME_WAIT
#define sCL	IP_VS_TCP_S_CLOSE
#define sCW	IP_VS_TCP_S_CLOSE_WAIT
#define sLA	IP_VS_TCP_S_LAST_ACK
#define sLI	IP_VS_TCP_S_LISTEN
#define sSA	IP_VS_TCP_S_SYNACK
397
398 struct tcp_states_t {
399         int next_state[IP_VS_TCP_S_LAST];
400 };
401
402 static const char * tcp_state_name(int state)
403 {
404         if (state >= IP_VS_TCP_S_LAST)
405                 return "ERR!";
406         return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
407 }
408
409 static struct tcp_states_t tcp_states [] = {
410 /*      INPUT */
411 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
412 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
413 /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
414 /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
415 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
416
417 /*      OUTPUT */
418 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
419 /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
420 /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
421 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
422 /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
423
424 /*      INPUT-ONLY */
425 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
426 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
427 /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
428 /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
429 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
430 };
431
432 static struct tcp_states_t tcp_states_dos [] = {
433 /*      INPUT */
434 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
435 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
436 /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
437 /*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
438 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
439
440 /*      OUTPUT */
441 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
442 /*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
443 /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
444 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
445 /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
446
447 /*      INPUT-ONLY */
448 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
449 /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
450 /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
451 /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
452 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
453 };
454
455 static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
456 {
457         int on = (flags & 1);           /* secure_tcp */
458
459         /*
460         ** FIXME: change secure_tcp to independent sysctl var
461         ** or make it per-service or per-app because it is valid
462         ** for most if not for all of the applications. Something
463         ** like "capabilities" (flags) for each object.
464         */
465         pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
466 }
467
468 static inline int tcp_state_idx(struct tcphdr *th)
469 {
470         if (th->rst)
471                 return 3;
472         if (th->syn)
473                 return 0;
474         if (th->fin)
475                 return 1;
476         if (th->ack)
477                 return 2;
478         return -1;
479 }
480
481 static inline void
482 set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
483               int direction, struct tcphdr *th)
484 {
485         int state_idx;
486         int new_state = IP_VS_TCP_S_CLOSE;
487         int state_off = tcp_state_off[direction];
488
489         /*
490          *    Update state offset to INPUT_ONLY if necessary
491          *    or delete NO_OUTPUT flag if output packet detected
492          */
493         if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
494                 if (state_off == TCP_DIR_OUTPUT)
495                         cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
496                 else
497                         state_off = TCP_DIR_INPUT_ONLY;
498         }
499
500         if ((state_idx = tcp_state_idx(th)) < 0) {
501                 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
502                 goto tcp_state_out;
503         }
504
505         new_state =
506                 pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
507
508   tcp_state_out:
509         if (new_state != cp->state) {
510                 struct ip_vs_dest *dest = cp->dest;
511
512                 IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
513                               "%s:%d state: %s->%s conn->refcnt:%d\n",
514                               pd->pp->name,
515                               ((state_off == TCP_DIR_OUTPUT) ?
516                                "output " : "input "),
517                               th->syn ? 'S' : '.',
518                               th->fin ? 'F' : '.',
519                               th->ack ? 'A' : '.',
520                               th->rst ? 'R' : '.',
521                               IP_VS_DBG_ADDR(cp->af, &cp->daddr),
522                               ntohs(cp->dport),
523                               IP_VS_DBG_ADDR(cp->af, &cp->caddr),
524                               ntohs(cp->cport),
525                               tcp_state_name(cp->state),
526                               tcp_state_name(new_state),
527                               atomic_read(&cp->refcnt));
528
529                 if (dest) {
530                         if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
531                             (new_state != IP_VS_TCP_S_ESTABLISHED)) {
532                                 atomic_dec(&dest->activeconns);
533                                 atomic_inc(&dest->inactconns);
534                                 cp->flags |= IP_VS_CONN_F_INACTIVE;
535                         } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
536                                    (new_state == IP_VS_TCP_S_ESTABLISHED)) {
537                                 atomic_inc(&dest->activeconns);
538                                 atomic_dec(&dest->inactconns);
539                                 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
540                         }
541                 }
542         }
543
544         if (likely(pd))
545                 cp->timeout = pd->timeout_table[cp->state = new_state];
546         else    /* What to do ? */
547                 cp->timeout = tcp_timeouts[cp->state = new_state];
548 }
549
550 /*
551  *      Handle state transitions
552  */
553 static void
554 tcp_state_transition(struct ip_vs_conn *cp, int direction,
555                      const struct sk_buff *skb,
556                      struct ip_vs_proto_data *pd)
557 {
558         struct tcphdr _tcph, *th;
559
560 #ifdef CONFIG_IP_VS_IPV6
561         int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
562 #else
563         int ihl = ip_hdrlen(skb);
564 #endif
565
566         th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
567         if (th == NULL)
568                 return;
569
570         spin_lock(&cp->lock);
571         set_tcp_state(pd, cp, direction, th);
572         spin_unlock(&cp->lock);
573 }
574
575 static inline __u16 tcp_app_hashkey(__be16 port)
576 {
577         return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
578                 & TCP_APP_TAB_MASK;
579 }
580
581
582 static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
583 {
584         struct ip_vs_app *i;
585         __u16 hash;
586         __be16 port = inc->port;
587         int ret = 0;
588         struct netns_ipvs *ipvs = net_ipvs(net);
589         struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
590
591         hash = tcp_app_hashkey(port);
592
593         spin_lock_bh(&ipvs->tcp_app_lock);
594         list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
595                 if (i->port == port) {
596                         ret = -EEXIST;
597                         goto out;
598                 }
599         }
600         list_add(&inc->p_list, &ipvs->tcp_apps[hash]);
601         atomic_inc(&pd->appcnt);
602
603   out:
604         spin_unlock_bh(&ipvs->tcp_app_lock);
605         return ret;
606 }
607
608
609 static void
610 tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
611 {
612         struct netns_ipvs *ipvs = net_ipvs(net);
613         struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
614
615         spin_lock_bh(&ipvs->tcp_app_lock);
616         atomic_dec(&pd->appcnt);
617         list_del(&inc->p_list);
618         spin_unlock_bh(&ipvs->tcp_app_lock);
619 }
620
621
622 static int
623 tcp_app_conn_bind(struct ip_vs_conn *cp)
624 {
625         struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
626         int hash;
627         struct ip_vs_app *inc;
628         int result = 0;
629
630         /* Default binding: bind app only for NAT */
631         if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
632                 return 0;
633
634         /* Lookup application incarnations and bind the right one */
635         hash = tcp_app_hashkey(cp->vport);
636
637         spin_lock(&ipvs->tcp_app_lock);
638         list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) {
639                 if (inc->port == cp->vport) {
640                         if (unlikely(!ip_vs_app_inc_get(inc)))
641                                 break;
642                         spin_unlock(&ipvs->tcp_app_lock);
643
644                         IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
645                                       "%s:%u to app %s on port %u\n",
646                                       __func__,
647                                       IP_VS_DBG_ADDR(cp->af, &cp->caddr),
648                                       ntohs(cp->cport),
649                                       IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
650                                       ntohs(cp->vport),
651                                       inc->name, ntohs(inc->port));
652
653                         cp->app = inc;
654                         if (inc->init_conn)
655                                 result = inc->init_conn(inc, cp);
656                         goto out;
657                 }
658         }
659         spin_unlock(&ipvs->tcp_app_lock);
660
661   out:
662         return result;
663 }
664
665
666 /*
667  *      Set LISTEN timeout. (ip_vs_conn_put will setup timer)
668  */
669 void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
670 {
671         struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
672
673         spin_lock(&cp->lock);
674         cp->state = IP_VS_TCP_S_LISTEN;
675         cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
676                            : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
677         spin_unlock(&cp->lock);
678 }
679
680 /* ---------------------------------------------
681  *   timeouts is netns related now.
682  * ---------------------------------------------
683  */
684 static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
685 {
686         struct netns_ipvs *ipvs = net_ipvs(net);
687
688         ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
689         spin_lock_init(&ipvs->tcp_app_lock);
690         pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
691                                                         sizeof(tcp_timeouts));
692         if (!pd->timeout_table)
693                 return -ENOMEM;
694         pd->tcp_state_table =  tcp_states;
695         return 0;
696 }
697
698 static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
699 {
700         kfree(pd->timeout_table);
701 }
702
703
704 struct ip_vs_protocol ip_vs_protocol_tcp = {
705         .name =                 "TCP",
706         .protocol =             IPPROTO_TCP,
707         .num_states =           IP_VS_TCP_S_LAST,
708         .dont_defrag =          0,
709         .init =                 NULL,
710         .exit =                 NULL,
711         .init_netns =           __ip_vs_tcp_init,
712         .exit_netns =           __ip_vs_tcp_exit,
713         .register_app =         tcp_register_app,
714         .unregister_app =       tcp_unregister_app,
715         .conn_schedule =        tcp_conn_schedule,
716         .conn_in_get =          ip_vs_conn_in_get_proto,
717         .conn_out_get =         ip_vs_conn_out_get_proto,
718         .snat_handler =         tcp_snat_handler,
719         .dnat_handler =         tcp_dnat_handler,
720         .csum_check =           tcp_csum_check,
721         .state_name =           tcp_state_name,
722         .state_transition =     tcp_state_transition,
723         .app_conn_bind =        tcp_app_conn_bind,
724         .debug_packet =         ip_vs_tcpudp_debug_packet,
725         .timeout_change =       tcp_timeout_change,
726 };